# Modeling Exercises

In [10]:
import pandas as pd
import numpy as np
import wrangle as w
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score

import warnings
warnings.filterwarnings("ignore")

## 1. Select a dataset with a continuous target variable.

In [6]:
# Acquire the Zillow data through the project's wrangle helper module.
# NOTE(review): any cleaning presumably happens inside wrangle_zillow — confirm in wrangle.py.
df = w.wrangle_zillow()

In [7]:
# Preview the first rows to sanity-check the columns returned by the wrangle step.
df.head()

Unnamed: 0,bedcount,bathcount,sqfeet,value,yearbuilt,taxamount,fips
4,4.0,2.0,3633,296425.0,2005,6941.39,6037
6,3.0,4.0,1620,847770.0,2011,10244.94,6037
7,3.0,2.0,2077,646760.0,1926,7924.68,6037
11,0.0,0.0,1200,5328.0,1972,91.6,6037
14,0.0,0.0,171,6920.0,1973,255.17,6037


## 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [8]:
# Split the frame into feature sets (X_*) and targets (y_*) for train, validate and test.
# NOTE(review): split proportions and any random seed live in w.zillow_model_split —
# confirm the seed is fixed so results are reproducible.
X_train, y_train, X_validate, y_validate, X_test, y_test = w.zillow_model_split(df)

## 3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [11]:
# Wrap the targets in DataFrames so every model's predictions can be stored
# as additional columns next to the actual values.
y_train, y_validate = pd.DataFrame(y_train), pd.DataFrame(y_validate)

# Baseline model: predict the training-set mean for every observation.
value_pred_mean = y_train['value'].mean()
y_train['value_pred_mean'] = value_pred_mean
y_validate['value_pred_mean'] = value_pred_mean

# Baseline RMSE, in-sample and out-of-sample.
rmse_train = mean_squared_error(y_train['value'], y_train['value_pred_mean']) ** 0.5
rmse_validate = mean_squared_error(y_validate['value'], y_validate['value_pred_mean']) ** 0.5

# Results table that later cells extend with one row per model.
# Note: the 'R2_validate' column holds explained_variance_score, which matches
# R^2 only when the residuals have zero mean.
metric_df = pd.DataFrame([
    {
        'model': "mean_baseline",
        'RMSE_train': rmse_train,
        'RMSE_validate': rmse_validate,
        'R2_validate': explained_variance_score(
            y_validate['value'], y_validate['value_pred_mean']
        ),
    },
])

In [12]:
# Ordinary least squares fitted on the training features.
lm = LinearRegression()
OLSmodel = lm.fit(X_train, y_train.value)

# Store predictions next to the actual values for both samples.
y_train['value_pred_ols'] = lm.predict(X_train)
y_validate['value_pred_ols'] = lm.predict(X_validate)

# RMSE in-sample (train) and out-of-sample (validate).
rmse_train_ols = mean_squared_error(y_train.value, y_train.value_pred_ols) ** .5
rmse_validate_ols = mean_squared_error(y_validate.value, y_validate.value_pred_ols) ** .5

# Record the metrics. DataFrame.append was removed in pandas 2.0, so a one-row
# frame is concatenated instead. Any existing 'ols' row is dropped first so that
# re-running this cell does not duplicate the entry.
metric_df = pd.concat(
    [
        metric_df[metric_df['model'] != 'ols'],
        pd.DataFrame([{
            'model': 'ols',
            'RMSE_train': rmse_train_ols,
            'RMSE_validate': rmse_validate_ols,
            'R2_validate': explained_variance_score(y_validate.value, y_validate.value_pred_ols),
        }]),
    ],
    ignore_index=True,
)

print(f"""RMSE for OLS using LinearRegression
        Training/In-Sample:  {rmse_train_ols:.2f} 
        Validation/Out-of-Sample: {rmse_validate_ols:.2f}\n""")

RMSE for OLS using LinearRegression
    Training/In-Sample:  87320.10 
    Validation/Out-of-Sample: 91122.45



In [13]:
# LassoLars (L1-regularised linear model) fitted on the training features.
# NOTE(review): the recorded RMSE for this model is identical to the OLS figures,
# so alpha=0.03 is presumably negligible at the scale of the target — confirm
# whether a larger alpha or scaled features were intended.
lars = LassoLars(alpha=0.03)
Larsmodel = lars.fit(X_train, y_train.value)

# Store predictions next to the actual values for both samples.
y_train['value_pred_lars'] = lars.predict(X_train)
y_validate['value_pred_lars'] = lars.predict(X_validate)

# RMSE in-sample (train) and out-of-sample (validate).
rmse_train_lars = mean_squared_error(y_train.value, y_train.value_pred_lars) ** .5
rmse_validate_lars = mean_squared_error(y_validate.value, y_validate.value_pred_lars) ** .5

# Record the metrics. DataFrame.append was removed in pandas 2.0, so a one-row
# frame is concatenated instead. Any existing row for this model is dropped first
# so that re-running this cell does not duplicate the entry.
metric_df = pd.concat(
    [
        metric_df[metric_df['model'] != 'lasso_alpha0.03'],
        pd.DataFrame([{
            'model': 'lasso_alpha0.03',
            'RMSE_train': rmse_train_lars,
            'RMSE_validate': rmse_validate_lars,
            'R2_validate': explained_variance_score(y_validate.value, y_validate.value_pred_lars),
        }]),
    ],
    ignore_index=True,
)

print(f"""RMSE for LassoLars
        Training/In-Sample:  {rmse_train_lars:.2f} 
        Validation/Out-of-Sample: {rmse_validate_lars:.2f}\n""")

RMSE for LassoLars
        Training/In-Sample:  87320.10 
        Validation/Out-of-Sample: 91122.45



In [14]:
# Tweedie GLM with power=1 and alpha=1.0, fitted on the training features.
# NOTE(review): the recorded RMSE for this model is several times the OLS figure.
# Warnings are globally suppressed in the import cell, so a solver convergence
# warning would be hidden — confirm the fit converged before trusting this row.
tr = TweedieRegressor(power=1, alpha=1.0)
Tweediemodel = tr.fit(X_train, y_train.value)

# Store predictions next to the actual values for both samples.
y_train['value_pred_tweedie'] = tr.predict(X_train)
y_validate['value_pred_tweedie'] = tr.predict(X_validate)

# RMSE in-sample (train) and out-of-sample (validate).
rmse_train_tweedie = mean_squared_error(y_train.value, y_train.value_pred_tweedie) ** .5
rmse_validate_tweedie = mean_squared_error(y_validate.value, y_validate.value_pred_tweedie) ** .5

# Record the metrics. DataFrame.append was removed in pandas 2.0, so a one-row
# frame is concatenated instead. Any existing row for this model is dropped first
# so that re-running this cell does not duplicate the entry.
metric_df = pd.concat(
    [
        metric_df[metric_df['model'] != 'tweedie_power1_alpha1.0'],
        pd.DataFrame([{
            'model': 'tweedie_power1_alpha1.0',
            'RMSE_train': rmse_train_tweedie,
            'RMSE_validate': rmse_validate_tweedie,
            'R2_validate': explained_variance_score(y_validate.value, y_validate.value_pred_tweedie),
        }]),
    ],
    ignore_index=True,
)

print(f"""RMSE for TweedieRegressor
        Training/In-Sample:  {rmse_train_tweedie:.2f} 
        Validation/Out-of-Sample: {rmse_validate_tweedie:.2f}\n""")

RMSE for TweedieRegressor
        Training/In-Sample:  676521.52 
        Validation/Out-of-Sample: 695563.06

