# Model Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, make_scorer

sns.set()

In [2]:
property_assess = pd.read_csv('assessment_per_capita.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
property_assess.columns

Index(['Unnamed: 0', 'acc_num', 'nb', 'nb_id', 'class', 'value', 'suite',
       'number', 'street', 'garage', 'lat', 'long', 'zoning', 'lot_size',
       'year_built', 'Assault', 'Break and Enter', 'Homicide', 'Robbery',
       'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000', 'population', 'num_incidents', 'crime_per_capita'],
      dtype='object')

In [4]:
ml = property_assess[['value', 'nb_id', 'garage', 'zoning', 'lot_size', 'year_built', 'crime_per_capita']]

In [5]:
dummy_cols = ['nb_id', 'garage', 'zoning']
df = pd.get_dummies(ml, columns=dummy_cols, drop_first=True)
df.columns

Index(['value', 'lot_size', 'year_built', 'crime_per_capita', 'nb_id_1020.0',
       'nb_id_1030.0', 'nb_id_1070.0', 'nb_id_1080.0', 'nb_id_1090.0',
       'nb_id_1100.0',
       ...
       'zoning_RMD', 'zoning_RMH', 'zoning_RMU', 'zoning_RPL', 'zoning_RPLt',
       'zoning_RR', 'zoning_RSL', 'zoning_TSDR', 'zoning_TSLR', 'zoning_UCRH'],
      dtype='object', length=322)

In [6]:
X = df.drop('value', axis=1).values
y = df['value'].values

In [7]:
y.reshape(-1,1)

array([[312500],
       [388500],
       [554500],
       ...,
       [408500],
       [455500],
       [446000]])

In [8]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# RandomForestRegressor

In [9]:
# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('rfr', RandomForestRegressor(max_depth=20))]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'rfr__n_estimators':(10,100)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
yeg_cv = GridSearchCV(pipeline, param_grid=parameters, scoring=make_scorer(mean_squared_error))

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = yeg_cv.score(X_test, y_test)
print("Tuned RandomForest estimators: {}".format(yeg_cv.best_params_))
print("Tuned RandomForest R squared: {}".format(r2))

Tuned RandomForest estimators: {'rfr__n_estimators': 100}
Tuned RandomForest R squared: 0.798896304241118


# ElasticNet

In [14]:
# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
# parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}
parameters = {'elasticnet__l1_ratio':[1]}

X_small = X[:10000]
y_small = y[:10000]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
# yeg_cv = GridSearchCV(pipeline, param_grid=parameters)
yeg_cv = GridSearchCV(pipeline, param_grid=parameters, scoring=make_scorer(mean_squared_error))

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = yeg_cv.score(X_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(yeg_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))



Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 1}
Tuned ElasticNet R squared: 43120836476.16247




In [21]:
yeg_cv.predict(np.atleast_2d(X_test[0]))

array([507916.47326502])

In [23]:
np.atleast_2d(y_test[0])

array([[429500]])

# SGDRegressor

In [10]:
# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('clf', SGDRegressor(max_iter = np.ceil(10**6 / len(y))))]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'clf__alpha':10.0**-np.arange(1,7),
              'clf__learning_rate':['invscaling','optimal']}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
yeg_cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = yeg_cv.score(X_test, y_test)
print("Tuned SGDRegressor Alpha: {}".format(yeg_cv.best_params_))
print("Tuned SGDRegressor R squared: {}".format(r2))

Tuned SGDRegressor Alpha: {'clf__alpha': 0.1, 'clf__learning_rate': 'optimal'}
Tuned SGDRegressor R squared: -332138890.67947406


In [None]:

scoring=make_scorer(mean_squared_error)

In [15]:
np.sqrt(43120836476.16247)

207655.57174360254

[sklearn.metrics.explained_variance_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html#sklearn.metrics.explained_variance_score)

Best possible score is 1.0, lower values are worse.

In [2]:
from sklearn.metrics import explained_variance_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
explained_variance_score(y_true, y_pred)

0.9571734475374732

In [6]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
explained_variance_score(y_true, y_pred, multioutput='uniform_average')

0.9838709677419355

[sklearn.metrics.mean_absolute_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error)

MAE output is non-negative floating point. The best value is 0.0.

In [7]:
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_absolute_error(y_true, y_pred)

0.5

In [8]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
mean_absolute_error(y_true, y_pred)

0.75

In [9]:
mean_absolute_error(y_true, y_pred, multioutput='raw_values')

array([0.5, 1. ])

In [10]:
mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])

0.85

[sklearn.metrics.mean_squared_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error)

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [11]:
from sklearn.metrics import mean_squared_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true, y_pred)

0.375

In [12]:
y_true = [[0.5, 1],[-1, 1],[7, -6]]
y_pred = [[0, 2],[-1, 2],[8, -5]]
mean_squared_error(y_true, y_pred) 

0.7083333333333334

In [13]:
mean_squared_error(y_true, y_pred, multioutput='raw_values')

array([0.41666667, 1.        ])

In [14]:
mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])

0.825

[sklearn.metrics.mean_squared_log_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error)

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [15]:
from sklearn.metrics import mean_squared_log_error
y_true = [3, 5, 2.5, 7]
y_pred = [2.5, 5, 4, 8]
mean_squared_log_error(y_true, y_pred) 

0.03973012298459379

In [16]:
y_true = [[0.5, 1], [1, 2], [7, 6]]
y_pred = [[0.5, 2], [1, 2.5], [8, 8]]
mean_squared_log_error(y_true, y_pred) 

0.044199361889160536

In [17]:
mean_squared_log_error(y_true, y_pred, multioutput='raw_values')

array([0.00462428, 0.08377444])

In [18]:
mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])

0.060029394179700345

[sklearn.metrics.median_absolute_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.median_absolute_error.html#sklearn.metrics.median_absolute_error)

A positive floating point value (the best value is 0.0).

In [19]:
from sklearn.metrics import median_absolute_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
median_absolute_error(y_true, y_pred)

0.5

[sklearn.metrics.r2_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score)

Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [20]:
from sklearn.metrics import r2_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
r2_score(y_true, y_pred)

0.9486081370449679

In [21]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
r2_score(y_true, y_pred, multioutput='variance_weighted')

0.9382566585956417

In [22]:
y_true = [1,2,3]
y_pred = [1,2,3]
r2_score(y_true, y_pred)

1.0

In [23]:
# A constant model that always predicts the expected value of y, 
# disregarding the input features, would get a R^2 score of 0.0.
y_true = [1,2,3]
y_pred = [2,2,2]
r2_score(y_true, y_pred)

0.0

In [24]:
# ... and it can be negative (because the model can be arbitrarily worse).
y_true = [1,2,3]
y_pred = [3,2,1]
r2_score(y_true, y_pred)

-3.0