# Model Analysis

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, make_scorer

sns.set()

In [13]:
property_assess = pd.read_csv('assessment_per_capita.csv', dtype={'suite': str})

In [14]:
property_assess.head()

Unnamed: 0.1,Unnamed: 0,acc_num,nb,nb_id,class,value,suite,number,street,garage,...,Break and Enter,Homicide,Robbery,Sexual Assaults,Theft From Vehicle,Theft Of Vehicle,Theft Over $5000,population,num_incidents,crime_per_capita
0,1,9201484,KENILWORTH,6350.0,Residential,312500,,8503,52 STREET NW,True,...,5,0,0,0,10,13,1,2553.0,33,0.012926
1,80,9226986,KENILWORTH,6350.0,Residential,388500,,8404,71 STREET NW,True,...,5,0,0,0,10,13,1,2553.0,33,0.012926
2,256,9512732,KENILWORTH,6350.0,Residential,554500,,8317,75 STREET NW,True,...,5,0,0,0,10,13,1,2553.0,33,0.012926
3,1189,9216847,KENILWORTH,6350.0,Residential,421500,,8416,56 STREET NW,True,...,5,0,0,0,10,13,1,2553.0,33,0.012926
4,1510,9203720,KENILWORTH,6350.0,Residential,413000,,8903,68 STREET NW,True,...,5,0,0,0,10,13,1,2553.0,33,0.012926


In [15]:
property_assess.columns

Index(['Unnamed: 0', 'acc_num', 'nb', 'nb_id', 'class', 'value', 'suite',
       'number', 'street', 'garage', 'lat', 'long', 'zoning', 'lot_size',
       'year_built', 'Assault', 'Break and Enter', 'Homicide', 'Robbery',
       'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000', 'population', 'num_incidents', 'crime_per_capita'],
      dtype='object')

In [16]:
ml = property_assess[['value', 'nb_id', 'garage', 'zoning', 'lot_size', 'year_built', 'crime_per_capita']]

In [17]:
dummy_cols = ['nb_id', 'garage', 'zoning']
df = pd.get_dummies(ml, columns=dummy_cols, drop_first=True)
df.columns

Index(['value', 'lot_size', 'year_built', 'crime_per_capita', 'nb_id_1020.0',
       'nb_id_1030.0', 'nb_id_1070.0', 'nb_id_1080.0', 'nb_id_1090.0',
       'nb_id_1100.0',
       ...
       'zoning_RMD', 'zoning_RMH', 'zoning_RMU', 'zoning_RPL', 'zoning_RPLt',
       'zoning_RR', 'zoning_RSL', 'zoning_TSDR', 'zoning_TSLR', 'zoning_UCRH'],
      dtype='object', length=322)

In [18]:
X = df.drop('value', axis=1).values
y = df['value'].values

In [19]:
y.reshape(-1,1)

array([[312500],
       [388500],
       [554500],
       ...,
       [408500],
       [455500],
       [446000]])

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

In [21]:
scoring = make_scorer(mean_squared_error)

# RandomForestRegressor

In [22]:
# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('rfr', RandomForestRegressor(max_depth=20))]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'rfr__n_estimators':(10,100)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
yeg_cv = GridSearchCV(pipeline, param_grid=parameters, scoring= scoring, cv=5)

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
mse = yeg_cv.score(X_test, y_test)
print("Tuned RandomForest estimators: {}".format(yeg_cv.best_params_))
print("Tuned RandomForest Mean Squared Error: {}".format(mse))
print('Mean Error:', np.sqrt(mse))

Tuned RandomForest estimators: {'rfr__n_estimators': 10}
Tuned RandomForest Mean Squared Error: 9669791505.216879
Mean Error: 98335.09803329063


In [23]:
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1])), 'actual value:', np.atleast_2d(y_test[1]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10])), 'actual value:', np.atleast_2d(y_test[10]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[100])), 'actual value:', np.atleast_2d(y_test[100]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1000])), 'actual value:', np.atleast_2d(y_test[1000]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10000])), 'actual value:', np.atleast_2d(y_test[10000]))

predicted value: [344040.89171958] actual value: [[323000]]
predicted value: [188813.29898447] actual value: [[214000]]
predicted value: [369098.85881082] actual value: [[388500]]
predicted value: [964446.66666667] actual value: [[807000]]
predicted value: [271179.25896181] actual value: [[220000]]


# ElasticNet

In [24]:
# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
# parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}
parameters = {'elasticnet__l1_ratio':[1]}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
# yeg_cv = GridSearchCV(pipeline, param_grid=parameters)
yeg_cv = GridSearchCV(pipeline, param_grid=parameters, scoring= scoring, cv=5)

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
mse = yeg_cv.score(X_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(yeg_cv.best_params_))
print("Tuned ElasticNet Mean Squared Error: {}".format(mse))
print('Mean Error:', np.sqrt(mse))



Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 1}
Tuned ElasticNet Mean Squared Error: 24218480600.384903
Mean Error: 155622.87942453995


In [25]:
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1])), 'actual value:', np.atleast_2d(y_test[1]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10])), 'actual value:', np.atleast_2d(y_test[10]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[100])), 'actual value:', np.atleast_2d(y_test[100]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1000])), 'actual value:', np.atleast_2d(y_test[1000]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10000])), 'actual value:', np.atleast_2d(y_test[10000]))

predicted value: [391135.48925111] actual value: [[323000]]
predicted value: [174742.11326664] actual value: [[214000]]
predicted value: [292624.92776119] actual value: [[388500]]
predicted value: [1037495.46691092] actual value: [[807000]]
predicted value: [290396.64424401] actual value: [[220000]]


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

predictors = list(df.columns)
predictors.pop(0)

models = []

models.append(("LogisticRegression",LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=123)))
models.append(("SVC",SVC(gamma='scale', random_state=123)))
models.append(("LinearSVC",LinearSVC(random_state=123)))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier(random_state=123)))
models.append(("RandomForest",RandomForestClassifier(n_estimators=100, random_state=123)))
rf2 = RandomForestClassifier(n_estimators=100, criterion='gini',
                                max_depth=10, random_state=123, max_features=None)
models.append(("RandomForest2",rf2))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=123)))

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_new = imp_mean.fit_transform(X)

results = []
names = []
for name,model in models:
    result = cross_val_score(model, X_new, y,  cv=5)
    names.append(name)
    results.append(result)

for i in range(len(names)):
    print(names[i],results[i].mean())



KeyboardInterrupt: 

# SGDRegressor

In [None]:
# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('clf', SGDRegressor(max_iter = np.ceil(10**6 / len(y))))]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'clf__alpha':10.0**-np.arange(1,7),
              'clf__learning_rate':['invscaling','optimal']}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Create the GridSearchCV object: gm_cv
yeg_cv = GridSearchCV(pipeline, param_grid=parameters, scoring= scoring, cv=5)

# Fit to the training set
yeg_cv.fit(X_train, y_train)

# Compute and print the metrics
mse = yeg_cv.score(X_test, y_test)
print("Tuned SGDRegressor Alpha: {}".format(yeg_cv.best_params_))
print("Tuned SGDRegressor Mean Squared Error: {}".format(mse))
print('Mean Error:', np.sqrt(mse))

In [None]:
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1])), 'actual value:', np.atleast_2d(y_test[1]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10])), 'actual value:', np.atleast_2d(y_test[10]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[100])), 'actual value:', np.atleast_2d(y_test[100]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[1000])), 'actual value:', np.atleast_2d(y_test[1000]))
print('predicted value:', yeg_cv.predict(np.atleast_2d(X_test[10000])), 'actual value:', np.atleast_2d(y_test[10000]))

# Documentation Review and Research

[sklearn.metrics.explained_variance_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html#sklearn.metrics.explained_variance_score)

Best possible score is 1.0, lower values are worse.

In [None]:
from sklearn.metrics import explained_variance_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
explained_variance_score(y_true, y_pred)

In [None]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
explained_variance_score(y_true, y_pred, multioutput='uniform_average')

[sklearn.metrics.mean_absolute_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error)

MAE output is non-negative floating point. The best value is 0.0.

In [None]:
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_absolute_error(y_true, y_pred)

In [None]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
mean_absolute_error(y_true, y_pred)

In [None]:
mean_absolute_error(y_true, y_pred, multioutput='raw_values')

In [None]:
mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])

[sklearn.metrics.mean_squared_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error)

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [None]:
from sklearn.metrics import mean_squared_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true, y_pred)

In [None]:
y_true = [[0.5, 1],[-1, 1],[7, -6]]
y_pred = [[0, 2],[-1, 2],[8, -5]]
mean_squared_error(y_true, y_pred) 

In [None]:
mean_squared_error(y_true, y_pred, multioutput='raw_values')

In [None]:
mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])

[sklearn.metrics.mean_squared_log_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error)

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [None]:
from sklearn.metrics import mean_squared_log_error
y_true = [3, 5, 2.5, 7]
y_pred = [2.5, 5, 4, 8]
mean_squared_log_error(y_true, y_pred) 

In [None]:
y_true = [[0.5, 1], [1, 2], [7, 6]]
y_pred = [[0.5, 2], [1, 2.5], [8, 8]]
mean_squared_log_error(y_true, y_pred) 

In [None]:
mean_squared_log_error(y_true, y_pred, multioutput='raw_values')

In [None]:
mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])

[sklearn.metrics.median_absolute_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.median_absolute_error.html#sklearn.metrics.median_absolute_error)

A positive floating point value (the best value is 0.0).

In [None]:
from sklearn.metrics import median_absolute_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
median_absolute_error(y_true, y_pred)

[sklearn.metrics.r2_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score)

Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [None]:
from sklearn.metrics import r2_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
r2_score(y_true, y_pred)

In [None]:
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
r2_score(y_true, y_pred, multioutput='variance_weighted')

In [None]:
y_true = [1,2,3]
y_pred = [1,2,3]
r2_score(y_true, y_pred)

In [None]:
# A constant model that always predicts the expected value of y, 
# disregarding the input features, would get a R^2 score of 0.0.
y_true = [1,2,3]
y_pred = [2,2,2]
r2_score(y_true, y_pred)

In [None]:
# ... and it can be negative (because the model can be arbitrarily worse).
y_true = [1,2,3]
y_pred = [3,2,1]
r2_score(y_true, y_pred)