### ***Modeling Durham County Inspection Grades ***

Notebook by Temilola Famakinwa


**Overview**

In the last notebook inspection data was processed to create training and test data from Durham county inspection data, and stored as csv files. 

In this notebook we will model the data using:
  * Linear Regression
  * K-Nearest Neighbor (KNN)
  * Support Vector Machine (SVM)
  * RanfomForest
  * XGBoost
The best model will be found by using cross validation will be used  to evaluate model accuracy and select the best one. Finally, hyperparameter tuning will be used on the best model using Random Search and Bayes Optimization to determine best hyperparameters.


In [2]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 325.1 kB/s eta 0:05:07
   ---------------------------------------- 0.1/99.8 MB 544.7 kB/s eta 0:03:04
   ---------------------------------------- 0.2/99.8 MB 1.2 MB/s eta 0:01:26
   ---------------------------------------- 0.4/99.8 MB 2.0 MB/s eta 0:00:50
   ---------------------------------------- 0.7/99.8 MB 2.8 MB/s eta 0:00:36
    --------------------------------------- 1.3/99.8 MB 4.6 MB/s eta 0:00:22
    --------------------------------------

In [25]:
# Import ML models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Import modeling tools
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix

# Import data analysis modules
import pandas as pd
import math 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
y_test = pd.read_csv('y_test.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('Xtest_scaled.csv')
X_train = pd.read_csv('Xtrain_scaled.csv')

### **Linear Regression Model**

In [33]:
print(y_test.head(2))
print(y_train.head(2))
print(X_train.head(2))
print(X_test.head(2))

   Unnamed: 0  score_sum
0        7448        0.0
1        3896       94.0
   Unnamed: 0  score_sum
0        1215      100.0
1         728        8.0
   Unnamed: 0         0         1         2         3         4         5  \
0           0  1.484854  1.482477 -0.739203 -0.148884 -0.119389  0.454282   
1           1 -0.668479 -0.674189  1.127149 -0.712652  1.202334  0.744185   

          6         7         8  ...      222       223       224       225  \
0  3.120858 -0.265396  1.870302  ... -0.13803 -0.155735 -0.104262 -0.156081   
1 -0.453834 -0.265396 -0.173483  ... -0.13803 -0.155735 -0.104262 -0.156081   

        226       227       228       229       230       231  
0 -0.118751 -0.349072 -0.112773 -0.131879 -0.134282 -0.121716  
1 -0.118751 -0.349072 -0.112773 -0.131879 -0.134282 -0.121716  

[2 rows x 233 columns]
   Unnamed: 0         0         1         2         3         4         5  \
0           0 -0.668479 -0.674189  1.127149  2.362975  1.202334  0.743246   
1         

In [34]:
# Remove Unnnamed: 0 column from every dataframe.
y_train = y_train.drop(['Unnamed: 0'], axis = 1)
y_test = y_test.drop(['Unnamed: 0'], axis = 1)
X_train = X_train.drop(['Unnamed: 0'], axis = 1)
X_test = X_test.drop(['Unnamed: 0'], axis = 1)

In [35]:
print(y_test.head(2))
print(y_train.head(2))
print(X_train.head(2))
print(X_test.head(2))

   score_sum
0        0.0
1       94.0
   score_sum
0      100.0
1        8.0
          0         1         2         3         4         5         6  \
0  1.484854  1.482477 -0.739203 -0.148884 -0.119389  0.454282  3.120858   
1 -0.668479 -0.674189  1.127149 -0.712652  1.202334  0.744185 -0.453834   

          7         8         9  ...      222       223       224       225  \
0 -0.265396  1.870302  1.618212  ... -0.13803 -0.155735 -0.104262 -0.156081   
1 -0.265396 -0.173483 -0.755277  ... -0.13803 -0.155735 -0.104262 -0.156081   

        226       227       228       229       230       231  
0 -0.118751 -0.349072 -0.112773 -0.131879 -0.134282 -0.121716  
1 -0.118751 -0.349072 -0.112773 -0.131879 -0.134282 -0.121716  

[2 rows x 232 columns]
          0         1         2         3         4         5         6  \
0 -0.668479 -0.674189  1.127149  2.362975  1.202334  0.743246 -0.367791   
1  1.484854  1.482477 -1.101162 -0.245382 -0.479859 -1.609843 -0.125307   

          7     

In [36]:
# Create set cross validation folds
kf = KFold(n_splits = 5, shuffle = True, random_state = 5)

# Instantiate linear regression object
reg = LinearRegression()

cv_results = cross_val_score(reg, X_train, y_train, cv = kf)
cv_results_reg = pd.DataFrame(cv_results, columns =['CV_Score'])
cv_results_reg['Folds'] = [1,2,3,4,5]
cv_results_reg
    
#reg.fit(X_train, y_train)
#predictions = reg.predict(X_train)
#plt.plot(X_train, predictions)

Traceback (most recent call last):
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 140, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\base.py", line 759, in score
    y_pred = self.predict(X)
             ^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\linear_model\_base.py", line 386, in predict
    return self._decision_function(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\linear_model\_base.py", line 369, in _decision_function
    X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
        ^^^

Unnamed: 0,CV_Score,Folds
0,,1
1,,2
2,,3
3,,4
4,,5


In [37]:
reg_cv_avg = np.mean(cv_results)
reg_cv_std = np.std(cv_results)
print('Mean of CV score for linear regression: ' + str(reg_cv_avg))
print('Standard deviation of  CV score for linear regression: ' + str(reg_cv_std))

Mean of CV score for linear regression: nan
Standard deviation of  CV score for linear regression: nan


In [38]:
knn = KNeighborsClassifier(n_neighbors=len(X_train))
cv_results = cross_val_score(knn, X_train, y_train, cv = kf)
cv_results_knn = pd.DataFrame(cv_results, columns =['CV_Score'])
cv_results_knn['Folds'] = [1,2,3,4,5]
cv_results_knn

  return self._fit(X, y)
Traceback (most recent call last):
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 140, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\base.py", line 705, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packages\sklearn\neighbors\_classification.py", line 246, in predict
    if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabby\anaconda3\Lib\site-packa

Unnamed: 0,CV_Score,Folds
0,,1
1,,2
2,,3
3,,4
4,,5


In [39]:
knn_cv_avg = np.mean(cv_results)
knn_cv_std = np.std(cv_results)
print('Mean of CV score for KNN: ' + str(knn_cv_avg))
print('Standard deviation of  CV score for KNN: ' + str(knn_cv_std))

Mean of CV score for KNN: nan
Standard deviation of  CV score for KNN: nan


0      0
1      0
2      0
3      0
4      1
      ..
227    0
228    0
229    0
230    0
231    0
Length: 232, dtype: int64