In [10]:
import pandas as pd
import numpy as np

In [16]:
# Load the Auto data from the working directory and preview the first five rows.
# The file's leading unnamed column is read in as an ordinary data column
# (it shows up as "Unnamed: 0" in the preview).
Auto = pd.read_csv(filepath_or_buffer='auto.csv')
Auto.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [17]:
# Binary size label: a car is 'small' when its displacement is at or below the
# sample mean, and 'big' when it is strictly above it.
small_index = Auto['displacement'].le(np.mean(Auto['displacement']))

Auto.loc[small_index, 'displacement_binary'] = 'small'
Auto.loc[~small_index, 'displacement_binary'] = 'big'

Auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,displacement_binary
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,big
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,big
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,big
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,big
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,big


In [18]:
# Numeric version of the label for modelling: 1 for 'big', 0 otherwise.
Auto['displacement_big'] = np.where(Auto['displacement_binary'].eq('big'), 1, 0)

In [19]:
# KFold is the plain splitter (used for regression below); StratifiedKFold is
# the variant recommended for classification.
from sklearn.model_selection import KFold, StratifiedKFold

# Ten folds, shuffled before splitting, with a fixed seed for reproducibility.
kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [20]:
# Show the splitter's configuration (number of splits, seed, shuffle flag).
print(kfolds)

StratifiedKFold(n_splits=10, random_state=1, shuffle=True)


In [24]:
# Inspect only the first split: the row positions assigned to the training
# part and to the held-out part of that fold.
train_index, test_index = next(iter(kfolds.split(Auto, Auto['displacement_big'])))
print(f'train_index: {train_index}\n\ntest_index: {test_index}')

train_index: [  0   1   2   3   4   5   6   8   9  12  13  14  15  16  17  18  19  20
  21  22  24  25  26  27  28  29  30  31  32  33  34  35  36  38  39  40
  41  43  44  45  46  47  48  49  50  51  52  53  54  56  57  58  59  61
  62  63  64  65  66  67  68  69  70  71  72  73  74  76  77  78  79  81
  82  83  84  85  87  88  89  90  91  92  93  94  95  96  97  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 118 119 120 121
 122 124 125 127 128 129 130 131 133 134 135 136 137 138 139 141 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 159 160 161 162 163
 164 165 166 167 168 169 170 171 173 174 175 176 177 178 179 180 181 182
 183 184 185 186 187 188 190 191 192 194 195 196 197 198 199 200 201 202
 203 204 205 206 207 208 209 210 211 212 214 215 216 217 218 219 220 221
 222 223 224 225 226 227 228 230 231 232 233 234 236 237 238 240 241 242
 243 244 245 246 247 248 249 250 251 252 253 255 256 258 259 260 261 263
 264 265 266 267 268 269 270 271 272 2

In [26]:
# Per-fold results for model 1: misclassification rates and AUC values.
cv_classification_errors_1, cv_auc_1 = [], []

In [27]:
import statsmodels.formula.api as smf
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Manual 10-fold CV for model 1: displacement_big ~ mpg + horsepower.
# Start from empty accumulators so that re-running this cell does not append a
# second set of fold results to the lists.
cv_classification_errors_1 = []
cv_auc_1 = []

for train_index, test_index in kfolds.split(Auto,Auto['displacement_big']):
    # kfolds.split yields row *positions*, so rows are selected with .iloc.
    # Label-based selection (.loc, or the `subset=` argument of smf.logit)
    # only picks the intended rows while the index is exactly 0..n-1.
    train_data = Auto.iloc[train_index]
    test_data = Auto.iloc[test_index]

    # train the logistic model on the training part of the fold
    result = smf.logit('displacement_big ~ mpg + horsepower', data = train_data).fit()

    # predictors and true labels of the held-out part of the fold
    X_test = test_data[['mpg','horsepower']]
    y_test = test_data['displacement_big']

    # compute the predicted probabilities for the test data
    result_prob = result.predict(X_test)
    # select 0.5 as the threshold
    result_pred = (result_prob > 0.5)
    # classification error: share of test rows whose predicted class is wrong
    classification_error = np.mean(result_pred != y_test)
    # add the computed classification error to cv_classification_errors_1 to store the result
    cv_classification_errors_1.append(classification_error)

    # calculate the auc from the ROC curve of the predicted probabilities
    fpr,tpr,threshold = roc_curve(y_test,result_prob)
    roc_auc = auc(fpr,tpr)
    # add the computed auc to cv_auc_1 to store the result
    cv_auc_1.append(roc_auc)

Optimization terminated successfully.
         Current function value: 0.224324
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.240459
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.221340
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.232408
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.224961
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.227415
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.218369
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.220679
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.202915
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.229121
  

In [29]:
# Per-fold misclassification rates collected in cv_classification_errors_1, then their mean.
print(f"classification errors using 10-fold CV: {cv_classification_errors_1}\n")
print(f"mean of classification errors using 10-fold CV: {np.mean(cv_classification_errors_1)}\n")

classification errors using 10-fold CV: [0.125, 0.025, 0.1282051282051282, 0.05128205128205128, 0.1282051282051282, 0.05128205128205128, 0.15384615384615385, 0.10256410256410256, 0.07692307692307693, 0.10256410256410256]

mean of classification errors using 10-fold CV: 0.09448717948717947



In [30]:
# Per-fold AUC values collected in cv_auc_1, then their mean.
print(f'auc using 10-fold CV: {cv_auc_1}\n')
print(f'mean of auc using 10-fold CV: {np.mean(cv_auc_1)}\n')

auc using 10-fold CV: [0.9744245524296675, 1.0, 0.9545454545454546, 0.9893048128342246, 0.9679144385026738, 0.9705882352941176, 0.9598930481283422, 0.9572192513368984, 0.93048128342246, 0.981283422459893]

mean of auc using 10-fold CV: 0.9685654498953731



In [34]:
# Per-fold results for model 2: misclassification rates and AUC values.
cv_classification_errors_2, cv_auc_2 = [], []

In [35]:
import statsmodels.formula.api as smf
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Manual 10-fold CV for model 2: displacement_big ~ weight + acceleration.
# Start from empty accumulators so that re-running this cell does not append a
# second set of fold results to the lists.
cv_classification_errors_2 = []
cv_auc_2 = []

for train_index, test_index in kfolds.split(Auto,Auto['displacement_big']):
    # kfolds.split yields row *positions*, so rows are selected with .iloc.
    # Label-based selection (.loc, or the `subset=` argument of smf.logit)
    # only picks the intended rows while the index is exactly 0..n-1.
    train_data = Auto.iloc[train_index]
    test_data = Auto.iloc[test_index]

    # train the logistic model on the training part of the fold
    result = smf.logit('displacement_big ~ weight + acceleration', data=train_data).fit()
    # predictors and true labels of the held-out part of the fold
    X_test = test_data[["weight","acceleration"]]
    y_test = test_data["displacement_big"]
    # compute the probabilities of test data
    result_prob = result.predict(X_test)
    # select 0.5 as the threshold
    result_pred = (result_prob > 0.5)
    # compute the classification error: share of test rows whose predicted class is wrong
    classification_error = np.mean(result_pred != y_test)
    # add the computed classification error to "cv_classification_errors_2" to store the result
    cv_classification_errors_2.append(classification_error)
    # calculate the auc from the ROC curve of the predicted probabilities
    fpr,tpr,threshold = roc_curve(y_test, result_prob)
    roc_auc = auc(fpr,tpr)
    # add the computed auc to "cv_auc_2" to store the result
    cv_auc_2.append(roc_auc)

Optimization terminated successfully.
         Current function value: 0.166688
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.176276
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.160031
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.173053
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.166417
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.151830
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.141794
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.168865
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.174350
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.159999


In [36]:
# Per-fold misclassification rates collected in cv_classification_errors_2, then their mean.
print(f"classification errors using 10-fold CV: {cv_classification_errors_2}\n")
print(f"mean of classification errors using 10-fold CV: {np.mean(cv_classification_errors_2)}")

classification errors using 10-fold CV: [0.1, 0.025, 0.15384615384615385, 0.02564102564102564, 0.05128205128205128, 0.10256410256410256, 0.1282051282051282, 0.05128205128205128, 0.02564102564102564, 0.1282051282051282]

mean of classification errors using 10-fold CV: 0.07916666666666666


In [37]:
# Per-fold AUC values collected in cv_auc_2, then their mean.
print(f"auc using 10-fold CV: {cv_auc_2}\n")
print(f"mean of auc using 10-fold CV: {np.mean(cv_auc_2)}")

auc using 10-fold CV: [0.9872122762148338, 1.0, 0.9759358288770053, 1.0, 0.9893048128342246, 0.9545454545454546, 0.9679144385026738, 0.9973262032085561, 1.0, 0.9759358288770053]

mean of auc using 10-fold CV: 0.9848174843059754


In [38]:
# Side-by-side summary of the two logistic models: mean CV error and mean CV AUC.

# Model 1: mpg + horsepower
print("predictor varible: mpg, horsepower; response variable: displacement_big")
print(f"mean of classification errors using 10-fold CV: {np.mean(cv_classification_errors_1)}")
print(f"mean of auc using 10-fold CV: {np.mean(cv_auc_1)}\n")

# Model 2: weight + acceleration
print("predictor varible: weight, acceleration; response variable:displacement_big")
print(f"mean of classification errors using 10-fold CV: {np.mean(cv_classification_errors_2)}")
print(f"mean of auc using 10-fold CV: {np.mean(cv_auc_2)}")

predictor varible: mpg, horsepower; response variable: displacement_big
mean of classification errors using 10-fold CV: 0.09448717948717947
mean of auc using 10-fold CV: 0.9685654498953731

predictor varible: weight, acceleration; response variable:displacement_big
mean of classification errors using 10-fold CV: 0.07916666666666666
mean of auc using 10-fold CV: 0.9848174843059754


### Quick way to do CV for models in sklearn

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Unpenalised logistic regression. The string penalty='none' is rejected by
# recent scikit-learn releases, so regularisation is switched off with
# C=np.inf instead, which fits the same unpenalised model on old and new versions.
logistic_model = LogisticRegression(C=np.inf, max_iter = 10000)

# No `scoring` is given, so cross_val_score returns the classifier's default
# score per fold. The values are printed as accuracies, so despite their names
# the error_model_*_cv arrays hold accuracies; the error is 1 - accuracy.
# NOTE(review): cv=10 does not reuse the shuffled `kfolds` splitter from the
# manual loops, so the fold composition (and the numbers) may differ from them.
error_model_1_cv = cross_val_score(logistic_model, Auto[['mpg','horsepower']], Auto['displacement_big'], cv=10)
error_model_2_cv = cross_val_score(logistic_model, Auto[['weight','acceleration']], Auto['displacement_big'], cv=10)
print("Logisgic Regression: \n")
print("accuracies of 10-folds:",error_model_1_cv,"(mean classification error:",1-np.mean(error_model_1_cv),")")
print("accuracies of 10-folds:",error_model_2_cv,"(mean classification error:",1-np.mean(error_model_2_cv),")")

Logisgic Regression: 

accuracies of 10-folds: [0.925      0.75       0.97435897 0.92307692 0.87179487 0.8974359
 0.8974359  1.         0.87179487 0.84615385] (mean classification error: 0.10429487179487162 )
accuracies of 10-folds: [0.925      0.95       1.         0.84615385 0.8974359  0.94871795
 0.92307692 0.94871795 0.87179487 0.92307692] (mean classification error: 0.07660256410256405 )


In [46]:
# Same comparison as the previous cell, with the metric requested explicitly
# via scoring='accuracy'.
# Unpenalised logistic regression. The string penalty='none' is rejected by
# recent scikit-learn releases, so regularisation is switched off with
# C=np.inf instead, which fits the same unpenalised model on old and new versions.
logistic_model = LogisticRegression(C=np.inf, max_iter = 10000)

# Despite their names, the error_model_*_cv arrays hold per-fold accuracies;
# the classification error is reported as 1 - mean accuracy.
error_model_1_cv = cross_val_score(logistic_model, Auto[['mpg','horsepower']], Auto['displacement_big'], cv=10, scoring = 'accuracy')
error_model_2_cv = cross_val_score(logistic_model, Auto[['weight','acceleration']], Auto['displacement_big'], cv=10,scoring = 'accuracy')
print("Logisgic Regression: \n")
print("accuracies of 10-folds:",error_model_1_cv,"(mean classification error:",1-np.mean(error_model_1_cv),")")
print("accuracies of 10-folds:",error_model_2_cv,"(mean classification error:",1-np.mean(error_model_2_cv),")")

Logisgic Regression: 

accuracies of 10-folds: [0.925      0.75       0.97435897 0.92307692 0.87179487 0.8974359
 0.8974359  1.         0.87179487 0.84615385] (mean classification error: 0.10429487179487162 )
accuracies of 10-folds: [0.925      0.95       1.         0.84615385 0.8974359  0.94871795
 0.92307692 0.94871795 0.87179487 0.92307692] (mean classification error: 0.07660256410256405 )


### CV for regression problems

In [49]:
from sklearn.linear_model import LinearRegression

# Metric selection in cross_val_score:
#   scoring='r2'                     -> R squared
#   scoring='neg_mean_squared_error' -> MSE
# No `scoring` is passed here, so the estimator's default score is reported.
kfolds_regresssion = KFold(n_splits=10, shuffle=True, random_state=1)
regresssion_model = LinearRegression()

# Predict the continuous displacement from each predictor pair, reusing the
# same shuffled 10-fold splitter for both models.
r2_model_1_cv, r2_model_2_cv = (
    cross_val_score(
        regresssion_model,
        Auto[predictors],
        Auto['displacement'],
        cv=kfolds_regresssion,
    )
    for predictors in (['mpg', 'horsepower'], ['weight', 'acceleration'])
)

print("Linear Regression: \n")
print("r squared of 10-folds:", r2_model_1_cv, "(mean r squared:", np.mean(r2_model_1_cv), ")")
print("r squared of 10-folds:", r2_model_2_cv, "(mean r squared:", np.mean(r2_model_2_cv), ")")

Linear Regression: 

r squared of 10-folds: [0.90009344 0.85287829 0.84442486 0.75050943 0.79788919 0.82361727
 0.81645006 0.83507344 0.82927405 0.83134877] (mean r squared: 0.8281558811130347 )
r squared of 10-folds: [0.92741689 0.83846578 0.93362245 0.88999302 0.8949359  0.88834162
 0.8851957  0.90013406 0.93578374 0.87790923] (mean r squared: 0.8971798397399839 )


In [None]:
# Regression CV with the metric requested explicitly: scoring='r2'.

from sklearn.linear_model import LinearRegression

kfolds_regresssion = KFold(n_splits=10, shuffle=True, random_state=1)
regresssion_model = LinearRegression()

# Predict the continuous displacement from each predictor pair, reusing the
# same shuffled 10-fold splitter for both models.
r2_model_1_cv, r2_model_2_cv = (
    cross_val_score(
        regresssion_model,
        Auto[predictors],
        Auto['displacement'],
        cv=kfolds_regresssion,
        scoring='r2',
    )
    for predictors in (['mpg', 'horsepower'], ['weight', 'acceleration'])
)

print("Linear Regression: \n")
print("r squared of 10-folds:", r2_model_1_cv, "(mean r squared:", np.mean(r2_model_1_cv), ")")
print("r squared of 10-folds:", r2_model_2_cv, "(mean r squared:", np.mean(r2_model_2_cv), ")")