# Cross Validation Exercises

In [110]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [88]:
df = pd.read_csv("gapminder1.csv")
df.head()

Unnamed: 0,year,country,measure,measurement
0,1955,Afghanistan,pop,8891209.0
1,1960,Afghanistan,pop,9829450.0
2,1965,Afghanistan,pop,10997885.0
3,1970,Afghanistan,pop,12430623.0
4,1975,Afghanistan,pop,14132019.0


In [89]:
df = df.pivot_table(values="measurement", index=["year", "country"], columns="measure").reset_index()

In [91]:
df.head(5)

measure,year,country,fertility,life_expect,pop
0,1955,Afghanistan,7.7,30.332,8891209.0
1,1955,Argentina,3.1265,64.399,18927821.0
2,1955,Aruba,5.15,64.381,53865.0
3,1955,Australia,3.406,70.33,9277087.0
4,1955,Austria,2.52,67.48,6946885.0


----

In [7]:
df_1 = pd.read_csv("gapminder2.csv")
df_1.head()

Unnamed: 0,country,life_expect_1955,life_expect_1960,life_expect_1965,life_expect_1970,life_expect_1975,life_expect_1980,life_expect_1985,life_expect_1990,life_expect_1995,...,pop_1960,pop_1965,pop_1970,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005
0,Afghanistan,30.332,31.997,34.02,36.088,38.438,39.854,40.822,41.674,41.763,...,9829450,10997885,12430623,14132019,15112149,13796928,14669339,20881480,23898198,29928987
1,Argentina,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,...,20616009,22283100,23962313,26081880,28369799,30675059,33022202,35311049,37497728,39537943
2,Aruba,64.381,66.606,68.336,70.941,71.83,74.116,74.494,74.108,73.011,...,57203,59020,59039,59390,60266,64129,66653,67836,69539,71566
3,Australia,70.33,70.93,71.1,71.93,73.49,74.74,76.32,77.56,78.83,...,10361273,11439384,12660160,13771400,14615900,15788300,17022133,18116171,19164620,20090437
4,Austria,67.48,69.54,70.14,70.63,72.17,73.18,74.94,76.04,77.51,...,7047437,7270889,7467086,7578903,7549433,7559776,7722953,8047433,8113413,8184691


In [15]:
df_melt = df_1.melt(id_vars="country", var_name="measure")
df_melt.head(5)

Unnamed: 0,country,measure,value
0,Afghanistan,life_expect_1955,30.332
1,Argentina,life_expect_1955,64.399
2,Aruba,life_expect_1955,64.381
3,Australia,life_expect_1955,70.33
4,Austria,life_expect_1955,67.48


In [50]:
df_melt["year"] = df_melt.measure.str[-4:]

In [53]:
df_melt.measure = df_melt.measure.str[:-5]

In [64]:
df_melt

Unnamed: 0,country,measure,value,year
0,Afghanistan,life_expect,3.033200e+01,1955
1,Argentina,life_expect,6.439900e+01,1955
2,Aruba,life_expect,6.438100e+01,1955
3,Australia,life_expect,7.033000e+01,1955
4,Austria,life_expect,6.748000e+01,1955
...,...,...,...,...
1381,Switzerland,pop,7.489370e+06,2005
1382,Turkey,pop,6.966056e+07,2005
1383,United Kingdom,pop,6.044146e+07,2005
1384,United States,pop,2.957341e+08,2005


In [68]:
df_melt.pivot_table(values="value",index=["country", "year"], columns="measure").reset_index()
df_melt.columns.name = ' '
df_melt

Unnamed: 0,country,measure,value,year
0,Afghanistan,life_expect,3.033200e+01,1955
1,Argentina,life_expect,6.439900e+01,1955
2,Aruba,life_expect,6.438100e+01,1955
3,Australia,life_expect,7.033000e+01,1955
4,Austria,life_expect,6.748000e+01,1955
...,...,...,...,...
1381,Switzerland,pop,7.489370e+06,2005
1382,Turkey,pop,6.966056e+07,2005
1383,United Kingdom,pop,6.044146e+07,2005
1384,United States,pop,2.957341e+08,2005


In [76]:
df_melt.shape

(1386, 4)

-----

# Modeling w/ Cross Validation

In [92]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

## Can we predict the country based on the year, fertility, life expectancy and population?

In [93]:
df.head()

measure,year,country,fertility,life_expect,pop
0,1955,Afghanistan,7.7,30.332,8891209.0
1,1955,Argentina,3.1265,64.399,18927821.0
2,1955,Aruba,5.15,64.381,53865.0
3,1955,Australia,3.406,70.33,9277087.0
4,1955,Austria,2.52,67.48,6946885.0


In [95]:
X = df[["year", "fertility", "life_expect", "pop"]]
y = df.country

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = .2)

In [102]:
scaler = MinMaxScaler().fit(X_train)
train_scaled = pd.DataFrame(scaler.transform(X_train), columns=train.columns.values).set_index([train.index.values])

In [105]:
test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns.values).set_index([X_test.index.values])

In [109]:
dtree = DecisionTreeClassifier(max_depth = 4)

cross_val_score(dtree, train_scaled, y_train, cv=4).mean()

0.11551193827546659

## Grid Search

In [111]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(1, 11),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dtree, params, cv=4)
grid.fit(train_scaled, y_train)

grid.best_params_

{'criterion': 'entropy', 'max_depth': 9}

In [114]:
# .best_estimator_ gives us a model that is prefit with the best hyperparams
model = grid.best_estimator_
model.score(test_scaled, y_test)

0.5467625899280576

In [113]:
grid.best_score_

0.49826660410801793

In [115]:
results = grid.cv_results_
# Modify each parameter dictionary such that it also contains the model's average performance after cross validation
for score, p in zip(results['mean_test_score'], results['params']):
    # adding a new key to the dictionary
    p['score'] = score

# a list of dictionaries where each dictionary has the same keys can be turned into a dataframe
results['params']

pd.DataFrame(results['params'])

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.03429
1,gini,2,0.061373
2,gini,3,0.086631
3,gini,4,0.115512
4,gini,5,0.153412
5,gini,6,0.187728
6,gini,7,0.232822
7,gini,8,0.281527
8,gini,9,0.323037
9,gini,10,0.362749


## How does that compare with the train / validate / test split?