# Cross Validation Exercises

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

import warnings; warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv("gapminder1.csv")
df.head()

Unnamed: 0,year,country,measure,measurement
0,1955,Afghanistan,pop,8891209.0
1,1960,Afghanistan,pop,9829450.0
2,1965,Afghanistan,pop,10997885.0
3,1970,Afghanistan,pop,12430623.0
4,1975,Afghanistan,pop,14132019.0


In [3]:
df = df.pivot_table(values="measurement", index=["year", "country"], columns="measure").reset_index()

In [4]:
df.head(5)

measure,year,country,fertility,life_expect,pop
0,1955,Afghanistan,7.7,30.332,8891209.0
1,1955,Argentina,3.1265,64.399,18927821.0
2,1955,Aruba,5.15,64.381,53865.0
3,1955,Australia,3.406,70.33,9277087.0
4,1955,Austria,2.52,67.48,6946885.0


----

In [5]:
df_1 = pd.read_csv("gapminder2.csv")
df_1.head()

Unnamed: 0,country,life_expect_1955,life_expect_1960,life_expect_1965,life_expect_1970,life_expect_1975,life_expect_1980,life_expect_1985,life_expect_1990,life_expect_1995,...,pop_1960,pop_1965,pop_1970,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005
0,Afghanistan,30.332,31.997,34.02,36.088,38.438,39.854,40.822,41.674,41.763,...,9829450,10997885,12430623,14132019,15112149,13796928,14669339,20881480,23898198,29928987
1,Argentina,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,...,20616009,22283100,23962313,26081880,28369799,30675059,33022202,35311049,37497728,39537943
2,Aruba,64.381,66.606,68.336,70.941,71.83,74.116,74.494,74.108,73.011,...,57203,59020,59039,59390,60266,64129,66653,67836,69539,71566
3,Australia,70.33,70.93,71.1,71.93,73.49,74.74,76.32,77.56,78.83,...,10361273,11439384,12660160,13771400,14615900,15788300,17022133,18116171,19164620,20090437
4,Austria,67.48,69.54,70.14,70.63,72.17,73.18,74.94,76.04,77.51,...,7047437,7270889,7467086,7578903,7549433,7559776,7722953,8047433,8113413,8184691


In [6]:
df_melt = df_1.melt(id_vars="country", var_name="measure")
df_melt.head(5)

Unnamed: 0,country,measure,value
0,Afghanistan,life_expect_1955,30.332
1,Argentina,life_expect_1955,64.399
2,Aruba,life_expect_1955,64.381
3,Australia,life_expect_1955,70.33
4,Austria,life_expect_1955,67.48


In [7]:
df_melt["year"] = df_melt.measure.str[-4:]

In [8]:
df_melt.measure = df_melt.measure.str[:-5]

In [9]:
df_melt

Unnamed: 0,country,measure,value,year
0,Afghanistan,life_expect,3.033200e+01,1955
1,Argentina,life_expect,6.439900e+01,1955
2,Aruba,life_expect,6.438100e+01,1955
3,Australia,life_expect,7.033000e+01,1955
4,Austria,life_expect,6.748000e+01,1955
...,...,...,...,...
1381,Switzerland,pop,7.489370e+06,2005
1382,Turkey,pop,6.966056e+07,2005
1383,United Kingdom,pop,6.044146e+07,2005
1384,United States,pop,2.957341e+08,2005


In [10]:
df_melt.pivot_table(values="value",index=["country", "year"], columns="measure").reset_index()
df_melt.columns.name = ' '
df_melt

Unnamed: 0,country,measure,value,year
0,Afghanistan,life_expect,3.033200e+01,1955
1,Argentina,life_expect,6.439900e+01,1955
2,Aruba,life_expect,6.438100e+01,1955
3,Australia,life_expect,7.033000e+01,1955
4,Austria,life_expect,6.748000e+01,1955
...,...,...,...,...
1381,Switzerland,pop,7.489370e+06,2005
1382,Turkey,pop,6.966056e+07,2005
1383,United Kingdom,pop,6.044146e+07,2005
1384,United States,pop,2.957341e+08,2005


In [11]:
df_melt.shape

(1386, 4)

-----

# Modeling w/ Cross Validation

In [43]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Can we predict the country based on the year, fertility, life expectancy and population?

In [35]:
# Preprocessing

df = pd.read_csv("gapminder1.csv")

df = df.pivot_table(values="measurement", index=["year", "country"], columns="measure").reset_index()

X = df[["year", "fertility", "life_expect", "pop"]]
y = df.country

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = .2)

scaler = MinMaxScaler().fit(X_train)
train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns.values).set_index([X_train.index.values])

test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns.values).set_index([X_test.index.values])

In [39]:
dtree = DecisionTreeClassifier(max_depth = 15)

cross_val_score(dtree, train_scaled, y_train, cv=4).mean()

0.4891825669898864

## Grid Search

In [36]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(1, 15),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, .e. k = 4
grid = GridSearchCV(dtree, params, cv=10)
grid.fit(train_scaled, y_train)

grid.best_params_

{'criterion': 'entropy', 'max_depth': 12}

In [37]:
# .best_estimator_ gives us a model that is prefit with the best hyperparams
model = grid.best_estimator_
model.score(test_scaled, y_test)

0.5467625899280576

In [38]:
grid.best_score_

0.5179870129870129

In [23]:
results = grid.cv_results_
# Modify each parameter dictionary such that it also contains the model's average performance after cross validation
for score, p in zip(results['mean_test_score'], results['params']):
    # adding a new key to the dictionary
    p['score'] = score

# a list of dictionaries where each dictionary has the same keys can be turned into a dataframe
results['params']

pd.DataFrame(results['params'])

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.034286
1,gini,2,0.061396
2,gini,3,0.090325
3,gini,4,0.128149
4,gini,5,0.17513
5,gini,6,0.187792
6,gini,7,0.238247
7,gini,8,0.294123
8,gini,9,0.335812
9,gini,10,0.389968


----

## Random Forest

In [44]:
rf = RandomForestClassifier()

In [45]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(1, 15),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, .e. k = 4
grid = GridSearchCV(rf, params, cv=10)
grid.fit(train_scaled, y_train)

grid.best_params_

{'criterion': 'entropy', 'max_depth': 11}

In [49]:
# .best_estimator_ gives us a model that is prefit with the best hyperparams
model = grid.best_estimator_
model.score(test_scaled, y_test)

0.539568345323741

In [50]:
grid.best_score_

0.5649350649350648

In [51]:
results = grid.cv_results_

# Modify each parameter dictionary such that it also contains the model's average performance after cross validation
for score, p in zip(results['mean_test_score'], results['params']):
    # adding a new key to the dictionary
    p['score'] = score

# a list of dictionaries where each dictionary has the same keys can be turned into a dataframe
results['params']

pd.DataFrame(results['params'])

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.15526
1,gini,2,0.213084
2,gini,3,0.270812
3,gini,4,0.314221
4,gini,5,0.364708
5,gini,6,0.400649
6,gini,7,0.420747
7,gini,8,0.465877
8,gini,9,0.507338
9,gini,10,0.516136


-----

## How does that compare with the train / validate / test split?

In [24]:
# Preprocessing

df = pd.read_csv("gapminder1.csv")

df = df.pivot_table(values="measurement", index=["year", "country"], columns="measure").reset_index()

In [25]:
train, test = train_test_split(df, random_state = 123, train_size=.75)

In [26]:
train, validate = train_test_split(train, random_state = 123, train_size=.75)

In [27]:
assert(train.shape[1] == validate.shape[1])
assert(train.shape[1] == test.shape[1])

In [28]:
X_train = train.drop(columns="country")
y_train = train.country
X_validate = validate.drop(columns="country")
y_validate = validate.country
X_test = test.drop(columns = "country")
y_test = test.country

In [29]:
def return_values(scaler, train, validate, test):
    '''
    Helper function used to updated the scaled arrays and transform them into usable dataframes
    '''
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    validate_scaled = pd.DataFrame(scaler.transform(validate), columns=validate.columns.values).set_index([validate.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, validate_scaled, test_scaled

# Linear scaler
def min_max_scaler(train,validate, test):
    '''
    Helper function that scales that data. Returns scaler, as well as the scaled dataframes
    '''
    scaler = MinMaxScaler().fit(test)
    scaler, train_scaled, validate_scaled, test_scaled = return_values(scaler, train, validate, test)
    return scaler, train_scaled, validate_scaled, test_scaled


In [30]:
# scale features
_, X_train_scaled,X_validate_scaled, X_test_scaled = min_max_scaler(X_train, X_validate, X_test)

In [31]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

def run_clf(X_train, y_train, max_depth):
    '''
    Function used to create and fit decision tree models. It requires a max_depth parameter. Returns model and predictions.
    '''
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=123)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    return clf, y_pred

def accuracy_report(model, y_pred, y_train):
    '''
    Main function used to create printable versions of the classification accuracy score, confusion matrix and classification report.
    '''
    report = classification_report(y_train, y_pred, output_dict = True)
    report = pd.DataFrame.from_dict(report)
    accuracy_score = f'Accuracy on dataset: {report.accuracy[0]:.2f}'

    labels = sorted(y_train.unique())
    matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index = labels, columns = labels)

    return accuracy_score, matrix, report

In [33]:
# Model
for i in range(1, 20):
    clf, y_pred = run_clf(X_train_scaled, y_train, i)
    score = clf.score(X_train_scaled, y_train)
    score_val = clf.score(X_validate_scaled, y_validate)
    print(f"For i {i}, score = {score:.0%}, validate = {score_val:.0%}")

For i 1, score = 5%, validate = 2%
For i 2, score = 9%, validate = 3%
For i 3, score = 16%, validate = 5%
For i 4, score = 26%, validate = 14%
For i 5, score = 42%, validate = 25%
For i 6, score = 61%, validate = 35%
For i 7, score = 78%, validate = 43%
For i 8, score = 88%, validate = 42%
For i 9, score = 96%, validate = 44%
For i 10, score = 99%, validate = 42%
For i 11, score = 100%, validate = 42%
For i 12, score = 100%, validate = 43%
For i 13, score = 100%, validate = 43%
For i 14, score = 100%, validate = 43%
For i 15, score = 100%, validate = 43%
For i 16, score = 100%, validate = 43%
For i 17, score = 100%, validate = 43%
For i 18, score = 100%, validate = 43%
For i 19, score = 100%, validate = 43%
