### Using CV with:
  - Linear Regression
  - Decision Tree classification
  

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree 
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes

# Estimating the Generalization Error of Linear Regression with K-fold Cross-Validation

## Load in data set
[Data Set](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html)

In [2]:
# Load the diabetes data set once: with return_X_y and as_frame set, sklearn
# hands back the feature DataFrame and the target Series as a pair.
df_X, s_y = load_diabetes(return_X_y = True, as_frame = True)
df_X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


## Linear least squares regression model function 

In [3]:
# Returns LSR Model
def get_linear_regression_model( df_X, s_y ):
    """Fit a least-squares linear model with an intercept term.

    Parameters
    ----------
    df_X : DataFrame or 2-D array-like, one row per sample.
    s_y : Series, one-column DataFrame or array-like with one target per row
        of ``df_X`` (matched by position, not by index label).

    Returns
    -------
    numpy.ndarray
        The coefficient estimates; the first entry is the intercept, followed
        by one coefficient per column of ``df_X``. The result is 1-D for a
        1-D ``s_y`` and a column for a one-column 2-D ``s_y``.
    """
    features = np.asarray(df_X)
    # Prepend the column of ones positionally. Joining on the index would
    # reorder the rows of a shuffled frame relative to s_y.
    design = np.column_stack([np.ones(len(features)), features])
    return np.linalg.lstsq(design, np.asarray(s_y), rcond = -1)[0]

In [4]:
# Check beta_hat on a small random problem; the seed fixes the draws below
np.random.seed(23)
beta_hat = get_linear_regression_model(
    pd.DataFrame(np.random.random((34,4))),   # 34 samples x 4 features (drawn first)
    pd.Series(np.random.random(34)*10.0),     # 34 targets (drawn second)
)
beta_hat

array([ 4.18818425,  1.77890808,  0.74032569, -1.3506416 ,  0.14535984])

## Partition function

In [5]:
# returns fold dictionaries: partitions = ({1: sub-df, 2: sub-df ...}, {1: sub-df, 2: sub-df ...})
def partition_data( df_X, s_y, k, random_state = None ):
    """Shuffle the rows and split them into ``k`` folds of near-equal size.

    Parameters
    ----------
    df_X : DataFrame of features.
    s_y : named Series of targets sharing ``df_X``'s index (it is joined on as
        the last column before shuffling so rows stay paired).
    k : number of folds.
    random_state : optional seed forwarded to ``DataFrame.sample``. The default
        ``None`` keeps the original unseeded shuffle.

    Returns
    -------
    tuple(dict, dict)
        ``(features, targets)`` keyed by fold number ``1..k``. Feature folds
        hold every column of ``df_X``; target folds are one-column DataFrames
        (sliced with ``iloc[:, -1:]``). When the row count is not divisible by
        ``k`` the earlier folds get one extra row.
    """
    shuffled = df_X.join(s_y).sample(frac = 1, random_state = random_state)
    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    position_chunks = np.array_split(np.arange(len(shuffled)), k)
    partitions = ({}, {})
    for fold, positions in enumerate(position_chunks, start = 1):
        chunk = shuffled.iloc[positions]
        partitions[0][fold] = chunk.iloc[:, :-1]
        partitions[1][fold] = chunk.iloc[:, -1:]
    return partitions

In [6]:
# Shuffle the data into 5 folds, then preview the features of fold 1
dict_k_df_X, dict_k_s_y = partition_data(df_X, s_y, k = 5)
dict_k_df_X[1]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
133,-0.041840,0.050680,-0.053630,-0.040099,-0.084126,-0.071772,-0.002903,-0.039493,-0.072128,-0.030072
198,-0.052738,-0.044642,0.054152,-0.026328,-0.055231,-0.033881,-0.013948,-0.039493,-0.074089,-0.059067
158,-0.012780,-0.044642,-0.065486,-0.069938,0.001183,0.016849,-0.002903,-0.007020,-0.030751,-0.050783
21,-0.085430,0.050680,-0.022373,0.001215,-0.037344,-0.026366,0.015505,-0.039493,-0.072128,-0.017646
319,0.019913,-0.044642,0.004572,0.045972,-0.018080,-0.054549,0.063367,-0.039493,0.028661,0.061054
...,...,...,...,...,...,...,...,...,...,...
13,0.005383,0.050680,-0.001895,0.008101,-0.004321,-0.015719,-0.002903,-0.002592,0.038393,-0.013504
386,0.019913,-0.044642,-0.040696,-0.015999,-0.008449,-0.017598,0.052322,-0.039493,-0.030751,0.003064
64,0.067136,0.050680,-0.025607,-0.040099,-0.063487,-0.059873,-0.002903,-0.039493,-0.019197,0.011349
395,-0.060003,-0.044642,0.001339,-0.029771,-0.007073,-0.021669,0.011824,-0.002592,0.031815,-0.054925


In [7]:
# Check fold sizes: every fold's feature and target partitions must have the
# same length, and the folds together must cover every original row.
n_rows_in_folds = 0  # named so the builtin `sum` is not shadowed
for i in dict_k_df_X:
    # The s_y length is read from the target dictionary, not the feature one
    print(f'Fold {i}: df_X length = {len(dict_k_df_X[i])} and s_y = {len(dict_k_s_y[i])}')
    n_rows_in_folds += len(dict_k_df_X[i])
print(f'The sum of the number of elements in each fold is {n_rows_in_folds} and there are {len(df_X)} rows in the original df' )

Fold 1: df_X length = 89 and s_y = 89
Fold 2: df_X length = 89 and s_y = 89
Fold 3: df_X length = 88 and s_y = 88
Fold 4: df_X length = 88 and s_y = 88
Fold 5: df_X length = 88 and s_y = 88
The sum of the number of elements in each fold is 442 and there are 442 rows in the original df


## MAE function
$MAE = \sum\limits_{i=1}^n\frac{|{s\_y_i - {s\_y\_hat}_i}|}{n}$ 



In [8]:
def get_mae(s_y, s_y_hat):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    s_y : array-like of observed values (Series, 1-D array or column vector).
    s_y_hat : array-like of predicted values with the same number of elements.

    Returns
    -------
    float
        The mean of ``|s_y_i - s_y_hat_i|`` over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in number of elements.
    """
    # Flatten to plain arrays so elements are paired by position: indexing a
    # pandas Series with s_y[i] is label-based and mispairs a shuffled index.
    actual = np.asarray(s_y, dtype = float).ravel()
    predicted = np.asarray(s_y_hat, dtype = float).ravel()
    if actual.size != predicted.size:
        raise ValueError(f'length mismatch: {actual.size} targets vs {predicted.size} predictions')
    if actual.size == 0:
        raise ValueError('cannot compute the MAE of empty inputs')
    return float(np.mean(np.abs(actual - predicted)))

In [9]:
# Test: only the first pair differs (|1 - 2| = 1), so the MAE over 3 values is 1/3
x, y = np.array([1,2,3]), np.array([2,2,3])
get_mae(x, y)

0.3333333333333333

## Get $MAE$ for each fold

In [10]:
# K-fold cross validation: each fold is held out once as the test set while the
# remaining folds are used to fit the regression.
fold_maes = []
for k in dict_k_df_X.keys():
    # Concat training arrays (every fold except k)
    train_X = pd.DataFrame(np.concatenate([fold for (fold_id, fold) in dict_k_df_X.items() if fold_id != k]))
    train_y = pd.DataFrame(np.concatenate([fold for (fold_id, fold) in dict_k_s_y.items() if fold_id != k]))
    # Get beta_hat from training arrays
    beta_hat = get_linear_regression_model(train_X, train_y)
    # Add intercept col to test_X (index reset so the join lines up row by row)
    test_X = dict_k_df_X[k].reset_index(drop = True)
    test_X = pd.DataFrame({'intercept': np.ones(len(test_X))}).join(test_X)
    # Get s_y_hat from test set
    s_y_hat = np.matmul(np.array(test_X), np.array(beta_hat))
    # Calc mae using the held-out targets (with reset index) and s_y_hat.
    # Kept in its own name so the full target Series `s_y` is not overwritten.
    test_y = np.array(dict_k_s_y[k].reset_index(drop = True))
    fold_maes.append(get_mae(test_y, s_y_hat))
# One MAE per fold, as an array so min/max/mean are available downstream
mae = np.array(fold_maes)
mae

array([41.87109201, 42.88369483, 43.73687201, 46.46223755, 47.02704001])

In [11]:
# Summarise the per-fold MAE values: smallest, largest and average
mae_summary = (mae.min(), mae.max(), mae.mean())
print("The min MAE is {:.2f}, the max MAE is {:.2f}, and the mean MAE is {:.2f}".format(*mae_summary))

The min MAE is 41.87, the max MAE is 47.03, and the mean MAE is 44.40


#  Find the best Decision Tree hyperparameter

## Load iris data
[Data](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html)

In [12]:
# Load the iris data set once: with return_X_y and as_frame set, sklearn hands
# back the feature DataFrame and the target Series as a pair.
df_X, s_y = load_iris(return_X_y = True, as_frame = True)

## Partition `df_X` and `s_y`

In [13]:
# Shuffle the iris data into 5 folds and check the fold sizes
(dict_k_df_X, dict_k_s_y) = partition_data(df_X, s_y, 5)
n_rows_in_folds = 0  # named so the builtin `sum` is not shadowed
for i in dict_k_df_X:
    # The s_y length is read from the target dictionary, not the feature one
    print(f'Fold {i}: df_X length = {len(dict_k_df_X[i])} and s_y = {len(dict_k_s_y[i])}')
    n_rows_in_folds += len(dict_k_df_X[i])
print(f'The sum of the number of elements in each fold is {n_rows_in_folds} and there are {len(df_X)} rows in the original df' )

Fold 1: df_X length = 30 and s_y = 30
Fold 2: df_X length = 30 and s_y = 30
Fold 3: df_X length = 30 and s_y = 30
Fold 4: df_X length = 30 and s_y = 30
Fold 5: df_X length = 30 and s_y = 30
The sum of the number of elements in each fold is 150 and there are 150 rows in the original df


## Accuracy function

In [14]:
def get_acc(s_1, s_2):
    """Return the fraction of positions at which two label sequences agree.

    Parameters
    ----------
    s_1, s_2 : array-likes of labels with the same number of elements
        (Series, 1-D arrays or column vectors); compared position by position.

    Returns
    -------
    float
        Number of matching positions divided by the number of elements.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in number of elements.
    """
    # Flatten to plain arrays so elements are paired by position: indexing a
    # pandas Series with s_1[i] is label-based and fails on a shuffled index.
    labels_1 = np.asarray(s_1).ravel()
    labels_2 = np.asarray(s_2).ravel()
    if labels_1.size != labels_2.size:
        raise ValueError(f'length mismatch: {labels_1.size} vs {labels_2.size} labels')
    if labels_1.size == 0:
        raise ValueError('cannot compute the accuracy of empty inputs')
    return float(np.mean(labels_1 == labels_2))

In [15]:
# Sanity check: score a constant "always class 1" prediction against the true labels
all_ones = np.ones(len(s_y))
get_acc(s_y, all_ones)

0.3333333333333333

## Using Nested Cross validation, find the best hyperparameter
Using:\
[Decision Tree Classifier](https://scikit-learn.org/stable/modules/tree.html#classification)

In [16]:
possible_min_impurity_decrease = np.array([0.1,0.25,0.3,0.4])

# Outer loop: each fold k is held out once as the test set; the remaining folds
# are used both to choose the hyperparameter (inner loop) and to fit the final tree.
outer_accs = []
for k in dict_k_df_X.keys():
    print()
    train_partitions_X = [v for (n, v) in dict_k_df_X.items() if n != k]
    train_partitions_y = [v for (n, v) in dict_k_s_y.items() if n != k]
    n_inner_folds = len(train_partitions_X)
    # Choosing best m_i_d: highest average inner accuracy, ties go to the smaller value
    max_acc = float('-inf')
    min_decrease = float('inf')

    for pos_min_impurity in possible_min_impurity_decrease:
        print(f'Testing {pos_min_impurity} min impurity decrease')

        # Inner cross validation over the training folds only (fold k is never seen here)
        s = 0
        for i in range(n_inner_folds):
            # Get mini train partitions (every training fold except i). The features
            # stay DataFrames so fit and predict see the same column names.
            small_train_partition_X = pd.concat([x for (idx, x) in enumerate(train_partitions_X) if idx != i])
            small_train_partition_y = np.concatenate([y for (idx, y) in enumerate(train_partitions_y) if idx != i]).ravel()
            # Create and fit tree
            clf = tree.DecisionTreeClassifier(min_impurity_decrease = pos_min_impurity)
            clf.fit(small_train_partition_X, small_train_partition_y)
            # Predict on the held-out inner fold
            small_y_hat = clf.predict(train_partitions_X[i])
            # add accuracy to sum
            s += get_acc(np.array(train_partitions_y[i]), small_y_hat)

        # Average once, dividing by the number of inner folds, so the comparison
        # and the tie-break below use the same value.
        avg_acc = s / n_inner_folds
        print(f'    Average accuracy over {n_inner_folds} folds is {round(avg_acc, 2)}')
        if avg_acc > max_acc:
            max_acc = avg_acc
            min_decrease = pos_min_impurity
        elif avg_acc == max_acc and pos_min_impurity < min_decrease:
            min_decrease = pos_min_impurity

    # Use best min impurity decrease to train model on all training folds
    train_X = pd.concat(train_partitions_X)
    train_y = np.concatenate(train_partitions_y).ravel()
    clf = tree.DecisionTreeClassifier(min_impurity_decrease = min_decrease)
    clf.fit(train_X, train_y)
    # Predict on the outer held-out fold
    y_hat = clf.predict(dict_k_df_X[k])
    # Outer accuracy calculation
    this_acc = get_acc(y_hat, np.array(dict_k_s_y[k]))
    outer_accs.append(this_acc)

# One accuracy per outer fold, as an array so min/max/mean are available downstream
outer_acc = np.array(outer_accs)


Testing 0.1 min impurity decrease
    Average accuracy over 4 folds is 0.94
Testing 0.25 min impurity decrease
    Average accuracy over 4 folds is 0.84
Testing 0.3 min impurity decrease
    Average accuracy over 4 folds is 0.63
Testing 0.4 min impurity decrease
    Average accuracy over 4 folds is 0.28

Testing 0.1 min impurity decrease
    Average accuracy over 4 folds is 0.95
Testing 0.25 min impurity decrease
    Average accuracy over 4 folds is 0.95
Testing 0.3 min impurity decrease
    Average accuracy over 4 folds is 0.64
Testing 0.4 min impurity decrease
    Average accuracy over 4 folds is 0.27

Testing 0.1 min impurity decrease
    Average accuracy over 4 folds is 0.93
Testing 0.25 min impurity decrease
    Average accuracy over 4 folds is 0.93
Testing 0.3 min impurity decrease
    Average accuracy over 4 folds is 0.58
Testing 0.4 min impurity decrease
    Average accuracy over 4 folds is 0.26

Testing 0.1 min impurity decrease
    Average accuracy over 4 folds is 0.93
Testi

## Generalization performance of the classifier

In [17]:
# Summarise the held-out accuracy across the outer folds
lowest, highest, average = outer_acc.min(), outer_acc.max(), outer_acc.mean()
print(f'The minimum accuracy of the outer fold set is {lowest}.')
print(f'The maximum accuracy of the outer fold set is {highest}.')
print(f'The mean accuracy of the outer fold set is {average}.')

The minimum accuracy of the outer fold set is 0.8666666666666667.
The maximum accuracy of the outer fold set is 0.9666666666666667.
The mean accuracy of the outer fold set is 0.9333333333333333.
