In [15]:
# SETUP : importing

import numpy as np
import pandas as pd
from sklearn.datasets import load_boston


from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")


import sklearn.linear_model as lm

import utilcompute as uc
from pprint import pprint

from sklearn.model_selection import KFold


In [16]:
# SETUP : reading in the datasets

# Load the bunch once: the original called load_boston() twice, reading and
# parsing the dataset twice just to get at .data and .target.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell needs scikit-learn < 1.2 (as did the original).
boston = load_boston()

# Feature columns are named from the dataset's own metadata rather than a
# hand-typed list; the target is appended as the last column, 'MEDV'.
data = np.column_stack( (boston.data, boston.target) )
df = pd.DataFrame(data, columns = boston.feature_names.tolist() + ['MEDV'])

print('df.shape[0] : ', df.shape[0])


df.shape[0] :  506


In [17]:
# DATA EXPLORATION : summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.593761,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.596783,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.647423,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [18]:
# DATA PREPROCESSING : deleting features

# Column names to drop from df. The list is empty here, so every column is kept.
to_delete = []

# Surviving column names, in their current order.
cols = [name for name in df.columns.tolist() if name not in to_delete]
df = df.loc[:, cols]

#print('columns : ', df.columns.values.tolist())

In [19]:
# Name of the response column; a list of names is also accepted.
target = 'MEDV'

# Every column that is not a target is a feature. A scalar target is wrapped
# in a one-element list so that membership is an exact name match (a bare
# `name not in 'MEDV'` would be a substring test).
features = [
    name
    for name in df.columns.values.tolist()
    if name not in (target if isinstance(target, list) else [target])
]

#print('features : ', features)
#print('target   : ', target)

In [20]:
# DATA PREPROCESSING : features standardization

# uc is the project-local `utilcompute` module; its helpers are not visible here.
# NOTE(review): presumably this rescales the `included` columns and passes the
# `excluded` column (the target) through unchanged -- confirm in utilcompute.
df_std = uc.standardize(df = df, included = features, excluded = target)

# Variance inflation factor per feature. The recorded output shows a dict keyed
# by feature name, with RAD and TAX carrying the largest values.
vif_dict = uc.compute_vif(df = df_std, features = features)
pprint(vif_dict)


{'AGE': 3.1008597090649377,
 'B': 1.3458321362245256,
 'CHAS': 1.073942576235468,
 'CRIM': 1.7733207336404873,
 'DIS': 3.9565506707875113,
 'INDUS': 3.9911944587426742,
 'LSTAT': 2.9381269138596409,
 'NOX': 4.3950638803302251,
 'PTRATIO': 1.7992202856110953,
 'RAD': 7.48053920326217,
 'RM': 1.9341610244158234,
 'TAX': 9.0084721742418967,
 'ZN': 2.2986411129153459}


In [21]:
# DATA PREPROCESSING : vif subset selection [reduces multicollinearity]

# Switch for the VIF-based selection. When False, every feature is kept and
# df_std is left untouched.
VIF = False

if (VIF):
    # NOTE(review): selection semantics (how vif_threshold and level are used)
    # live in utilcompute and are not visible here -- confirm there.
    selected_features = uc.vif_best_subset_selection(
        vif_threshold = 5, 
        df = df_std, 
        features = features, 
        level = len(features), 
        debug = False
    )
    # Restrict the frame to the *selected* features plus the target.
    # The original built `t` from `features` (the full list), so the columns
    # rejected by the selection were never removed from df_std.
    t = uc.concatenate(selected_features, target)
    df_std = df_std[t]
    
    # Recompute the VIFs on the reduced feature set to verify the selection.
    vif_dict = uc.compute_vif(df = df_std, features = selected_features)
    pprint(vif_dict)
else:
    selected_features = features


In [22]:
# DATA PREPROCESSING : final setup

# From this cell on, `df` is the standardized frame and `features` is the
# feature list produced by the selection step above.
df, features = df_std, selected_features

print(df.columns.values)

['AGE' 'B' 'CHAS' 'CRIM' 'DIS' 'INDUS' 'LSTAT' 'MEDV' 'NOX' 'PTRATIO' 'RAD'
 'RM' 'TAX' 'ZN']


In [23]:
# GLOBAL PARAMETERS

# Number of folds (k) for the cross-validation.
# NOTE(review): presumably passed to the KFold imported in the setup cell --
# the code that uses it is outside this view.
n_splits = 10

print('k (# of folds) : ', n_splits)


k (# of folds) :  10


In [24]:
# STRATIFIED CROSS VALIDATION

# description :

# ...


# ----------------------------------------------------------------------------------------------------------------------------- #


