In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from HW3_helper import *
RANDOM_STATE = 12345 #Do not change it!
np.random.seed(RANDOM_STATE) #Do not change it!

In [4]:
diabetes = pd.read_csv("datasets/diabetes.csv")
X = diabetes.drop(["Outcome"], axis=1)
y = diabetes["Outcome"]

In [5]:
def imputation(df, columns):
    from sklearn.impute import SimpleImputer
    columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
    df_parts = df.copy()[columns]

    df_parts[df_parts==0] = np.nan
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

    df_converted = pd.DataFrame(imp.fit_transform(df_parts), columns=columns)
    diabetes[columns] = df_converted
    diabetes.describe()
    return df_converted

In [6]:
diabetes = imputation(diabetes, ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"])

# Classification

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [8]:
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
skf = StratifiedKFold(n_splits=5)

In [9]:
param_grid = [
    {'max_depth':[1,2,3,4,5,6,7,8,9, 10]},
    {'min_samples_split':[2,3,4,5,6,7,8,9,10]},
    ]

In [10]:
search = GridSearchCV(rfc, param_grid=param_grid, cv=5)
search.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [11]:
rfc_cross_val_score = np.mean(cross_val_score(rfc, X, y, cv=skf))
rfc_cross_val_score

0.7669977081741788

In [12]:
rf_best_classifier = search.best_estimator_
rf_best_classifier

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)

## SVM with diverse kernels

In [13]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(X)
scaled_data = pd.DataFrame(scaled_data, columns=X.columns)

scaled_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [14]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, test_size=0.3, random_state=RANDOM_STATE)

In [15]:
grid_param = [
    {'kernel' : ['linear', 'poly', 'rbf']},
    {'C' : [1, 10, 100]},
]

In [16]:
svc = SVC()

In [17]:
search = GridSearchCV(svc, param_grid=grid_param, cv=10)
search.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'kernel': ['linear', 'poly', 'rbf']},
                         {'C': [1, 10, 100]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
svm_best_classifier = search.best_estimator_
svm_best_classifier

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
svm_ho_score = search.best_score_
svm_ho_score

0.7669685577580314

# Evaluation

In [20]:
svc = SVC()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)
scaled_data = pd.DataFrame(scaled_data, columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, test_size=0.3, random_state=RANDOM_STATE, shuffle=False)

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [21]:
recall_score_svc = recall_score(y_test, y_pred)
precision_score_svc = precision_score(y_test, y_pred)
f1_score_svc = f1_score(y_test, y_pred)

print('Recall:', recall_score_svc, '\nPrecision:', precision_score_svc, '\nF1:', f1_score_svc)

Recall: 0.569620253164557 
Precision: 0.8035714285714286 
F1: 0.6666666666666666


In [None]:
rfc = RandomForestClassifier()