In [25]:
# Initial imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [26]:
# Loading data
file_path = Path("data/myopia.csv")
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [27]:
# Check dataset balance
df["MYOPIC"].value_counts()

0    537
1     81
Name: MYOPIC, dtype: int64

In [28]:
# Check dataset balance
df["MYOPIC"].value_counts(normalize=True)

0    0.868932
1    0.131068
Name: MYOPIC, dtype: float64

In [29]:
# Define X,y
label = df["MYOPIC"]
X = df.iloc[:,:-1].copy()
X.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


## Pipeline with SVC and Cross-Validation

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, label, random_state=42)

In [31]:
# create process steps
pipes = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("svc", SVC())
]

In [32]:
# create pipeline
pipeline = Pipeline(pipes)

In [33]:
# set gridsearch parameters
params = {
    "svc__C": [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 1000],
    "svc__gamma": [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100],
    "pca__n_components":[10]
}

In [34]:
# gridsearch setup
grid = GridSearchCV(pipeline, params)

In [35]:
# Train the scaler with the X_train data.
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()), ('svc', SVC())]),
             param_grid={'pca__n_components': [10],
                         'svc__C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100,
                                    1000],
                         'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10,
                                        100]})

In [36]:
# evaluate model
grid.score(X_train, y_train)

0.9028077753779697

In [37]:
y_train_pred = grid.predict(X_train)

In [38]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       400
           1       0.82      0.37      0.51        63

    accuracy                           0.90       463
   macro avg       0.86      0.68      0.73       463
weighted avg       0.90      0.90      0.89       463



In [39]:
# gridsearch results
grid.best_params_

{'pca__n_components': 10, 'svc__C': 1000, 'svc__gamma': 0.001}

In [40]:
# make predictions
y_pred = grid.predict(X_test)
accuracy_score(y_test, y_pred)

0.8903225806451613

In [41]:
confusion_matrix(y_test, y_pred)

array([[134,   3],
       [ 14,   4]], dtype=int64)

In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       137
           1       0.57      0.22      0.32        18

    accuracy                           0.89       155
   macro avg       0.74      0.60      0.63       155
weighted avg       0.87      0.89      0.87       155



## SVC with Balanced Data

In [82]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [83]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

In [84]:
print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 400, 1: 63})
Resample dataset shape Counter({0: 400, 1: 400})


In [85]:
# create pipeline
pipeline_bal = Pipeline(pipes)

In [149]:
# set gridsearch parameters
params_bal = {
    "svc__C": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.18],
    "svc__gamma": [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100],
    "pca__n_components":[10]
}

In [150]:
# gridsearch setup
grid_bal = GridSearchCV(pipeline_bal, params_bal)

In [151]:
# Train the scaler with the balanced training data
grid_bal.fit(x_ros, y_ros)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()), ('svc', SVC())]),
             param_grid={'pca__n_components': [10],
                         'svc__C': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.18],
                         'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10,
                                        100]})

In [152]:
# make predictions
y_pred = grid_bal.predict(X_train)
accuracy_score(y_train, y_pred)

0.8336933045356372

In [153]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.82      0.90       400
           1       0.45      0.90      0.60        63

    accuracy                           0.83       463
   macro avg       0.71      0.86      0.75       463
weighted avg       0.91      0.83      0.85       463



In [154]:
# gridsearch results
grid_bal.best_params_

{'pca__n_components': 10, 'svc__C': 0.18, 'svc__gamma': 0.1}

In [155]:
# make predictions
y_pred = grid_bal.predict(X_test)
accuracy_score(y_test, y_pred)

0.7806451612903226

In [156]:
confusion_matrix(y_test, y_pred)

array([[112,  25],
       [  9,   9]], dtype=int64)

In [157]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.82      0.87       137
           1       0.26      0.50      0.35        18

    accuracy                           0.78       155
   macro avg       0.60      0.66      0.61       155
weighted avg       0.85      0.78      0.81       155



## Analysis


I think SVC with the balanced data training set is over-fitting.  It's predicing perfectly in training 
but not predicting the minority outcome.  

Need to forloop changing C until f1 is optimized.