In [1]:
# Initial imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Loading data
file_path = Path("data/myopia.csv")
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
# Check dataset balance
df["MYOPIC"].value_counts()

0    537
1     81
Name: MYOPIC, dtype: int64

In [4]:
# Check dataset balance
df["MYOPIC"].value_counts(normalize=True)

0    0.868932
1    0.131068
Name: MYOPIC, dtype: float64

In [5]:
# Define X,y
label = df["MYOPIC"]
X = df.iloc[:,:-1].copy()
X.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


## Pipeline with SVC and Cross-Validation

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, label, random_state=42)

In [7]:
# create process steps
pipes = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("svc", SVC())
]

In [8]:
# create pipeline
pipeline = Pipeline(pipes)

In [12]:
# set gridsearch parameters
params = {
    "svc__C": [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 1000],
    "svc__gamma": [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100],
    "pca__n_components":[10]
}

In [13]:
# gridsearch setup
grid = GridSearchCV(pipeline, params)

In [14]:
# Train the scaler with the X_train data.
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()), ('svc', SVC())]),
             param_grid={'pca__n_components': [10],
                         'svc__C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100,
                                    1000],
                         'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10,
                                        100]})

In [15]:
# evaluate model
grid.score(X_train, y_train)

0.9028077753779697

In [16]:
# gridsearch results
grid.best_params_

{'pca__n_components': 10, 'svc__C': 1000, 'svc__gamma': 0.001}

In [17]:
# make predictions
y_pred = grid.predict(X_test)
accuracy_score(y_test, y_pred)

0.8903225806451613

In [18]:
confusion_matrix(y_test, y_pred)

array([[134,   3],
       [ 14,   4]], dtype=int64)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       137
           1       0.57      0.22      0.32        18

    accuracy                           0.89       155
   macro avg       0.74      0.60      0.63       155
weighted avg       0.87      0.89      0.87       155



## SVN with Balanced Data

In [20]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [21]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

In [22]:
print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 400, 1: 63})
Resample dataset shape Counter({0: 400, 1: 400})


In [23]:
# Train the scaler with the balanced training data
grid.fit(x_ros, y_ros)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()), ('svc', SVC())]),
             param_grid={'pca__n_components': [10],
                         'svc__C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100,
                                    1000],
                         'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10,
                                        100]})

In [24]:
# make predictions
y_pred = grid.predict(X_test)
accuracy_score(y_test, y_pred)

0.8838709677419355

In [25]:
confusion_matrix(y_test, y_pred)

array([[137,   0],
       [ 18,   0]], dtype=int64)

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       137
           1       0.00      0.00      0.00        18

    accuracy                           0.88       155
   macro avg       0.44      0.50      0.47       155
weighted avg       0.78      0.88      0.83       155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Analysis
