## Phase 1 and Phase 2 Classification metrics for George Bush

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [2]:
X = pd.read_csv("X.csv", sep=' ', header=None, dtype=float)
X = X.values

In [3]:
y = pd.read_csv("y_bush_vs_others.csv", header=None)
y_bush = y.values.ravel()
y = pd.read_csv("y_williams_vs_others.csv", header=None)
y_williams = y.values.ravel()

In [4]:
y_bush.shape
y_williams.shape

(13233,)

## Adding Prinicipal component analysis to reduce the dimensionality of the data

In [23]:
pca = PCA(n_components=1550)
pca.fit(X)
X1 = pca.transform(X)

In [24]:
X1.shape

(13233, 1550)

In [7]:
#Positive instances of bush in the array 
#For binary classification the others are 0 and data points of bush will have a value of 1
#hence the sum of the array is the number of instances
np.sum(y_bush)

530

In [8]:
#positive instances for serena williams
np.sum(y_williams)

52

### Calculating precision,recall and F1 for KNN classifier

In [21]:
print("KNN for williams, pca = 43")
for i in (1,3,5):
    knn = KNeighborsClassifier(n_neighbors=i)
    print("n_neighbours = ",i)
    st_cv_results = cross_validate(knn,X1,y_williams,cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=3243),scoring=('precision','recall','f1'),return_train_score=False,n_jobs=-1)
    print(st_cv_results)

KNN for williams, pca = 43
n_neighbours =  1
{'fit_time': array([0.02793121, 0.02792668, 0.03391027]), 'score_time': array([6.51858401, 6.52455378, 6.49765015]), 'test_precision': array([0.6       , 0.4       , 0.44444444]), 'test_recall': array([0.16666667, 0.23529412, 0.23529412]), 'test_f1': array([0.26086957, 0.2962963 , 0.30769231])}
n_neighbours =  3
{'fit_time': array([0.02693892, 0.02793217, 0.04288626]), 'score_time': array([6.9912653 , 7.16685128, 7.1000464 ]), 'test_precision': array([0., 0., 1.]), 'test_recall': array([0.        , 0.        , 0.11764706]), 'test_f1': array([0.        , 0.        , 0.21052632])}
n_neighbours =  5
{'fit_time': array([0.02692938, 0.02693295, 0.02789092]), 'score_time': array([7.22365451, 7.30642533, 7.35157228]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}


In [88]:
st_cv_results

{'fit_time': array([0.03091717, 0.03487277, 0.03491974]),
 'score_time': array([8.60801506, 8.54220819, 8.49726486]),
 'test_precision': array([0.        , 0.22727273, 0.35294118]),
 'test_recall': array([0.        , 0.02824859, 0.03409091]),
 'test_f1': array([0.        , 0.05025126, 0.06217617])}

### Calculating precision,recall and F1 for SVC classifier

In [14]:
#SVM 
svc = SVC()


In [25]:
print("Bush rbf pca = 1500\n")
for i in range(10):
    svc = SVC(C = 10**(i-5),kernel='poly')
    print("10^ ",i-5)
    svc_st_cv_results = cross_validate(svc,X1,y_bush,cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=3243),scoring=('precision','recall','f1'),return_train_score=False,n_jobs=-1)
    print(svc_st_cv_results)

Bush rbf pca = 1500

10^  -5
{'fit_time': array([11.34465408, 11.32175469, 11.3746233 ]), 'score_time': array([15.88549042, 15.88055468, 15.90148926]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}
10^  -4
{'fit_time': array([19.3592453 , 19.52077055, 19.80801821]), 'score_time': array([24.09256864, 24.04570436, 23.7694416 ]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}
10^  -3
{'fit_time': array([15.12555408, 15.14450502, 15.16744184]), 'score_time': array([20.4702642 , 20.45430589, 20.36554313]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}
10^  -2
{'fit_time': array([14.13919711, 14.18008327, 14.232939  ]), 'score_time': array([21.47357702, 21.41473722, 21.42171836]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}
10^  -1
{'fit_time': array(

In [12]:
svc_st_cv_results

{'fit_time': array([12.34393287, 11.0092845 , 11.05972958]),
 'score_time': array([13.05108976, 11.74315977, 12.72191119]),
 'test_precision': array([0.66666667, 0.875     , 0.75      ]),
 'test_recall': array([0.44444444, 0.41176471, 0.35294118]),
 'test_f1': array([0.53333333, 0.56      , 0.48      ])}