In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [42]:
df = pd.read_csv('../../clean_2019.csv')

df.isna().sum()

df.dropna(inplace=True)

X = df.drop(columns=['Unnamed: 0','pitch'])
y = df['pitch']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    random_state=16, stratify=y)

In [44]:
ss = StandardScaler()

In [45]:
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [54]:
pca = PCA().fit(X_train)

In [55]:
pca.explained_variance_ratio_

array([1.56517890e-01, 8.03285315e-02, 7.49323004e-02, 6.65132892e-02,
       5.80756958e-02, 5.26358428e-02, 4.38649796e-02, 4.05876268e-02,
       4.00568612e-02, 3.87609342e-02, 3.74944836e-02, 3.50607877e-02,
       3.41946544e-02, 3.30228556e-02, 3.24764899e-02, 2.85532571e-02,
       2.78150664e-02, 2.77068209e-02, 2.63990455e-02, 1.64768563e-02,
       1.23079482e-02, 1.04103126e-02, 9.50695714e-03, 6.99611167e-03,
       6.40477512e-03, 1.91034006e-03, 9.89286231e-04, 6.96589036e-34])

In [48]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pca', PCA()),
    ('svc', SVC())
])

In [49]:
gamma_range = [0.001, 0.01, 0.1, 1, 10]# np.logspace(-5, 2, 10)
C_range = [ 0.0001, 0.001, 0.01] # np.logspace(-3, 2, 10)
kernel_range = ['linear']

param_grid = {
    'pca__n_components': [3],
    #'svc__gamma': gamma_range,
    'svc__C': C_range,
    'svc__kernel': kernel_range,
    'svc__class_weight': ['balanced']
             }

grid = GridSearchCV(pipe, param_grid, cv = 3,
                    scoring = 'accuracy', verbose = 1, n_jobs = -1)

In [50]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 66.7min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('ss', StandardScaler()), ('pca', PCA()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [3], 'svc__C': [0.01],
                         'svc__class_weight': ['balanced'],
                         'svc__kernel': ['linear']},
             scoring='accuracy', verbose=1)

In [53]:
print(grid.best_params_)
print(grid.best_score_)

{'pca__n_components': 3, 'svc__C': 0.01, 'svc__class_weight': 'balanced', 'svc__kernel': 'linear'}
0.3476427914547844


In [52]:
grid.score(X_test, y_test)

0.34136713068772073