In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, plot_roc_curve, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the pickled games table and drop three columns in a single chain.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file here.
magnus_df = (
    pd.read_pickle("./magnus.pkl")
    .drop(columns=['opponent', 'opening_moves', 'opening_variant'])
)


In [3]:
# Rename the generic player_* columns so they explicitly refer to Magnus.
magnus_df = magnus_df.rename(
    columns={
        'player_color': 'magnus_color',
        'player_rating': 'magnus_rating',
    }
)

In [4]:
# Openings that occur in fewer than 10 rows; `.index` of the filtered counts
# holds the opening names themselves.
small_opening_list = (
    magnus_df['opening']
    .value_counts()
    .loc[lambda counts: counts < 10]
    .index
)

# Cell output: boolean mask flagging the rows whose opening is in that list.
magnus_df['opening'].isin(small_opening_list)

0       False
1       False
2       False
3       False
4       False
        ...  
1495    False
1496    False
1497    False
1498    False
1499    False
Name: opening, Length: 1500, dtype: bool

In [7]:
# Keep only the rows whose opening is NOT in the rare-openings list.
magnus_df = magnus_df.loc[~magnus_df['opening'].isin(small_opening_list)]

Unnamed: 0,magnus_color,opponent_rating,magnus_rating,length,year,opening,result
0,white,2741.0,2881.0,29,2021,Ruy López Opening,draw
1,black,2761.0,2881.0,60,2021,Ruy López Opening,draw
2,black,2710.0,2847.0,41,2021,Queen's Gambit Declined,win
3,black,2736.0,2847.0,39,2021,Queen's Gambit Declined,win
4,black,2716.0,2847.0,36,2021,Giuoco Piano Game,draw
...,...,...,...,...,...,...,...
1495,white,2798.0,2876.0,60,2015,Semi-Slav Defense,loss
1496,white,2666.0,2862.0,49,2015,Sicilian Defense,draw
1497,white,2731.0,2853.0,43,2015,English Opening,win
1498,white,2771.0,2853.0,66,2015,Sicilian Defense,loss


In [None]:
# Stand-alone one-hot encoder configured for dense output with the first
# category of every feature dropped.
hot_potato = OneHotEncoder(drop='first', sparse=False)


In [None]:
# Features are every column except the target; the target is the game result.
X = magnus_df.drop(columns = 'result')
y = magnus_df['result']

# Hold out 30% of the rows, stratified on the result so the class proportions
# match in both partitions. The fixed random_state makes the split - and every
# model fitted on it - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = .3, random_state = 42
)

In [None]:
# Numeric features: fill missing values with the column mean, then standardise.
numeric_pipeline = Pipeline([
    ('numimputer', SimpleImputer(strategy = 'mean')),
    ('numnorm', StandardScaler())
])

# Ordinal features: fill missing values with the most frequent value, then standardise.
ordinal_pipeline = Pipeline([
    ('ordimputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordnorm', StandardScaler())
])

# Nominal (string) features: impute, one-hot encode, then scale.
# The encoder step is required: without it the string columns reach
# MaxAbsScaler unchanged and fitting fails because they cannot be converted to
# floats.
# NOTE(review): with the default handle_unknown='error', a category that was
# not seen during fit raises at transform/predict time - confirm that is the
# desired behaviour for the held-out data.
nominal_pipeline = Pipeline([
    ('onehotimputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehotenc', OneHotEncoder(sparse = False, drop = 'first')),
    ('onehotnorm', MaxAbsScaler())
])

In [None]:
# Column groups, one per preprocessing pipeline.
numeric_cols = [
    "opponent_rating",
    "magnus_rating",
    "length",
]
ordinal_cols = ["year"]
nominal_cols = [
    "magnus_color",
    "opening",
]

In [None]:
# Route each column group through its own preprocessing pipeline. The
# transformers are listed in the order ordinal, nominal, numeric.
ct = ColumnTransformer(
    transformers=[
        ("ordinal_pipe", ordinal_pipeline, ordinal_cols),
        ("nominal_pipe", nominal_pipeline, nominal_cols),
        ("numeric_pipe", numeric_pipeline, numeric_cols),
    ]
)

In [None]:
# k-nearest-neighbours classifier (default hyper-parameters) fed by the shared
# column transformer.
k_pipe = Pipeline(
    steps=[
        ('preprocessing', ct),
        ('model', KNeighborsClassifier()),
    ]
)

In [None]:
# Fit the preprocessing steps and the KNN model on the training partition.
k_pipe.fit(X=X_train, y=y_train)

In [None]:
# Confusion matrix of the KNN pipeline.
# NOTE(review): this is computed on the training partition, not on
# X_test / y_test - confirm that is intended.
plot_confusion_matrix(estimator=k_pipe, X=X_train, y_true=y_train)

In [None]:
# Decision-tree classifier (default hyper-parameters) behind the same
# preprocessing step.
# NOTE(review): this reuses the very same `ct` object as `k_pipe`; Pipeline
# does not clone its steps, so fitting this pipeline refits that shared
# transformer - confirm this is acceptable.
dtree_pipe = Pipeline(
    steps=[
        ('preprocessing', ct),
        ('model', DecisionTreeClassifier()),
    ]
)

In [None]:
# Fit the preprocessing steps and the decision tree on the training partition.
dtree_pipe.fit(X=X_train, y=y_train)

In [None]:
# Confusion matrix of the decision-tree pipeline.
# NOTE(review): this is computed on the training partition, not on
# X_test / y_test - confirm that is intended.
plot_confusion_matrix(estimator=dtree_pipe, X=X_train, y_true=y_train)