In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
chess = pd.read_csv(r'C:\Users\ebner\Desktop\lamb\chessgames.csv')

In [3]:
chess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              20058 non-null  object 
 1   rated           20058 non-null  bool   
 2   created_at      20058 non-null  float64
 3   last_move_at    20058 non-null  float64
 4   turns           20058 non-null  int64  
 5   victory_status  20058 non-null  object 
 6   winner          20058 non-null  object 
 7   increment_code  20058 non-null  object 
 8   white_id        20058 non-null  object 
 9   white_rating    20058 non-null  int64  
 10  black_id        20058 non-null  object 
 11  black_rating    20058 non-null  int64  
 12  moves           20058 non-null  object 
 13  opening_eco     20058 non-null  object 
 14  opening_name    20058 non-null  object 
 15  opening_ply     20058 non-null  int64  
dtypes: bool(1), float64(2), int64(4), object(9)
memory usage: 2.3+ MB


In [4]:
chess.set_index('id',inplace=True)

In [5]:
chess.nunique()

rated                 2
created_at        13151
last_move_at      13186
turns               211
victory_status        4
winner                3
increment_code      400
white_id           9438
white_rating       1516
black_id           9331
black_rating       1521
moves             18920
opening_eco         365
opening_name       1477
opening_ply          23
dtype: int64

In [6]:
def clean(df):
    df = df[df.rated != 0]
    df.drop('rated',axis=1,inplace=True)
    co = 365
    for col in df.select_dtypes(['object','float']).columns:
        if df[col].nunique() > co:
            df.drop(col,axis=1,inplace=True)
    return df


In [7]:
df = clean(chess)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
df

Unnamed: 0_level_0,turns,victory_status,winner,increment_code,white_rating,black_rating,opening_eco,opening_ply
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
l1NXvwaE,16,resign,black,5+10,1322,1261,B00,4
mIICvQHh,61,mate,white,5+10,1496,1500,C20,3
kWKvrqYL,61,mate,white,20+0,1439,1454,D02,3
9tXo1AUZ,95,mate,white,30+3,1523,1469,C41,5
qwU9rasv,33,resign,white,10+0,1520,1423,D00,10
...,...,...,...,...,...,...,...,...
EfqH7VVH,24,resign,white,10+10,1691,1220,A80,2
WSJDhbPl,82,mate,black,10+0,1233,1196,A41,2
yrAas0Kj,35,mate,white,10+0,1219,1286,D00,3
b0v4tRyF,109,resign,white,10+0,1360,1227,B07,4


In [9]:
target = 'winner'
X = df.drop(columns=target)
y = df[target]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.25, random_state=42)

In [12]:
baseline_acc = y_train.value_counts(normalize=True).max()
print('Baseline =', baseline_acc)

Baseline = 0.4994325802125245


In [13]:
model_rf = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42,n_estimators=200,n_jobs=-1)
)

model_rf.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['victory_status', 'increment_code',
                                      'opening_eco'],
                                mapping=[{'col': 'victory_status',
                                          'data_type': dtype('O'),
                                          'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                         {'col': 'increment_code',
                                          'data_type': dtype('O'),
                                          'mapping': 10+0       1
15+0       2
20+20      3
8+0        4
13+15      5
        ... 
0+13     291
11+1     292
16+20    293
19+17    294
NaN       -2
Length: 295, dtype: int64},
                                         {'col': 'opening_eco',
                                          'data_type': dtype('O'),
                                          'mapping': C45      1
D06    

In [14]:
model_lr = make_pipeline(
    OrdinalEncoder(),
    LogisticRegression(random_state=42,n_jobs=-1)
)

model_lr.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['victory_status', 'increment_code',
                                      'opening_eco'],
                                mapping=[{'col': 'victory_status',
                                          'data_type': dtype('O'),
                                          'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                         {'col': 'increment_code',
                                          'data_type': dtype('O'),
                                          'mapping': 10+0       1
15+0       2
20+20      3
8+0        4
13+15      5
        ... 
0+13     291
11+1     292
16+20    293
19+17    294
NaN       -2
Length: 295, dtype: int64},
                                         {'col': 'opening_eco',
                                          'data_type': dtype('O'),
                                          'mapping': C45      1
D06    

In [15]:
model_xgb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(eta=.3,random_state=42,n_estimators=200,n_jobs=-1,verbosity=1)
)

model_xgb.fit(X_train, y_train);





In [16]:
print('Forest Training Accuracy:', model_rf.score(X_train, y_train))
print('Forest Validation Accuracy:', model_rf.score(X_val, y_val))

Forest Training Accuracy: 1.0
Forest Validation Accuracy: 0.695140823274528


In [17]:
print('XGBoost Training Accuracy:', model_xgb.score(X_train, y_train))
print('XGBoost Validation Accuracy:', model_xgb.score(X_val, y_val))

XGBoost Training Accuracy: 0.9800887238213144
XGBoost Validation Accuracy: 0.8836273599504797


In [18]:
print('Regression Training Accuracy:', model_lr.score(X_train, y_train))
print('Regression Validation Accuracy:', model_lr.score(X_val, y_val))

Regression Training Accuracy: 0.6154957185597855
Regression Validation Accuracy: 0.6326214794181368


In [19]:
model_xgb.get_params()

{'memory': None,
 'steps': [('ordinalencoder',
   OrdinalEncoder(cols=['victory_status', 'increment_code', 'opening_eco'],
                  mapping=[{'col': 'victory_status', 'data_type': dtype('O'),
                            'mapping': outoftime    1
   mate         2
   resign       3
   draw         4
   NaN         -2
   dtype: int64},
                           {'col': 'increment_code', 'data_type': dtype('O'),
                            'mapping': 10+0       1
   15+0       2
   20+20      3
   8+0        4
   13+15      5
           ... 
   0+13     291
   11+1     292
   16+20    293
   19+17    294
   NaN       -2
   Length: 295, dtype: int64},
                           {'col': 'opening_eco', 'data_type': dtype('O'),
                            'mapping': C45      1
   D06      2
   C21      3
   A43      4
   C50      5
         ... 
   B73    321
   E88    322
   C12    323
   A82    324
   NaN     -2
   Length: 325, dtype: int64}])),
  ('simpleimputer', SimpleImputer()

In [20]:
params = {'xgbclassifier__learning_rate' : np.arange(0.3, 0.6, 0.1),
          'xgbclassifier__max_depth' : range(1,7,1),
          'xgbclassifier__n_estimators' : range(100,601,100)
    }

model_xgbcv = RandomizedSearchCV(
    model_xgb, 
    param_distributions=params,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=1
)

model_xgbcv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  elif pd.api.types.is_categorical(cols):




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['victory_status',
                                                                   'increment_code',
                                                                   'opening_eco'],
                                                             mapping=[{'col': 'victory_status',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                                                      {'col': 'increment_code',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': 10+0       1
15+0   

In [21]:
best_score = model_xgbcv.best_score_
best_params = model_xgbcv.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

Best score for `model`: 0.876199319096255
Best params for `model`: {'xgbclassifier__n_estimators': 300, 'xgbclassifier__max_depth': 4, 'xgbclassifier__learning_rate': 0.3}


In [22]:
model_xgb1 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(eta=.3,random_state=42,n_estimators=400,n_jobs=-1,max_depth=3,verbosity=1)
)

model_xgb1.fit(X_train, y_train)





Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['victory_status', 'increment_code',
                                      'opening_eco'],
                                mapping=[{'col': 'victory_status',
                                          'data_type': dtype('O'),
                                          'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                         {'col': 'increment_code',
                                          'data_type': dtype('O'),
                                          'mapping': 10+0       1
15+0       2
20+20      3
8+0        4
13+15      5
        ... 
0+13     291
11+1     292
16+20    293
19+17    294
NaN       -2
Length: 295, dty...
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                    

In [23]:
print('XGBoost Training Accuracy:', model_xgb.score(X_train, y_train))
print('XGBoost Validation Accuracy:', model_xgb.score(X_val, y_val))

XGBoost Training Accuracy: 0.9800887238213144
XGBoost Validation Accuracy: 0.8836273599504797


In [24]:
print('XGBoost CV Training Accuracy:', model_xgb1.score(X_train, y_train))
print('XGBoost CV Validation Accuracy:', model_xgb1.score(X_val, y_val))

XGBoost CV Training Accuracy: 0.9329412978438049
XGBoost CV Validation Accuracy: 0.8857938718662952


In [25]:
model_lr.get_params()

{'memory': None,
 'steps': [('ordinalencoder',
   OrdinalEncoder(cols=['victory_status', 'increment_code', 'opening_eco'],
                  mapping=[{'col': 'victory_status', 'data_type': dtype('O'),
                            'mapping': outoftime    1
   mate         2
   resign       3
   draw         4
   NaN         -2
   dtype: int64},
                           {'col': 'increment_code', 'data_type': dtype('O'),
                            'mapping': 10+0       1
   15+0       2
   20+20      3
   8+0        4
   13+15      5
           ... 
   0+13     291
   11+1     292
   16+20    293
   19+17    294
   NaN       -2
   Length: 295, dtype: int64},
                           {'col': 'opening_eco', 'data_type': dtype('O'),
                            'mapping': C45      1
   D06      2
   C21      3
   A43      4
   C50      5
         ... 
   B73    321
   E88    322
   C12    323
   A82    324
   NaN     -2
   Length: 325, dtype: int64}])),
  ('logisticregression', LogisticRe

In [26]:
params = {'logisticregression__max_iter' : range(50,251,50),
          'logisticregression__solver' : ['liblinear','sag','saga','lbfgs'],
          'logisticregression__C' : np.arange(1.0,5.1,1.0),
    }

model_lrcv = RandomizedSearchCV(
    model_lr, 
    param_distributions=params,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=1
)

model_lrcv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  elif pd.api.types.is_categorical(cols):


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['victory_status',
                                                                   'increment_code',
                                                                   'opening_eco'],
                                                             mapping=[{'col': 'victory_status',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                                                      {'col': 'increment_code',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': 10+0       1
15+0   

In [27]:
best_score = model_lrcv.best_score_
best_params = model_lrcv.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

Best score for `model`: 0.6538739296399463
Best params for `model`: {'logisticregression__solver': 'liblinear', 'logisticregression__max_iter': 100, 'logisticregression__C': 2.0}


In [34]:
model_lr1 = make_pipeline(
    OrdinalEncoder(),
    LogisticRegression(random_state=42,n_jobs=-1,max_iter=200,solver='liblinear',C=5.0)
)

model_lr1.fit(X_train, y_train)



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['victory_status', 'increment_code',
                                      'opening_eco'],
                                mapping=[{'col': 'victory_status',
                                          'data_type': dtype('O'),
                                          'mapping': outoftime    1
mate         2
resign       3
draw         4
NaN         -2
dtype: int64},
                                         {'col': 'increment_code',
                                          'data_type': dtype('O'),
                                          'mapping': 10+0       1
15+0       2
20+20      3
8+0        4
13+15      5
        ... 
0+13     291
11+1     292
16+20    293
19+17    294
NaN       -2
Length: 295, dtype: int64},
                                         {'col': 'opening_eco',
                                          'data_type': dtype('O'),
                                          'mapping': C45      1
D06    

In [29]:
print('Regression Training Accuracy:', model_lr.score(X_train, y_train))
print('Regression Validation Accuracy:', model_lr.score(X_val, y_val))

Regression Training Accuracy: 0.6154957185597855
Regression Validation Accuracy: 0.6326214794181368


In [35]:
print('Regression CV Training Accuracy:', model_lr1.score(X_train, y_train))
print('Regression CV Validation Accuracy:', model_lr1.score(X_val, y_val))

Regression CV Training Accuracy: 0.6559372743216755
Regression CV Validation Accuracy: 0.6740947075208914


In [31]:
print('XGBoost Cross-Validated Test Accuracy = ',model_xgb1.score(X_test,y_test))
print('LinearRegression Cross-Validated Test Accuracy = ',model_lr1.score(X_test,y_test))

XGBoost Cross-Validated Test Accuracy =  0.8907458991024451
LinearRegression Cross-Validated Test Accuracy =  0.6682141751779634
