In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn import linear_model # for linear algebra 
from sklearn.model_selection import train_test_split # for splitting and testing 

from sklearn.preprocessing import LabelEncoder # for label encoding
from sklearn.metrics import mean_absolute_error as mae, median_absolute_error as mee, classification_report as cr, accuracy_score as ac # for mean absolute error, mediam absolute error, classification reports and accuracy


In [3]:
# import dataset

# URL of the data
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data'

# Read the data from the URL and store it in a Pandas dataframe
df = pd.read_csv(url)
df.head()

Unnamed: 0,f,f.1,f.2,f.3,f.4,f.5,f.6,f.7,f.8,f.9,...,f.23,f.24,f.25,f.26,f.27,f.28,t.2,t.3,n.1,won
0,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,t,f,f,f,t,t,n,won


In [4]:
# Get the stats of the data set
df.describe()

Unnamed: 0,f,f.1,f.2,f.3,f.4,f.5,f.6,f.7,f.8,f.9,...,f.23,f.24,f.25,f.26,f.27,f.28,t.2,t.3,n.1,won
count,3195,3195,3195,3195,3195,3195,3195,3195,3195,3195,...,3195,3195,3195,3195,3195,3195,3195,3195,3195,3195
unique,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
top,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
freq,2838,2970,3075,2873,2128,1721,2025,2499,1979,2224,...,3194,3148,3059,2630,3020,1983,2006,2344,2406,1668


In [5]:
# here we use label encoding to better differentiate between wins and losses by encoding the outcome values to 0s and 1s 
column=['f', 'f.1', 'f.2', 'f.3', 'f.4', 'f.5', 'f.6', 'f.7', 'f.8', 'f.9',
       'f.10', 'f.11', 'l', 'f.12', 'n', 'f.13', 'f.14', 't', 'f.15', 'f.16',
       'f.17', 'f.18', 'f.19', 'f.20', 'f.21', 't.1', 'f.22', 'f.23', 'f.24',
       'f.25', 'f.26', 'f.27', 'f.28', 't.2', 't.3', 'n.1', 'won']
for i in column:
    df[i]=LabelEncoder().fit_transform(df[i])

In [6]:
df.columns

Index(['f', 'f.1', 'f.2', 'f.3', 'f.4', 'f.5', 'f.6', 'f.7', 'f.8', 'f.9',
       'f.10', 'f.11', 'l', 'f.12', 'n', 'f.13', 'f.14', 't', 'f.15', 'f.16',
       'f.17', 'f.18', 'f.19', 'f.20', 'f.21', 't.1', 'f.22', 'f.23', 'f.24',
       'f.25', 'f.26', 'f.27', 'f.28', 't.2', 't.3', 'n.1', 'won'],
      dtype='object')

In [8]:
# we have to make columns to train the model with 
# this is a dataframe of all the columns
xArray=df[['f', 'f.1', 'f.2', 'f.3', 'f.4', 'f.5', 'f.6', 'f.7', 'f.8', 'f.9',
       'f.10', 'f.11', 'l', 'f.12', 'n', 'f.13', 'f.14', 't', 'f.15', 'f.16',
       'f.17', 'f.18', 'f.19', 'f.20', 'f.21', 't.1', 'f.22', 'f.23', 'f.24',
       'f.25', 'f.26', 'f.27', 'f.28', 't.2', 't.3', 'n.1']]

# and create a dataframe with only "wins" outcomes by singling out the "won" column 
yArray=df[['won']]

# Train and test the set
x_train,x_test,y_train,y_test=train_test_split(xArray,yArray,test_size=0.25,random_state=40)

In [9]:
# additional imports for using decision trees
from sklearn.tree import DecisionTreeClassifier # for decision trees

# Use a decision tree to classify the win or no win outcomes, these are used in non-linear datasets and this win no win dataset is nonlinear
# as it does not depend on a specific set of moves to result in a win function 
model=DecisionTreeClassifier()
model.fit(x_train,y_train)

DecisionTreeClassifier()

In [10]:
# Now predict the number of wins (y is the dataframe with the win outcomes)
yPredict = model.predict(x_test)
yPredict

array([0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,

In [12]:
# now confirm the accuracy of these tests using cross validation 
from sklearn.model_selection import cross_val_score # for cross validation


# cross validation is a technique for evaluating ML models by training several Machine Learning models on SUBSETS of the AVAILABLE input data
# and evaluating them on the COMPLEMENTARY subset of the data. 
print(ac(y_test,yPredict)*100) 

# get the score of the cross validation of array X, array Y using 5 for cross validation splitting strategy
score=(cross_val_score(model,xArray,yArray,cv=5))

# get the mean of the score 
print(np.mean(score)*100)

99.24906132665832
97.71517996870112


In [1]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
