In [16]:
import math
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_json('new_data/data.json')
w = []
b = []
for i in df['moves'].tolist():
    if not len(i) > 2:
        w.append(0)
        b.append(0)
    else:
        w.append(i[0])
        b.append(i[1])

df['w'] = pd.DataFrame(w)
df['b'] = pd.DataFrame(b)
X_raw = df.loc[:,['WhiteElo', 'BlackElo','w','TimeControl']].to_numpy()
y_raw = df.loc[:, 'Result'].to_numpy()
for i in X_raw:
    i[3] = int(i[3].split('+')[0])

In [3]:
def threshold_features(features, elo_threshold, time_threshold):
    """
    Transform age and salary to binary
    :param features: data array of shape (m, n_features) where features[:,0] for age, features[:,1} for salary
    :param age_threshold: used to "binarize" the data, 1 if age > age_threshold and 0 otherwise
    :param salary_threshold: used to "binarize" the data, 1 if salary > salary_threshold and 0 otherwise
    :return: binary features matrix
    """
    binary_features = features * 1  #This row just creates a "hard copy" of the X array so we can manipulate it as needed

    # Workspace 3.1
    #BEGIN 
    w_moves = ['e4', 'd4', 'Nf3', 'c4', 'g3', 'b3']
    draw_moves = ['']
    for i in binary_features:
        # Elo
        if i[0] < i[1]:
            i[0] = 1
            i[1] = 1
        else:
            i[0] = 0
            i[1] = 0
            
        #Move
        if i[2] in w_moves:
            i[2] = 0
        else: 
            i[2] = 1
            
        # Time
        if i[3] > time_threshold:
            i[3] = 0
        else:
            i[3] = 1
        
    #END

    return binary_features

def result_features(features):
    binary_features = features * 1
    
    for i in range(len(binary_features)):
        if binary_features[i] == '1-0':
            binary_features[i] = 0
        elif binary_features[i] == '0-1':
            binary_features[i] = 1
        else:
            binary_features[i] = 2
            
    return binary_features

In [4]:
X = normalize(threshold_features(X_raw, 100, 900))
y = result_features(y_raw).astype('int')

In [5]:
gnb = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y_pred = gnb.fit(X_train, y_train).predict(X_test)   
print(accuracy_score(y_pred, y_test))

0.6268917345750873


In [6]:
accuracy = 0
best_l_elo = 0
best_h_elo = 0
best_time = 0
for k in range(0, 1500, 50):
    X = normalize(threshold_features(X_raw, 50, k))
    gnb = MultinomialNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    new_acc = accuracy_score(y_pred, y_test)
    if new_acc > accuracy:
        accuracy = new_acc
        best_time = k
                
print(best_time, accuracy)

900 0.6268917345750873


In [7]:
# Using a binary Naive Bayes learning model resulted in decent results but could be impoved greatly

In [8]:
def convert_to_numbers(data):
    
    new_data = data * 1
    
    for i in new_data:
        if isinstance(i[2], str):
            s = 0
            for x in i[2]:
                s += ord(x) - 49

            i[2] = s
            
    for i in new_data:
        if isinstance(i[2], str):
            print(string)
        
    return new_data

In [9]:
X = convert_to_numbers(X_raw)
y = result_features(y_raw).astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=0)

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
pred = clf.predict(X_test)
print(accuracy_score(pred, y_test))

0.6680900621118012


In [10]:
# also running into issues with low accuracy, thinking it might be because of data set size, only using 8500 games
print(len(X))

8586


In [26]:
clf = MLPClassifier()
X = normalize(convert_to_numbers(X_raw))
y = result_features(y_raw).astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)    
print(accuracy_score(pred, y_test))

0.6522245515956208


In [35]:
#trying Decision Tree Classifier as they are better with threshold data sets
best_time = 0
accuracy = 0
for k in range(0, 5400, 50):
    clf = DecisionTreeClassifier()
    X = normalize(threshold_features(X_raw, 100, 5340))
    y = result_features(y_raw).astype('int')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)    
    accuracy = accuracy_score(pred, y_test)
    if new_acc > accuracy:
        accuracy = new_acc
        best_time = k
        
print(best_time, accuracy)

0 0.6380153738644304


In [34]:
clf = DecisionTreeClassifier()
X = normalize(threshold_features(X_raw, 100, 5350))
y = result_features(y_raw).astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)    
accuracy = accuracy_score(pred, y_test)
print(accuracy)

0.6380153738644304
