In [134]:
import pandas as pd
import glob
import numpy as np
import altair as alt
import json
import math
import matplotlib as mpp
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)
filename = "processed/quantleague.csv"
quantleague = pd.read_csv(filename, index_col=None)
defaultxcols = ['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
                'totalgoldred', 'earlygoldred', 'midgoldred',
                'earlygolddiff', 'midgolddiff', 'bKills',
                'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
                'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds', 'Year']
defaultycols = ['bluewin']

In [135]:

def createDataset(xcols, ycols):
    x = quantleague[xcols]
    # #leaving out late gold columns
    y = quantleague[ycols].to_numpy()
    x = StandardScaler().fit_transform(x)
    return train_test_split(x, y, train_size=.8, random_state=42)

In [136]:
x_train, x_test, y_train, y_test  = createDataset(defaultxcols, defaultycols)

In [137]:
def visualizePCA(x_train, y_train, y_name):
    pca = PCA(n_components=2)
    principalcomponents = pca.fit_transform(x_train)
    pcs = pd.DataFrame(data=principalcomponents, columns=['PC1', 'PC2'])
    pcs[y_name] = y_train
    vis = alt.Chart(pcs.sample(5000)).mark_point(size=1).encode(x='PC1', y='PC2', color=y_name)
    return pca, vis


In [138]:
pca, vis = visualizePCA(x_train, y_train, 'Blue win')
vis

In [139]:
sum(pca.explained_variance_ratio_)
# This is great news! Around 63% of the variance of our data can be explained with just two principal components. Let's look at how many components we would need to capture
# 95% of the variance:

0.6300642868346678

In [140]:


def visualizePCAComponents(x_train):
    numcomponents = x_train.shape[1]
    pca = PCA(numcomponents).fit(x_train)

    pca_variance = pd.DataFrame({
        'x': list(range(1,numcomponents+1)),
        'y': np.cumsum(pca.explained_variance_ratio_)
    })
    return alt.Chart(pca_variance).mark_line().encode(
        x = alt.X('x', title='Number of components'),
        y = alt.Y('y', title='Cumulative explained variance', scale=alt.Scale(domain=(0.3,1.02)))
    ).interactive()

#To explain >=95% of variance, we need to use 11 components. We'll do this to speed up logistic regression training later
 #First, we try creating a model to predict probability of winning based on early-mid game state, meaning we look at gold differences and neutral objectives taken.
  #Let's try doing a PCA to find which variables are most indicative of winning:


In [141]:
explainedVariance = visualizePCAComponents(x_train)
explainedVariance

In [144]:
def LogReg(x_train, y_train):
    return LogisticRegression().fit(x_train, y_train)

def evaluateModel(model, x_test, y_test):    
    return logisticRegr.score(x_test, y_test)

In [110]:
def confusion(model, x_test, y_test):
    predictions = model.predict()
    metrics.confusion_matrix(y_test, predictions, normalize='true')

def confusion_graph(confusion_matrix):
    cmlen = confusion_matrix.shape[0]
    x,y = np.meshgrid(range(0,cmlen),range(0,cmlen))
    source = pd.DataFrame({
        'x':x.ravel(),
        'y':y.ravel(),
        'z':confusion_matrix.ravel()
    })
    alt.Chart(source).mark_rect().encode(
        x=alt.X('x:O', title="Predicted"),
        y=alt.Y('y:O', title="Target"),
        color='z:Q'
    ).properties(width=200, height=200)

array([[0.97829233, 0.02170767],
       [0.01440576, 0.98559424]])