In [208]:
import pandas as pd
import glob
import numpy as np
import altair as alt
import json
import math
import matplotlib as mpp
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)
filename = "processed/quantleague.csv"
quantleague = pd.read_csv(filename, index_col=None)
defaultxcols = ['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
                'totalgoldred', 'earlygoldred', 'midgoldred',
                'earlygolddiff', 'midgolddiff', 'bKills',
                'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
                'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds', 'Year']
defaultycols = ['bluewin']

In [209]:

def createDataset(xcols, ycols):
    x = quantleague[xcols]
    # #leaving out late gold columns
    y = quantleague[ycols].to_numpy()
    x = StandardScaler().fit_transform(x)
    return train_test_split(x, y, train_size=.8, random_state=42)

In [210]:
x_train, x_test, y_train, y_test  = createDataset(defaultxcols, defaultycols)

In [211]:
def visualizePCA(x_train, y_train, y_name):
    pca = PCA(n_components=2)
    principalcomponents = pca.fit_transform(x_train)
    pcs = pd.DataFrame(data=principalcomponents, columns=['PC1', 'PC2'])
    pcs[y_name] = y_train
    alt.Chart(pcs.sample(5000)).mark_point(size=1).encode(x='PC1', y='PC2', color=y_name).display()
    return pca


In [212]:
pca = visualizePCA(x_train, y_train, 'Blue win')


In [213]:
sum(pca.explained_variance_ratio_)
# This is great news! Around 63% of the variance of our data can be explained with just two principal components. Let's look at how many components we would need to capture
# 95% of the variance:

0.6300642868346679

In [214]:


def visualizePCAComponents(x_train):
    numcomponents = x_train.shape[1]
    pca = PCA(numcomponents).fit(x_train)

    pca_variance = pd.DataFrame({
        'x': list(range(1,numcomponents+1)),
        'y': np.cumsum(pca.explained_variance_ratio_)
    })
    alt.Chart(pca_variance).mark_line().encode(
        x = alt.X('x', title='Number of components'),
        y = alt.Y('y', title='Cumulative explained variance', scale=alt.Scale(domain=(0.3,1.02)))
    ).interactive().display()

#To explain >=95% of variance, we need to use 11 components. We can see this means .... ?
 #First, we try creating a model to predict probability of winning based on early-mid game state, meaning we look at gold differences and neutral objectives taken.
  #Let's try doing a PCA to find which variables are most indicative of winning:


In [215]:
 visualizePCAComponents(x_train)

In [216]:
def LogReg(x_train, y_train):
    return LogisticRegression().fit(x_train, y_train)

def evaluateModel(model, x_test, y_test):    
    return model.score(x_test, y_test)

In [217]:
blueWinModel = LogReg(x_train, y_train)
evaluateModel(blueWinModel, x_test, y_test)

0.9822834645669292

In [288]:
def confusion(model, x_test, y_test, labels):
    predictions = model.predict(x_test)
    return metrics.confusion_matrix(y_test, predictions, labels=labels, normalize='true')

def confusion_graph(confusion_matrix, labels):
    cmlen = confusion_matrix.shape[0]
    x,y = np.meshgrid(labels,labels)    
    source = pd.DataFrame({
        'x': x.ravel(),
        'y': y.ravel(),
        'z':confusion_matrix.ravel()
    })
    return alt.Chart(source).mark_rect().encode(
        x=alt.X('x:N', title="Predicted"),
        y=alt.Y('y:N', title="Target"),
        color='z:Q'
    ).properties(width=200, height=200)

In [289]:
confusion_graph(confusion(blueWinModel, x_test, y_test, np.unique(y_test)), np.unique(y_test))

In [290]:
def pipeline(xcols, ycols):
    print('Creating data...')
    x_train, x_test, y_train, y_test  = createDataset(xcols, ycols)
    print('Creating model...')
    model = LogReg(x_train, y_train)
    print('Evaluating model...')
    labels = np.unique(y_test)
    print(evaluateModel(model, x_test, y_test))    
    print('Displaying confusion matrix')    
    confusion_graph(confusion(model, x_test, y_test, labels), labels).display()


In [291]:
pipeline(defaultxcols, defaultycols)

Creating data...
Creating model...
Evaluating model...
0.9822834645669292
Displaying confusion matrix


In [292]:
pipeline(['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
       'lategoldblue', 'totalgoldred', 'earlygoldred', 'midgoldred',
       'lategoldred', 'earlygolddiff', 'midgolddiff', 'lategolddiff', 'bKills',
       'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
       'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds',
       'Year'], ['League'])

Creating data...
Creating model...
Evaluating model...
0.23818897637795275
Displaying confusion matrix


In [293]:
pipeline(['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
       'lategoldblue', 'totalgoldred', 'earlygoldred', 'midgoldred',
       'lategoldred', 'earlygolddiff', 'midgolddiff', 'lategolddiff', 'bKills',
       'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
       'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds',
       'Year'], ['Season'])

Creating data...
Creating model...
Evaluating model...
0.5872703412073491
Displaying confusion matrix


In [294]:
pipeline(['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
       'lategoldblue', 'totalgoldred', 'earlygoldred', 'midgoldred',
       'lategoldred', 'earlygolddiff', 'midgolddiff', 'lategolddiff', 'bKills',
       'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
       'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds',
       ], ['Year'])

Creating data...
Creating model...
Evaluating model...
0.7152230971128609
Displaying confusion matrix


In [295]:
pipeline(['gamelength', 'totalgoldblue', 'earlygoldblue', 'midgoldblue',
       'lategoldblue', 'totalgoldred', 'earlygoldred', 'midgoldred',
       'lategoldred', 'earlygolddiff', 'midgolddiff', 'lategolddiff', 'bKills',
       'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds', 'rKills',
       'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds',
       'Year'], ['Type'])

Creating data...
Creating model...
Evaluating model...
0.7637795275590551
Displaying confusion matrix
