In [3]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,precision_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from wrangle import train_validate_test_split,impute_nulls, split_X_y
from explore import explore_univariate, get_lol_heatmap, explore_multivariate, plot_swarm_grid_with_color, plot_violin_grid_with_color
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, explained_variance_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm, naive_bayes
import requests
from env import api_key
import time
import pandas as pd
from bs4 import BeautifulSoup


# Lift the row/column display limits, then immediately restore pandas' defaults.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# reset_option accepts ONE option pattern per call; a second positional argument
# is not treated as another option name, so each option is reset on its own.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.reset_option(option_name)


In [None]:
# Load the games data into `df`; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_json('games_0_597.json')

In [None]:
### explore functions
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats

def plot_variable_pairs(train, cols, hue=None):
    '''
    Display a pairplot of the given columns with a red regression line drawn
    on every scatter panel.

    Parameters
    ----------
    train : DataFrame containing the columns to plot.
    cols : list of column names to plot against each other.
    hue : optional column name used to color the observations. It does not
        have to be listed in `cols`; it is added to the plotted data if missing.
    '''
    plot_kws = {'line_kws': {'color': 'red'}, 'scatter_kws': {'alpha': 0.1}}
    # seaborn looks `hue` up in the frame it is given, so the hue column must
    # be part of the selection even when the caller left it out of `cols`.
    data_cols = list(cols)
    if hue is not None and hue not in data_cols:
        data_cols.append(hue)
    sns.pairplot(train[data_cols], hue=hue, kind="reg", plot_kws=plot_kws)
    plt.show()

    
def plot_pairplot(train, cols, hue=None):
    '''
    Display a corner pairplot (scatter plots below the diagonal, distributions
    on the diagonal) for the given columns.

    Parameters
    ----------
    train : DataFrame containing the columns to plot.
    cols : list of column names to plot against each other.
    hue : optional column name used to color the observations. With the
        default None the plot is uncolored.
    '''
    # Include the hue column in the selection so seaborn can resolve it even
    # when the caller did not list it in `cols`.
    data_cols = list(cols)
    if hue is not None and hue not in data_cols:
        data_cols.append(hue)
    sns.pairplot(train[data_cols], hue=hue, corner=True)
    plt.show()
    
    
    
def correlation_exploration(train, x_string, y_string):
    '''
    Show a scatter plot of two columns and print their Pearson correlation
    coefficient (r) together with the p-value of the test.

    Parameters
    ----------
    train : DataFrame containing both columns.
    x_string : name of the column plotted on the x-axis.
    y_string : name of the column plotted on the y-axis.
    '''
    r, p = stats.pearsonr(train[x_string], train[y_string])
    train.plot.scatter(x_string, y_string)
    plt.title(f"{x_string}'s Relationship with {y_string}")
    # p is a probability in [0, 1]; scale it by 100 before printing it with a % sign.
    print(f'The p-value is: {p}. There is {round(p * 100, 3)}% chance that we see these results by chance.')
    print(f'r = {round(r, 2)}')
    plt.show()
    
def explore_univariate(df, variable):
    '''
    Draw a box plot and a histogram (step style, with KDE) of one column of
    df side by side, under a figure title naming the column.
    '''
    # Create the figure, then apply the seaborn font scale and a grid-free style.
    plt.figure(figsize=(30, 10))
    sns.set(font_scale=2)
    sns.set_style("whitegrid", {'axes.grid': False})

    # Left panel: box plot
    box_ax = plt.subplot(1, 2, 1)
    sns.boxplot(x=variable, data=df, ax=box_ax)
    box_ax.set_xlabel('')
    box_ax.set_title('Box Plot', fontsize=30)

    # Right panel: distribution
    dist_ax = plt.subplot(1, 2, 2)
    sns.histplot(data=df, x=variable, element='step', kde=True, color='blue', ax=dist_ax)
    dist_ax.set_xlabel('')
    dist_ax.set_ylabel('')
    dist_ax.set_title('Distribution', fontsize=30)

    # Figure-level title
    plt.suptitle(f'{variable}', fontsize=45)
    plt.tight_layout()
    plt.show()
    
    
def explore_multivariate(train, target, cat_vars, quant_vars):
    '''
    Show three multivariate views of train, each colored by target:
    a swarm-plot grid, a violin-plot grid, and a pairplot of quant_vars.

    Parameters
    ----------
    train : DataFrame to plot.
    target : column name passed as hue to every plot.
    cat_vars : categorical column names, one grid column each.
    quant_vars : quantitative column names, one grid figure each, also used
        as the pairplot variables.
    '''
    plot_swarm_grid_with_color(train, target, cat_vars, quant_vars)
    plt.show()
    plot_violin_grid_with_color(train, target, cat_vars, quant_vars)
    plt.show()
    sns.pairplot(data=train, vars=quant_vars, hue=target)
    plt.show()

def plot_swarm_grid_with_color(train, target, cat_vars, quant_vars):
    '''
    For every quantitative variable draw one figure holding a row of swarm
    plots, one per categorical variable, colored by target.

    Parameters
    ----------
    train : DataFrame to plot.
    target : column name passed as hue.
    cat_vars : categorical column names (x-axis of each panel).
    quant_vars : quantitative column names (shared y-axis of each figure).
    '''
    cols = len(cat_vars)
    if cols == 0:
        # No categorical variable means no panel to draw.
        return
    for quant in quant_vars:
        # squeeze=False keeps a 2-D array of Axes even for a single column,
        # so the row can always be iterated.
        _, axes = plt.subplots(nrows=1, ncols=cols, figsize=(16, 6), sharey=True, squeeze=False)
        for panel, cat in zip(axes[0], cat_vars):
            sns.swarmplot(x=cat, y=quant, data=train, ax=panel, hue=target, palette="Set2")
            panel.set_xlabel('')
            panel.set_ylabel(quant)
            panel.set_title(cat)
        
def plot_violin_grid_with_color(train, target, cat_vars, quant_vars):
    '''
    For every quantitative variable draw one figure holding a row of split
    violin plots, one per categorical variable, colored by target.

    Parameters
    ----------
    train : DataFrame to plot.
    target : column name passed as hue.
    cat_vars : categorical column names (x-axis of each panel).
    quant_vars : quantitative column names (shared y-axis of each figure).
    '''
    cols = len(cat_vars)
    if cols == 0:
        # No categorical variable means no panel to draw.
        return
    for quant in quant_vars:
        # squeeze=False keeps a 2-D array of Axes even for a single column,
        # so the row can always be iterated.
        _, axes = plt.subplots(nrows=1, ncols=cols, figsize=(16, 4), sharey=True, squeeze=False)
        for panel, cat in zip(axes[0], cat_vars):
            sns.violinplot(x=cat, y=quant, data=train, split=True,
                           ax=panel, hue=target, palette="Set2")
            panel.set_xlabel('')
            panel.set_ylabel(quant)
            panel.set_title(cat)

In [None]:
# Pairplots with regression lines for the selected columns.
# NOTE(review): `train` and `cols` are not defined in any cell visible here —
# confirm they exist before this cell runs on a fresh kernel.
plot_variable_pairs(train, cols)

In [None]:
# Correlation between the 'beds' and 'tax_value' columns of `train`.
# NOTE(review): `train` is not defined in any cell visible here — confirm it
# exists and carries these columns before running on a fresh kernel.
correlation_exploration(train, 'beds', 'tax_value')

In [None]:
# hypothesis testing
# Two-sample t-test on tax_value between homes above 1600 sqft and homes at
# or below 1600 sqft.
# NOTE(review): `train` is not defined in any cell visible here.

null_hypothesis = "Houses > 1600 square feet are independent of their tax value price"
alternative_hypothesis = "Houses <= 1600 square feet have a significant outcome on their tax value price"
a = 0.05 #a for alpha

# Both groups are split on the same column (sqft) so that they partition the data.
big_house = train[train.sqft > 1600]
small_house = train[train.sqft <= 1600]
t, p = stats.ttest_ind(big_house.tax_value, small_house.tax_value)

if p < a:
    print(f'Reject null hypothesis that: {null_hypothesis}')
    print (f'There is evidence to suggest: {alternative_hypothesis}')
else:
    print(f'Fail to reject null hypothesis that: {null_hypothesis} There is not sufficient evidence to reject it.')

In [None]:
# univariate study
univariate_study = [
    'R_avg_SUB_ATT',
    'R_total_rounds_fought',
    'R_Height_cms',
    'R_Reach_cms',
    'R_avg_CTRL_time(seconds)',
    'R_avg_SIG_STR_pct',
]

# Plot each feature, then print its describe() summary underneath.
for feature in univariate_study:
    explore_univariate(train, feature)
    summary = train[feature].describe()
    print(f'Summary Statistics for {feature}\n{summary}')

In [None]:
### compares spells
# One bar per index entry of spellsTotals, height taken from its 'count' column,
# colored with the nine-color palette below.
# NOTE(review): `spellsTotals` is not defined in any cell visible here.
spellColors = ["#6E2C00","#1A5276","#9A7D0A","#F1C40F","#3498DB","#58D68D","#E74C3C","#F39C12","#8E44AD"]
sns.barplot(x=spellsTotals.index,y=spellsTotals['count'],palette=spellColors)

In [None]:
# for champions picks and bans
# Two side-by-side horizontal count plots: picks on ax1, bans on ax2.
# NOTE(review): `teams`, `sortedBans` and `data` are not defined in any cell visible here.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize=(15,30))
# NOTE(review): plt.xticks acts on the current Axes only (presumably ax2, the last
# one created), so ax1's tick labels are not rotated — confirm this is intended.
plt.xticks(rotation=90)
sns.countplot(y=teams, data=data, ax=ax1)
sns.countplot(y=sortedBans, data=data, ax=ax2)
ax1.set_title('Champion Picks')
ax2.set_title('Champion Bans')

In [None]:
# barplot for who chose to be tanks, assassin, mage, etc
# One bar per index entry of tagsTotals, height taken from its 'count' column.
# NOTE(review): `tagsTotals` is not defined in any cell visible here.
sns.barplot(x=tagsTotals.index,y=tagsTotals['count'])

In [None]:
# barplots for the following ideas
# One panel per "first objective" label, laid out on a 2x3 grid; panel k plots
# the k-th column of firstSort against its index.

plotColors = ['#3498DB', '#E74C3C', '#BDC3C7']
firstLabels = [
    'First Blood', 'First Tower', 'First Inhibitor',
    'First Baron', 'First Dragon', 'First Rift Herald',
]
nrows, ncols = 2, 3
fig = plt.figure(figsize=(15, 10))
for position, label in enumerate(firstLabels, start=1):
    ax = fig.add_subplot(nrows, ncols, position)
    column = firstSort.columns[position - 1]
    sns.barplot(x=firstSort.index, y=firstSort[column], palette=plotColors, ax=ax)
    ax.set_ylabel('Count')
    # Hide the y tick labels on every panel.
    ax.yaxis.set_ticklabels([])
    ax.set_title(label)

In [None]:
# pie chart for the distribution of winner for blue and red

sides = ['Blue', 'Red']
def count_win_on_side(row):
    """
    Return a Series indexed by `sides` flagging the winning side of one row:
    [1, 0] when row['winner'] equals row['blue'], otherwise [0, 1].
    """
    blue_won = row['winner'] == row['blue']
    flags = [1, 0] if blue_won else [0, 1]
    return pd.Series(flags, sides)

# Share of wins per side: column-wise mean of the per-row win flags.
data_sides = df.apply(count_win_on_side, axis=1).mean()

fig, ax = plt.subplots(figsize=(20, 7), subplot_kw={'aspect': 'equal'})

colors = ['#51acc6', '#ea96a3']
ax.pie(data_sides, colors=colors, labels=sides)

ax.set_title('Distribution of the winning percentage by side', pad=20)
ax.axis('equal')
plt.show()

In [None]:
# Game length distribution: box plot, ECDF, histogram (count + share) and
# per-year density curves of df['gamelength'].

# Setstyle options
sns.set_style('whitegrid')
sns.palplot(sns.color_palette('Blues', 20))
colors = sns.color_palette('Blues', 20)

# Create Figure
# A bare figure is enough: every panel is placed with subplot2grid below, so a
# pre-built 2x4 grid would only leave empty axes underneath the panels.
fig = plt.figure(figsize=(16,14))
fig.suptitle('Game Length Distribution', x=0.065, y=1.03, fontsize=24, fontweight='bold',
             horizontalalignment='left')
fig.subplots_adjust(top=0.9)

percentiles = np.array([25, 50, 75])
ptiles_gl = np.percentile(df['gamelength'], percentiles)

# Formatter that renders a fraction as a whole percent. Unlike fixed tick labels
# it stays correct when the axis limits (and therefore the ticks) change later.
as_percent = plt.FuncFormatter(lambda value, _pos: '{:3.0f}%'.format(value * 100))

# Create Subplots

# 1 Box and Whisker
p1 = plt.subplot2grid((2,4), (0,0), colspan=1)
sns.boxplot(y=df['gamelength'], color=colors[14], ax=p1)
p1.tick_params(labelsize=14)
p1.set_xlabel('All Games', fontsize=18)
p1.set_ylabel('Minutes', fontsize=18, fontweight='bold')

# 2 ECDF Plot
p2 = plt.subplot2grid((2,4), (0,1), colspan=3)
x = np.sort(df['gamelength'])
y = np.arange(1, len(x) + 1) / len(x)
p2.plot(x, y, marker='.', linestyle='none', color=colors[16])
p2.plot(ptiles_gl, percentiles/100, marker='D', color='red', linestyle='none')

# 2 ECDF Formatting
p2.yaxis.set_major_formatter(as_percent)
p2.set_xticks(np.arange(0, 85, 5))
p2.tick_params(labelsize=14)
p2.set_xlabel('Minutes', fontsize=18, fontweight='bold')
p2.set_ylabel('ECDF', fontsize=18, fontweight='bold')
p2.margins(0.02)

# Annotate the quartiles with the values computed from the data instead of
# hand-typed minutes, so the text always matches the red markers.
for pct, minutes in zip(percentiles, ptiles_gl):
    p2.annotate(f'{pct}% of games were {minutes:.0f} minutes or less',
                xy=(minutes, pct / 100), xytext=(minutes + 5, pct / 100 - 0.02),
                fontsize=18, arrowprops=dict(facecolor='black'))

# 3 Histogram Count
p3_count = plt.subplot2grid((2,4), (1,0), colspan=2)
p3_count.hist(df['gamelength'], bins=80, color=colors[14])
p3_count.tick_params(labelsize=14)
p3_count.set_xlabel('Minutes', fontweight='bold', fontsize=18)
p3_count.set_ylabel('Count of All Games', fontsize=18, fontweight='bold')

# 3 Histogram Percentage - Second Y Axis for Percent (To DO - align Y2 ytick values to Y1 ytick lines)
# Equal weights of 1/N turn the bar heights into shares of all games.
weights = np.ones_like(df['gamelength']) / len(df['gamelength'])
p3 = p3_count.twinx()
p3.hist(df['gamelength'], bins=80, weights=weights, color=colors[14])
p3.yaxis.set_major_formatter(as_percent)
p3.tick_params(labelsize=14)
# Positional flag: the `b=` keyword of grid() is not accepted by newer matplotlib.
p3.grid(False)

# 4 Distribution Plot across Years
p4 = plt.subplot2grid((2,4), (1,2), colspan=2)
year_colors = {2014: 'r', 2015: 'grey', 2016: 'y', 2017: 'g'}
# kdeplot replaces the deprecated distplot(hist=False): one density curve per year.
for year, line_color in year_colors.items():
    sns.kdeplot(df.loc[df['Year'] == year, 'gamelength'], color=line_color, label=str(year), ax=p4)
sns.kdeplot(df['gamelength'], color='b', label='All Years', ax=p4)
# kdeplot does not draw the legend by itself.
p4.legend()

# Formatting
p4.yaxis.set_major_formatter(as_percent)
p4.tick_params(labelsize=14)
p4.set_ylabel('Percent of All Games\n', fontsize=18, fontweight='bold')
p4.set_xlabel('Minutes', fontsize=18, fontweight='bold')

# Show everything
plt.tight_layout()
plt.show()

In [None]:
# Cumulative kills over time for the first `games` game IDs, one panel per game.
# NOTE(review): `melt_kills`, `color_win` and `color_lose` are not defined in any
# cell visible here.
lst_gameID = pd.unique(df['gameID'])
games = 4
lst_games = list(np.arange(0,games,1))

# Bare figure: each game gets its own panel through add_subplot below, so no
# full-figure Axes is created up front.
fig = plt.figure(figsize=(14,16))
fig.suptitle('Cumulative Kills by Game', fontsize= 24, fontweight='bold', x=0.04, y=1.02, horizontalalignment='left')
fig.subplots_adjust(top=0.85)

# zip stops at the shorter sequence, so at most `games` panels are drawn.
for c, gameID in zip(lst_games, lst_gameID):
    ax = fig.add_subplot(games, 1, c+1)

    df_kills = melt_kills[(melt_kills['gameID']==gameID) & melt_kills['minute_bin'].notnull()]

    # f-string so that non-string game IDs (e.g. integers) do not raise.
    ax.set_title(f'GameID = {gameID}', fontsize=14, loc='left')
    sns.regplot(x="value", y="action_count", y_jitter=True, data=df_kills[df_kills['win']==True], color=color_win,
                label='winning team', ax=ax)
    sns.regplot(x="value", y="action_count", y_jitter=True, data=df_kills[df_kills['win']==False], color=color_lose,
                label='losing team', ax=ax)
    ax.set_yticks(np.arange(0, 25, 2))
    ax.set_xticks(np.arange(0, 45, 1))
    ax.set_ylabel('Cumulative Kills')
    ax.set_xlabel('Minute')

    ax.legend(loc='best')

plt.tight_layout()
plt.show()

In [None]:
# Cumulative kills over time across all games, winners vs. losers, on one Axes.
has_minute = melt_kills['minute_bin'].notnull()
df_kills = melt_kills[has_minute]

fig, ax = plt.subplots(figsize=(14,6))
fig.suptitle('Cumulative Kills by Team', fontsize= 24, fontweight='bold', x=0.04, y=1.04, horizontalalignment='left')
# fig.subplots_adjust(top=0.85)

ax = plt.subplot(111)

# Same regression plot for each outcome; only the filter, color and label differ.
team_styles = (
    (True, color_win, 'winning team'),
    (False, color_lose, 'losing team'),
)
for won, team_color, team_label in team_styles:
    sns.regplot(x="value", y="action_count", y_jitter=True,
                data=df_kills[df_kills['win']==won], color=team_color,
                label=team_label, scatter_kws={'s':2})

# Ticks every 2 units, extending just past the largest observed value.
kill_ticks = np.arange(0, max(df_kills['action_count'])+2, 2)
minute_ticks = np.arange(0, max(df_kills['value'])+2, 2)
plt.yticks(kill_ticks)
plt.xticks(minute_ticks)
plt.ylabel('Cumulative Kills')
plt.xlabel('Minute')

ax.legend(loc='best')

plt.tight_layout()
plt.show()


In [None]:
# firstInhibitor vs winner - stats
# Mean of `winner` per firstInhibitor value, highest mean first.
(
    df[["firstInhibitor", "winner"]]
    .groupby(["firstInhibitor"], as_index=False)
    .mean()
    .sort_values(by="winner", ascending=False)
)

In [None]:
# firstBaron vs winner -stats
# Mean of `winner` per firstBaron value, highest mean first.
(
    data_game[["firstBaron", "winner"]]
    .groupby(["firstBaron"], as_index=False)
    .mean()
    .sort_values(by="winner", ascending=False)
)

In [None]:
## Heatmap for after we decide features
# Annotated correlation matrix of three "first objective" columns and `winner`.
# NOTE(review): `data_game` is not defined in any cell visible here.
sns.heatmap(data_game[["firstInhibitor","firstBaron","firstRiftHerald","winner"]].corr(),annot = True)
plt.show()

In [None]:
## plt for game duration
def pltDuration(df, bins=200, max_seconds=4200, tick_step=300):
    '''
    Plot a histogram of df['gameDuration'].

    Parameters
    ----------
    df : DataFrame with a 'gameDuration' column (labelled in seconds on the axis).
    bins : number of histogram bins.
    max_seconds : exclusive upper bound of the x-axis tick positions.
    tick_step : spacing between x-axis ticks.
    '''
    plt.figure(figsize=(15, 10))
    plt.hist(df['gameDuration'], bins=bins)
    plt.xticks(np.arange(0, max_seconds, tick_step))
    plt.xlabel("gameDuration (s)", fontsize=13)
    plt.ylabel('Frequency', fontsize=13)
    plt.title('GameDuration Distribution', fontsize=15)
    plt.show()
pltDuration(df)

In [None]:
# Bar chart of the win rate (in %) among games where each "first objective"
# column equals 1.
# NOTE(review): `data` is not defined in any cell visible here.
labels = ('firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald')

# Share of rows with winner == 1 among the rows where the event column is 1.
# Computed directly on `winner`, so the result does not depend on how many
# values of an unrelated column happen to be non-null.
probabilities = [
    data.loc[data[event] == 1, 'winner'].eq(1).mean() * 100
    for event in labels
]

y_pos = np.arange(len(labels))
plt.figure(figsize=(25,15))
plt.bar(y_pos, probabilities, align='center', alpha=1)
plt.xticks(y_pos, labels, fontsize = 30)
plt.yticks(fontsize = 30)
plt.ylabel('Probability(%)', fontsize = 30)
plt.title('Winning Probability when Team 1 got FirstXXX', fontsize = 40)
# Write each value on top of its bar; descriptive loop names avoid overwriting
# short globals such as the significance level `a`.
for bar_x, win_pct in zip(y_pos, probabilities):
    plt.text(bar_x, win_pct, f'{win_pct:.3f}%', ha='center', va='bottom', fontsize=30)
plt.show()

In [None]:
# pos correlation chart
# Annotated heatmap of the pairwise correlations of X's columns (cast to float).
# NOTE(review): `X` is not defined in any cell visible here.
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(X.astype(float).corr(),linewidths=0.1,vmax=1.0,
            square=True,linecolor='white',annot=True)
plt.xticks(rotation=90)
# A rotation of 360 degrees leaves the y tick labels horizontal.
plt.yticks(rotation=360)
plt.show()

In [None]:
### barcharts to compare
# One panel per "first objective" label, laid out on a 2x3 grid; panel k plots
# the k-th column of firstSort against its index.
# NOTE(review): this cell repeats the earlier "barplots for the following ideas" cell.
plotColors = ['#3498DB', '#E74C3C', '#BDC3C7']
firstLabels = [
    'First Blood', 'First Tower', 'First Inhibitor',
    'First Baron', 'First Dragon', 'First Rift Herald',
]
nrows, ncols = 2, 3
fig = plt.figure(figsize=(15, 10))
for position, label in enumerate(firstLabels, start=1):
    ax = fig.add_subplot(nrows, ncols, position)
    column = firstSort.columns[position - 1]
    sns.barplot(x=firstSort.index, y=firstSort[column], palette=plotColors, ax=ax)
    ax.set_ylabel('Count')
    # Hide the y tick labels on every panel.
    ax.yaxis.set_ticklabels([])
    ax.set_title(label)