# First Pick Predictor

**Name(s)**: Michael Kroyan

**Website Link**: https://ghost-written.github.io/first-pick-predictor/

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px

pd.options.plotting.backend = 'plotly'

import plotly.io as pio
pio.renderers.default = "browser"

from dsc80_utils import * # Feel free to uncomment and use this.
from scipy.stats import binomtest
from scipy.stats import ks_2samp

pd.set_option('display.max_rows', 100)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_score

## Step 1: Introduction

In [3]:
# I'm interested in:

# Using team compositions to predict whether a team is likelier to win or lose.
# Using team compositions to predict gold, XP, or kill diff at 25.
# Using the champion pick list to predict the first ban.
# Using the champion ban list to predict the first pick.

# I think that I will use the champion ban list to predict the first pick. Banning comes before picking, chronologically speaking, 
# so the inverse would end up being in an invalid order. While the other predictions are interesting, I feel like the pick list depends 
# very directly on the ban list. Whether a team wins or loses is somewhat influenced by champions, but also by skill.

## Step 2: Data Cleaning and Exploratory Data Analysis

In [4]:
# Read in the DataFrame we are going to use for this data analysis. 

df = pd.read_csv("2024_LoL_esports_match_data_from_OraclesElixir.csv", low_memory=False)

In [5]:
# This is the data we will need for the question we intend to answer. 

# We are retaining gameid and participantid for pre-processing purposes. 
# We are going to move all data from both halves of the same game into each row and then drop the row for the team that picks second. 
# We are simply going to drop any rows with missingness. 
# For the purposes of our predictions, we need complete data, and there is no great way to impute data for these nominal categories.

# For the purposes of most of my Exploratory Data Analysis, I will, however, refer back to the complete data set, from which we can draw 
# some interesting conclusions. We will come back to this version of the data set later.

eval_df = pd.read_csv("2024_LoL_esports_match_data_from_OraclesElixir.csv", low_memory=False)
eval_df = eval_df[eval_df["participantid"].isin([100, 200])]
eval_df = eval_df[['gameid', 'participantid', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'patch', 'teamname']]

In [6]:
# We add the position of pick1 to use as training data later. This is done by matching the gameids and champions of a 
# lookup table with the positions in it with the gameids and pick1s of the team data rows so as to attach the position 
# corresponding to the champion in the pick1 spot for that game.

positions = df[(~df["participantid"].isin([100, 200])) & df["position"].notna()]
positions = positions[["gameid", "champion", "position"]]

eval_df = eval_df.merge(positions, left_on=["gameid", "pick1"], right_on=["gameid", "champion"], how="left")
eval_df = eval_df.drop(columns="champion", axis=1)
eval_df

Unnamed: 0,gameid,participantid,ban1,ban2,...,pick1,patch,teamname,position
0,10660-10660_game_1,100,Akali,Nocturne,...,Kalista,13.24,LNG Esports,bot
1,10660-10660_game_1,200,Poppy,Ashe,...,Renata Glasc,13.24,Rare Atom,sup
2,10660-10660_game_2,100,Nocturne,Udyr,...,Neeko,13.24,LNG Esports,mid
3,10660-10660_game_2,200,Poppy,Ashe,...,Kalista,13.24,Rare Atom,bot
4,10660-10660_game_3,100,Rell,Nocturne,...,Neeko,13.24,LNG Esports,mid
...,...,...,...,...,...,...,...,...,...
19603,LOLTMNT02_193448,200,Poppy,Skarner,...,Nocturne,14.23,OKSavingsBank BRION,jng
19604,LOLTMNT02_194400,100,LeBlanc,Poppy,...,Skarner,14.23,OKSavingsBank BRION,jng
19605,LOLTMNT02_194400,200,Vi,Nocturne,...,Aurora,14.23,Dplus KIA,mid
19606,LOLTMNT02_194401,100,Vi,Renekton,...,Aurora,14.23,Dplus KIA,mid


In [7]:
# We add the player name to use as training data later. This is achieved the same way as we added pick1.

players = df[(~df["participantid"].isin([100, 200])) & df["playername"].notna()]
players = players[["gameid", "champion", "playername"]]

eval_df = eval_df.merge(players, left_on=["gameid", "pick1"], right_on=["gameid", "champion"], how="left")
eval_df = eval_df.drop(columns="champion", axis=1)
eval_df

Unnamed: 0,gameid,participantid,ban1,ban2,...,patch,teamname,position,playername
0,10660-10660_game_1,100,Akali,Nocturne,...,13.24,LNG Esports,bot,GALA
1,10660-10660_game_1,200,Poppy,Ashe,...,13.24,Rare Atom,sup,Zorah
2,10660-10660_game_2,100,Nocturne,Udyr,...,13.24,LNG Esports,mid,Scout
3,10660-10660_game_2,200,Poppy,Ashe,...,13.24,Rare Atom,bot,Assum
4,10660-10660_game_3,100,Rell,Nocturne,...,13.24,LNG Esports,mid,Scout
...,...,...,...,...,...,...,...,...,...
19603,LOLTMNT02_193448,200,Poppy,Skarner,...,14.23,OKSavingsBank BRION,jng,HamBak
19604,LOLTMNT02_194400,100,LeBlanc,Poppy,...,14.23,OKSavingsBank BRION,jng,HamBak
19605,LOLTMNT02_194400,200,Vi,Nocturne,...,14.23,Dplus KIA,mid,ShowMaker
19606,LOLTMNT02_194401,100,Vi,Renekton,...,14.23,Dplus KIA,mid,ShowMaker


In [8]:
# We prepare the other team's bans in the same game to be added to our information. 
# We do this by reversing participantid in a copy dataframe and then extracting all bans with participantid labeled 100 (in reality, 200), 
# and then renaming their ban columns from ban1-5 to ban 6-10.

opponent_bans = eval_df[["gameid", "participantid", "ban1", "ban2", "ban3", "ban4", "ban5"]].copy()
opponent_bans["participantid"] = opponent_bans["participantid"].map({100: 200, 200: 100})
opponent_bans = opponent_bans[opponent_bans["participantid"] == 100]
opponent_bans = opponent_bans.drop(columns=["participantid"])

opponent_bans = opponent_bans.rename(columns={
    "ban1": "ban6",
    "ban2": "ban7",
    "ban3": "ban8",
    "ban4": "ban9",
    "ban5": "ban10",
})

opponent_bans

Unnamed: 0,gameid,ban6,ban7,ban8,ban9,ban10
1,10660-10660_game_1,Poppy,Ashe,Neeko,Vi,Jarvan IV
3,10660-10660_game_2,Poppy,Ashe,Rumble,Tristana,Lucian
5,10660-10660_game_3,Poppy,Ashe,LeBlanc,Sejuani,Vi
7,10660-10660_game_4,Rell,Nocturne,Ashe,Azir,Akali
9,10661-10661_game_1,Kalista,Nocturne,Neeko,Sejuani,Poppy
...,...,...,...,...,...,...
19599,LOLTMNT02_194058,K'Sante,Aurora,Skarner,Renekton,Maokai
19601,LOLTMNT02_193442,Aurora,Skarner,Ambessa,Renekton,Rakan
19603,LOLTMNT02_193448,Poppy,Skarner,Ambessa,Ashe,Varus
19605,LOLTMNT02_194400,Vi,Nocturne,Azir,Gragas,Maokai


In [9]:
# We merge the other team's bans in the same game into our DataFrame, on participantid which we manipulated in the previous cell, 
# and eliminate the duplicate rows.

eval_df = eval_df.merge(opponent_bans, on="gameid", how="left")
eval_df = eval_df[eval_df["participantid"] == 100]
eval_df = eval_df.drop(columns=["gameid", "participantid"], axis=1)

In [10]:
# We drop NaN rows. Our DataFrame is now ready.

eval_df = eval_df.dropna().reset_index(drop=True)
eval_df

Unnamed: 0,ban1,ban2,ban3,ban4,...,ban7,ban8,ban9,ban10
0,Akali,Nocturne,K'Sante,Lee Sin,...,Ashe,Neeko,Vi,Jarvan IV
1,Nocturne,Udyr,Renata Glasc,Nautilus,...,Ashe,Rumble,Tristana,Lucian
2,Rell,Nocturne,Tristana,Jarvan IV,...,Ashe,LeBlanc,Sejuani,Vi
3,Poppy,LeBlanc,Neeko,Sejuani,...,Nocturne,Ashe,Azir,Akali
4,Ashe,Akali,LeBlanc,Vi,...,Nocturne,Neeko,Sejuani,Poppy
...,...,...,...,...,...,...,...,...,...
8620,Sejuani,Vi,Ambessa,LeBlanc,...,Aurora,Skarner,Renekton,Maokai
8621,LeBlanc,Poppy,Azir,Bard,...,Skarner,Ambessa,Renekton,Rakan
8622,Renekton,Vi,K'Sante,Kalista,...,Skarner,Ambessa,Ashe,Varus
8623,LeBlanc,Poppy,Ambessa,Lee Sin,...,Nocturne,Azir,Gragas,Maokai


In [11]:
# We re-read the default csv for all our other later analysis.

df = pd.read_csv("2024_LoL_esports_match_data_from_OraclesElixir.csv", low_memory=False)
df.head(12)

Unnamed: 0,gameid,datacompleteness,url,league,...,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
0,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
1,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
2,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
3,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
4,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
5,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
6,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
7,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
8,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,
9,10660-10660_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=10660,DCup,...,,,,


In [12]:
# Univariate Analysis: Top 30 Picked Champions
# This selects and then graphs the Top 30 Picked Champions. I wanted to see if there were any outliers, 
# and I'm curious if my predictor later will also numerically favour the Champions whom are the most common in these data as a whole. 
# I think that graphing is generally pretty self explanatory.

champion_counts = df["champion"].value_counts().reset_index()
champion_counts.columns = ["champion", "count"]

top_champions = champion_counts.head(30)

fig = px.bar(
    top_champions,
    x="champion",
    y="count",
    title="Top 30 Champions By Pick Count"
)

fig.update_layout(
    width=1000,
    height=500
)

fig.show()

In [13]:
# Univariate Analysis: Top 30 Banned Champions
# This selects and then graphs the Top 30 Banned Champions. I wanted to see if there were any outliers, and I'm curious if my predictor later will also be the most influenced by the Champions whom are the most common in these data as a whole.
# We use eval_df, which only has one row per game, to make sure we don't repeat the bans for each player entry. I think that graphing is generally pretty self explanatory.

all_bans = pd.concat([eval_df[col] for col in ['ban1', 'ban2', 'ban3', 'ban4', 'ban5']], axis=0)
ban_counts = all_bans.dropna().value_counts().reset_index()
ban_counts.columns = ["champion", "count"]

top_bans = ban_counts.head(30)

fig = px.bar(
    top_bans,
    x="champion",
    y="count",
    title="Top 30 Champions By Ban Count"
)

fig.update_layout(
    width=1000, 
    height=500
)

fig.show()

In [14]:
# Bivariate Analysis: Arcane-Themed Pick vs. Ban Heatmap
# This selects and then graphs a heatmap for the Champions who appeared in Arcane. Every entry in this heatmap shows, when the Champion in that entry was picked, how many times they banned the Champion on the other side of the table. I wanted to see if the bans were roughly evenly distributed, or if some Champions are just never banned with certain picks. I think that graphing is generally self explanatory.

champion_subset = ["Caitlyn", "Ekko", "Heimerdinger", "Jayce", "Jinx", "Leblanc", "Orianna", "Singed", "Vi", "Viktor", "Warwick"]

# Concatenate all picks and bans.
pick_ban_df = pd.DataFrame({
    "pick": pd.concat([df[col] for col in ["pick1", "pick2", "pick3", "pick4", "pick5"]], ignore_index=True),
    "ban": pd.concat([df[col] for col in ["ban1", "ban2", "ban3", "ban4", "ban5"]], ignore_index=True)
})

# Only keep the ones for the champions we want to investigate.
pick_ban_df = pick_ban_df[pick_ban_df["pick"].isin(champion_subset) & pick_ban_df["ban"].isin(champion_subset)]

freqs = pd.crosstab(pick_ban_df["pick"], pick_ban_df["ban"])
fig = px.imshow(
    freqs,
    labels=dict(x="Banned Champion", y="Picked Champion", color="Frequency"),
    title="Pick vs Ban Frequency for Arcane's Champions",
    aspect="auto"
)

fig.update_layout(
    width=900, 
    height=800
)

fig.show()

In [15]:
# Bivariate Analysis: Ship-Themed Pick vs. Ban Percentile Heatmap
# This selects and then graphs a percentile heatmap for some Champions who are shipped together. Every entry in this heatmap shows, when the Champion in that entry was picked, how many times they banned the Champion on the other side of the table. I wanted to see if the bans were roughly evenly distributed, or if some Champions are just never banned with certain picks. I think that graphing is generally self explanatory. See previous cell for almost identical explanation.

champion_subset = ["Graves", "Twisted Fate", "Yasuo", "Ahri", "Braum", "Illaoi", "Diana", "Leona", "Jinx", "Ekko", "Caitlyn", "Violet", "Garen", "Lux"]

# Concatenate all picks and bans.
pick_ban_df = pd.DataFrame({
    "pick": pd.concat([df[col] for col in ["pick1", "pick2", "pick3", "pick4", "pick5"]], ignore_index=True),
    "ban": pd.concat([df[col] for col in ["ban1", "ban2", "ban3", "ban4", "ban5"]], ignore_index=True)
})

# Only keep the ones for the champions we want to investigate.
pick_ban_df = pick_ban_df[pick_ban_df["pick"].isin(champion_subset) & pick_ban_df["ban"].isin(champion_subset)]

# normalize = True is what makes this different from the previous one
freqs = pd.crosstab(pick_ban_df["pick"], pick_ban_df["ban"], normalize=True)
fig = px.imshow(
    freqs,
    labels=dict(x="Banned Champion", y="Picked Champion", color="Frequency"),
    title="Pick vs Ban Frequency for Shipped Champions",
    aspect="auto"
)

fig.update_layout(
    width=900, 
    height=800
)

fig.show()

In [16]:
# Pivot Table: Banned Champions By Side
# This concatenates the champion ban list while distinguishing which side banned them, then renders it as a pivot table. I wanted to see if there were significant differences here that should be taken into account.

ban_side_df = pd.concat([df[[col, "side"]].rename(columns={col: "champion"}) 
                         for col in ["ban1", "ban2", "ban3", "ban4", "ban5"]], ignore_index=True, axis=0)

pivot = pd.pivot_table(ban_side_df, index="champion", columns="side", aggfunc="size", fill_value=0)
print("Champion bans by side:")
pivot

Champion bans by side:


side,Blue,Red
champion,Unnamed: 1_level_1,Unnamed: 2_level_1
Aatrox,1842,1644
Ahri,1812,1794
Akali,1950,1776
Akshan,102,78
Alistar,2064,2742
...,...,...
Zeri,2400,1710
Ziggs,2748,2778
Zilean,18,12
Zoe,138,108


## Step 3: Assessment of Missingness

In [17]:
# We reload the default CSV just to be safe for our assessment of missingness.

df = pd.read_csv("2024_LoL_esports_match_data_from_OraclesElixir.csv", low_memory=False)
df[df["elders"].notna()]

Unnamed: 0,gameid,datacompleteness,url,league,...,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
190,LOLTMNT99_132542,complete,,TSC,...,7.0,7.0,14.0,20.0
191,LOLTMNT99_132542,complete,,TSC,...,20.0,20.0,47.0,7.0
202,LOLTMNT99_132665,complete,,TSC,...,11.0,11.0,15.0,17.0
203,LOLTMNT99_132665,complete,,TSC,...,17.0,17.0,28.0,11.0
214,LOLTMNT99_132755,complete,,TSC,...,7.0,7.0,11.0,10.0
...,...,...,...,...,...,...,...,...,...
117623,LOLTMNT02_193448,complete,,KeSPA,...,19.0,19.0,49.0,6.0
117634,LOLTMNT02_194400,complete,,KeSPA,...,11.0,11.0,26.0,11.0
117635,LOLTMNT02_194400,complete,,KeSPA,...,11.0,11.0,37.0,11.0
117646,LOLTMNT02_194401,complete,,KeSPA,...,7.0,7.0,9.0,8.0


In [18]:
# Missingness Dependency: "elders" depends on "participantid"
# This encodes whether elders are missing or not as the values 0 or 1, and then encodes each position with a different integer value. It then performs permutation tests and assesses the Kolmogorov-Smirnov statistic using scipy to determine the p-value of the distribution. This technique is taken from lecture.
# I know the missingness of "elders" depends on "participantid", because "elders" is only recorded for whole-game rows, meaning only rows where "participantid" is equal to 100 or 200. It should be strictly completely dependent.

col1 = "elders"
col2 = "participantid"

n_repetitions = 1000
df[f"missing_{col1}"] = df[col1].isna().astype(int)

x = df[f"{col2}"].dropna()
group = df.loc[x.index, f"missing_{col1}"]

observed_ks = ks_2samp(x[group == 1], x[group == 0]).statistic

ks_stats = []
for _ in range(n_repetitions):
    shuffled = np.random.permutation(group)
    ks_stat = ks_2samp(x[shuffled == 1], x[shuffled == 0]).statistic
    ks_stats.append(ks_stat)

p = np.mean(np.array(ks_stats) >= observed_ks)

print(f"p: {p:.10f}")

if p < 0.05:
    print(f"The value of the column '{col1}' is dependent on the value of the column '{col2}'.")
else:
    print(f"The value of the column '{col1}' is not dependent on the value of the column '{col2}'.")

p: 0.0000000000
The value of the column 'elders' is dependent on the value of the column 'participantid'.


In [19]:
# Missingness Dependency: "elders" does not depend on "patch"
# Same code as test in cell above, same idea. This technique is taken from lecture.
# I know the missingness of "elders" does not depend on "patch", because whether or not "elders" is recorded does not change with game updates. The boss recorded in "elders", Elder Dragons, exists across all patches of this year's dataset.

col1 = "elders"
col2 = "patch"

n_repetitions = 1000
df[f"missing_{col1}"] = df[col1].isna().astype(int)

x = df["result"].dropna()
group = df.loc[x.index, f"missing_{col1}"]

observed_ks = ks_2samp(x[group == 1], x[group == 0]).statistic

ks_stats = []
for _ in range(n_repetitions):
    shuffled = np.random.permutation(group)
    ks_stat = ks_2samp(x[shuffled == 1], x[shuffled == 0]).statistic
    ks_stats.append(ks_stat)

p = np.mean(np.array(ks_stats) >= observed_ks)

print(f"p: {p:.10f}")

if p < 0.05:
    print(f"The value of the column '{col1}' is dependent on the value of the column '{col2}'.")
else:
    print(f"The value of the column '{col1}' is not dependent on the value of the column '{col2}'.")

p: 0.9970000000
The value of the column 'elders' is not dependent on the value of the column 'patch'.


In [20]:
# We plot the empirical distribution of the test statistic (KS statistic) along with the observed statistic from the test which checked whether or not "elders" depended on "patch". I think that graphing is generally self explanatory.

fig = px.histogram(
    x=ks_stats,
    nbins=50,
    histnorm="probability",
    title="Empirical Distribution of KS ('elders' vs 'patch')",
    labels={"x": "KS Statistic", "y": "Proportion"}
)

fig.add_vline(x=observed_ks, line_color="red", line_width=3)
fig.add_annotation(
    text=f"Observed KS = {observed_ks:.3f}",
    x=observed_ks,
    y=0.05,
    showarrow=False,
    font=dict(color="red")
)

fig.show()

## Step 4: Hypothesis Testing

In [21]:
# This filters the dataframe to only include entries for Leona, and then runs a binomial hypothesis test on it to determine whether or not Leona's win rate is significantly less than 50%, at a significance level of a = 0.05. This technique is mostly taken from lecture.

# The null is H0: Leona’s win rate is not significantly less than 50%.
# The alternative is H1: Leona’s win rate is significantly less than 50%.

leona_df = df[df["champion"] == "Leona"]

leona_wins = leona_df["result"].sum()
leona_total = leona_df.shape[0]

hypothesis_test = binomtest(k=leona_wins, n=leona_total, p=0.5, alternative='less')

print(f"Leona's win rate: {leona_wins / leona_total:.2%}")
print(f"p: {hypothesis_test.pvalue:.10f}")

if hypothesis_test.pvalue < 0.05:
    print("Reject H0: Leona's win rate is significantly less than 50%.")
else:
    print("Fail to reject H0: No evidence Leona's win rate is less than 50%.")

Leona's win rate: 45.14%
p: 0.0000038041
Reject H0: Leona's win rate is significantly less than 50%.


## Step 5: Framing a Prediction Problem

In [22]:
# We are, as suggested in Step 1, going to use the champion ban list to predict the first pick. If the ban list is itself not informative enough, we can take advantage of the additional features we have included (patch, teamname, position, player) to see if they make a difference and improve our model.

## Step 6: Baseline Model

In [23]:
# The cleaned and modified DataFrame we will be using, copied just in case.

model_df = eval_df.copy()
model_df

Unnamed: 0,ban1,ban2,ban3,ban4,...,ban7,ban8,ban9,ban10
0,Akali,Nocturne,K'Sante,Lee Sin,...,Ashe,Neeko,Vi,Jarvan IV
1,Nocturne,Udyr,Renata Glasc,Nautilus,...,Ashe,Rumble,Tristana,Lucian
2,Rell,Nocturne,Tristana,Jarvan IV,...,Ashe,LeBlanc,Sejuani,Vi
3,Poppy,LeBlanc,Neeko,Sejuani,...,Nocturne,Ashe,Azir,Akali
4,Ashe,Akali,LeBlanc,Vi,...,Nocturne,Neeko,Sejuani,Poppy
...,...,...,...,...,...,...,...,...,...
8620,Sejuani,Vi,Ambessa,LeBlanc,...,Aurora,Skarner,Renekton,Maokai
8621,LeBlanc,Poppy,Azir,Bard,...,Skarner,Ambessa,Renekton,Rakan
8622,Renekton,Vi,K'Sante,Kalista,...,Skarner,Ambessa,Ashe,Varus
8623,LeBlanc,Poppy,Ambessa,Lee Sin,...,Nocturne,Azir,Gragas,Maokai


In [24]:
# The baseline model. It uses 10 features, columns ban1-ban10, to predict target pick1. We preprocess with a ColumnTransformer performing One-Hot Encoding and then insert it into the pipeline, which we then train with a train_test_split, and then fit. Accuracy is printed for analysis, using pipeline.score().

features = [f"ban{i}" for i in range(1, 11)]

banproc = ColumnTransformer(
    transformers=[
        ('transform', OneHotEncoder(handle_unknown="ignore"), features)
    ]
)

pl = Pipeline([
    ('preprocessor', banproc),
    ('classifier', RandomForestClassifier())
])

X = model_df[features]
y = model_df["pick1"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
pl.fit(X_train, y_train)

print("Accuracy:", pl.score(X_test, y_test))

Accuracy: 0.19710144927536233


## Step 7: Final Model

In [25]:
# I chose to test these hyperparameters because I thought that if there was any one issue with my model, it might be overfitting. All of these hyperparameters one way or another govern how fit the model is, so I thought they would be good ideas. I list the specific reasons for the ones I chose below.

# n_estimators, the number of trees in the forest, because I felt that increasing them might improve accuracy.
# max_depth, the maximum depth of each tree, because I was worried about overfitting.
# min_samples_leaf, the minimum samples on each leaf, because I was worried about overfitting.
# min_samples_split, the minimum number of samples required to split a node, because I was worried about overfitting.

# All testing was done with GridSearchCV.

hyperparameters = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2, 3],
}

rf_search = GridSearchCV(pl, hyperparameters, cv=3, verbose=2)
rf_search.fit(X_train, y_train)

print("RandomForestClassifier Parameters:", rf_search.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits



The least populated class in y has only 1 members, which is less than n_splits=3.



[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   3.4s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   3.4s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   3.4s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   6.8s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   6.9s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   6.8s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_spli

In [26]:
# The final model. The three new features encoded here are position, playername, and patch, which are all One-Hot Encoded along with the list of bans 1-10. This is a total of 13 features. We preprocess with a ColumnTransformer performing One-Hot Encoding and then insert it into the pipeline, which we then train with a train_test_split, and then fit. Accuracy is printed for analysis, using pipeline.score().

features = [f"ban{i}" for i in range(1, 11)] + ["position", "playername", "patch"]

banproc = ColumnTransformer(
    transformers=[
        ('transform', OneHotEncoder(handle_unknown="ignore"), features)
    ]
)

pl = Pipeline([
    ('preprocessor', banproc),
    ('classifier', RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200))
])

X = model_df[features]
y = model_df["pick1"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
pl.fit(X_train, y_train)

print("Accuracy:", pl.score(X_test, y_test))

Accuracy: 0.4736231884057971


## Step 8: Fairness Analysis

In [27]:
# For my fairness analysis, I'm going to check precision. I want to see if I guess champions for one of two heavily played positions more than another. Those positions are bot and mid, which are also decently similar in presence.

# Null hypothesis H0: Our model is fair. Its precision for bot lane is the same as its precision for mid lane.
# Alternative hypothesis H1: Our model is unfair. Its precision for bot lane is not the same as its precision for mid lane.

# To accomplish this, I effectively reuse the permutation test code from Step 3 with a little extra work.

n_permutations = 1000

position_df = X_test.copy()
position_df["y_actual"] = y_test.copy()
position_df["y_predicted"] = pl.predict(X_test)
position_df = position_df[position_df["position"].isin(["bot", "mid"])]

precision_bot = precision_score(
    position_df[position_df["position"] == "bot"]["y_actual"],
    position_df[position_df["position"] == "bot"]["y_predicted"],
    average="weighted",
    zero_division=0
)

precision_mid = precision_score(
    position_df[position_df["position"] == "mid"]["y_actual"],
    position_df[position_df["position"] == "mid"]["y_predicted"],
    average="weighted",
    zero_division=0
)

# observed value
observed_abs_diff = abs(precision_bot - precision_mid)

# here is the permutation test itself, looping the observed value code
abs_diffs = []
for _ in range(n_permutations):
    shuffled = np.random.permutation(position_df["position"])
    shuffled_df = position_df.copy()
    shuffled_df["shuffled_position"] = shuffled

    shuffled_bot = precision_score(
        shuffled_df[shuffled_df["shuffled_position"] == "bot"]["y_actual"],
        shuffled_df[shuffled_df["shuffled_position"] == "bot"]["y_predicted"],
        average="weighted",
        zero_division=0
    )

    shuffled_mid = precision_score(
        shuffled_df[shuffled_df["shuffled_position"] == "mid"]["y_actual"],
        shuffled_df[shuffled_df["shuffled_position"] == "mid"]["y_predicted"],
        average="weighted",
        zero_division=0
    )
    
    abs_diff = abs(shuffled_bot - shuffled_mid)
    abs_diffs.append(abs_diff)

# p-value
p = np.mean(np.array(abs_diffs) >= observed_abs_diff)

print(f"p: {p:.10f}")

if p < 0.05:
    print("Reject H0: Our model is unfair. Its precision for bot lane is not the same as its precision for mid lane.")
else:
    print("Fail to reject H0: Our model is fair. Its precision for bot lane is the same as its precision for mid lane.")

p: 0.1860000000
Fail to reject H0: Our model is fair. Its precision for bot lane is the same as its precision for mid lane.
