# League Creep Score Analysis

**Name(s)**: Haris Saif, Ernest Ibarolle

**Website Link**: https://eibarolle.github.io/LoL-Creep-Score-Study/

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

# from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [None]:
data = Path('data') / 'LoL_2022.csv'
df = pd.read_csv(data, dtype='unicode')


In [5]:
df.groupby('champion').count().sort_values(by='year', ascending=False)

Unnamed: 0_level_0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
champion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nautilus,4455,4455,875,4455,4455,3358,4455,4455,4455,4455,...,3357,3357,3357,3357,3357,3357,3357,3357,3357,3357
Aphelios,4075,4075,903,4075,4075,3223,4075,4075,4075,4074,...,3028,3028,3028,3028,3028,3028,3028,3028,3028,3028
Jinx,3931,3931,812,3931,3931,3107,3931,3931,3931,3930,...,2940,2940,2940,2940,2940,2940,2940,2940,2940,2940
Viego,3882,3882,762,3882,3882,3026,3882,3882,3882,3880,...,2954,2954,2954,2954,2954,2954,2954,2954,2954,2954
Gnar,3068,3068,596,3068,3068,2460,3068,3068,3068,3063,...,2367,2367,2367,2367,2367,2367,2367,2367,2367,2367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Aurelion Sol,6,6,0,6,6,5,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
Rammus,6,6,0,6,6,3,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
Master Yi,4,4,0,4,4,2,4,4,4,4,...,3,3,3,3,3,3,3,3,3,3
Fizz,3,3,0,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


In [17]:
df['earnedgold']

0          7164
1          5368
2          5945
3          6835
4          2908
          ...  
150175     8667
150176    10416
150177     4811
150178    26462
150179    39998
Name: earnedgold, Length: 150180, dtype: object

In [19]:
df.columns.values

array(['gameid', 'datacompleteness', 'url', 'league', 'year', 'split',
       'playoffs', 'date', 'game', 'patch', 'participantid', 'side',
       'position', 'playername', 'playerid', 'teamname', 'teamid',
       'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1',
       'pick2', 'pick3', 'pick4', 'pick5', 'gamelength', 'result',
       'kills', 'deaths', 'assists', 'teamkills', 'teamdeaths',
       'doublekills', 'triplekills', 'quadrakills', 'pentakills',
       'firstblood', 'firstbloodkill', 'firstbloodassist',
       'firstbloodvictim', 'team kpm', 'ckpm', 'firstdragon', 'dragons',
       'opp_dragons', 'elementaldrakes', 'opp_elementaldrakes',
       'infernals', 'mountains', 'clouds', 'oceans', 'chemtechs',
       'hextechs', 'dragons (type unknown)', 'elders', 'opp_elders',
       'firstherald', 'heralds', 'opp_heralds', 'void_grubs',
       'opp_void_grubs', 'firstbaron', 'barons', 'opp_barons',
       'firsttower', 'towers', 'opp_towers', 'firstmidtower',
       'fi

In [8]:
fig = px.histogram(df, x='champion')
fig

## Step 2: Data Cleaning and Exploratory Data Analysis

In [9]:
# TODO

## Step 3: Assessment of Missingness

In [10]:
# TODO

## Step 4: Hypothesis Testing

### Null Hypothesis: 
The proportion of "winning" results among teams with a higher creep score/minute (cspm) from the 2022 League of Legends Dataframe is equal to 0.5. This null hypothesis is chosen because the team rows are from matches against each other, meaning in all cases, one team must win, and the other must lose. Thus, the proportion of "winning" results is ${1}/{2}$, or 0.5.
### Alternative Hypothesis: 
The proportion of "winning" results among teams with a higher creep score/minute from the 2022 League of Legends Dataframe is greater than 0.5. This alternative hypothesis is chosen because we hypothesize that with a larger creep score, teams will be able to accumulate more gold for better stats.
### Test statistic: 
The win rate proportion for teams with a higher creep score/minute. This is chosen because the data can be cleaned and grouped into both results and the team with the higher cspm.
### Significance Level: 
$\alpha$ = 0.05. This significance level is chosen because of its balance between finding significant patterns and reducing statistical errors.

In [None]:
teams = df[(df['participantid'] == '100') | (df['participantid'] == '200')]
teams = teams.dropna(subset=['cspm'])
teams = teams.reset_index(drop=True)
teams['higher_cspm'] = False  # Default all rows to False

# Compare each first row in a pair with the next row
teams.loc[teams.index % 2 == 0, 'higher_cspm'] = teams['cspm'] > teams['cspm'].shift(-1)

# For the second team in each pair, set 'higher_cspm' to the opposite of the first team in the pair
teams.loc[teams.index % 2 == 1, 'higher_cspm'] = teams['cspm'] > teams['cspm'].shift(1)
cleaned_teams = teams[['higher_cspm', 'result']]
cleaned_teams['result'] = cleaned_teams['result'].astype(int)
cleaned_teams.groupby('higher_cspm').mean()
cleaned_teams
permuted_wrs = []
for _ in range(5000):
    permutated_cspm = np.random.permutation(cleaned_teams['higher_cspm'])
    permuted_wrs.append(cleaned_teams[permutated_cspm == True]['result'].mean())
permuted_wrs = np.array(permuted_wrs)
observed_proportion = np.float64(cleaned_teams.groupby('higher_cspm').mean())[1]
p_value = (permuted_wrs > observed_proportion).sum() / len(permuted_wrs)
p_value < 0.05



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



np.float64(0.0)

### Conclusion: 
Because the p-value for the permutation test was lower than 0.05, we reject the null hypothesis that the proportion of "winning" results among teams with a higher creep score/minute (cspm) from the 2022 League of Legends Dataframe is equal to 0.5.

## Step 5: Framing a Prediction Problem

In [12]:
# TODO

## Step 6: Baseline Model

In [None]:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

players = df[(df['participantid'] != '100') & (df['participantid'] != '200')]

features = ["position", "totalgold"]
target = "cspm"
X = players[features]
y = players[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = ["position"]
numerical_features = ["totalgold"]

categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numerical_transformer, numerical_features),
    ]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", LinearRegression())])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

Mean Squared Error: 1.4492408471790212


### Model Summary:



## Step 7: Final Model

In [14]:
# TODO

## Step 8: Fairness Analysis

In [15]:
# TODO