# Step one: Data analysis and cleaning

In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

In [2]:
# Reading the main data
df = pd.read_csv('games.csv')

print(f'Shape: {df.shape}')
df.columns

Shape: (51490, 61)


Index(['gameId', 'creationTime', 'gameDuration', 'seasonId', 'winner',
       'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
       'firstDragon', 'firstRiftHerald', 't1_champ1id', 't1_champ1_sum1',
       't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
       't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2', 't1_champ4id',
       't1_champ4_sum1', 't1_champ4_sum2', 't1_champ5id', 't1_champ5_sum1',
       't1_champ5_sum2', 't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
       't1_dragonKills', 't1_riftHeraldKills', 't1_ban1', 't1_ban2', 't1_ban3',
       't1_ban4', 't1_ban5', 't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
       't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2', 't2_champ3id',
       't2_champ3_sum1', 't2_champ3_sum2', 't2_champ4id', 't2_champ4_sum1',
       't2_champ4_sum2', 't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
       't2_towerKills', 't2_inhibitorKills', 't2_baronKills', 't2_dragonKills',
       't2_riftHer

In [3]:
# Reading the first champions json: champs['data'] maps champion id -> champion entry
import json

# A context manager closes the file even if json.load raises
with open('champion_info.json') as f:
    champs = json.load(f)

# Collect every row first and build the frame once, instead of creating an
# empty DataFrame and filling its columns afterwards
rows = [
    {'id': int(champion_id), 'name': champion_data['name']}
    for champion_id, champion_data in champs['data'].items()
]
champs_1_df = pd.DataFrame(rows, columns=['id', 'name'])

champs_1_df

Unnamed: 0,id,name
0,1,Annie
1,2,Olaf
2,3,Galio
3,4,Twisted Fate
4,5,Xin Zhao
...,...,...
133,429,Kalista
134,432,Bard
135,497,Rakan
136,498,Xayah


In [4]:
# The second json's structure is a little bit different: the ids are not the keys of
# champs_2['data']; instead, they are stored inside each champion entry

# A context manager closes the file even if json.load raises
with open('champion_info_2.json') as f:
    champs_2 = json.load(f)

rows = []
for champion_data in champs_2['data'].values():
    # There is one specific dirty entry, where the id is -1 and the rest is just filled with None
    if float(champion_data['id']) < 0:
        continue
    rows.append({
        'id': int(champion_data['id']),
        'name': champion_data['name'],
        'role': champion_data['tags'],
    })

# Build the frame in one go and sort without mutating in place; the original row
# labels are kept on purpose (later cells only merge on 'id')
champs_df = pd.DataFrame(rows, columns=['id', 'name', 'role']).sort_values('id')
champs_df

Unnamed: 0,id,name,role
57,1,Annie,[Mage]
52,2,Olaf,"[Fighter, Tank]"
89,3,Galio,"[Tank, Mage]"
73,4,Twisted Fate,[Mage]
112,5,Xin Zhao,"[Fighter, Assassin]"
...,...,...,...
37,429,Kalista,[Marksman]
68,432,Bard,"[Support, Mage]"
43,497,Rakan,[Support]
5,498,Xayah,[Marksman]


In [5]:
# Note that there aren't any null values since the data was previously cleaned by its author:
# the per-column null counts collapse to a single unique value (see the output of this cell)
df.isna().sum().unique()

array([0], dtype=int64)

In [None]:
!pip install 

In [14]:
# Distribution of the game duration
# distfit is a third-party package; the recorded run of this cell failed with
# ModuleNotFoundError, so it has to be installed before this cell can run
from distfit import distfit


dfit = distfit(todf=True)
# `results` is reused as a variable name by later cells
results = dfit.fit_transform(df['gameDuration'])

dfit.plot()

ModuleNotFoundError: No module named 'distfit'

Note that remade matches are included in this dataframe, so I will remove them. I will also remove the matches that lasted between 3 and 15 minutes: in that range a surrender (ff) is possible and an actual win is unlikely. Some of these short games may be early surrenders caused by a disconnect but, in my interpretation, removing them is reasonable.

In [None]:
# Count the games in each duration bucket: up to 180 s, and strictly between 180 s and 900 s
durations = df['gameDuration']
remake = int((durations <= 180).sum())
between_remake_and_surrender = int(((durations > 180) & (durations < 900)).sum())

print("remake: ", remake,"  between remake and surrender: ", between_remake_and_surrender)

In [None]:
# Games lasting strictly between 180 s and 900 s. This is not the last expression of
# the cell, so the notebook does not render it.
df[((df['gameDuration'] < 900) & (df['gameDuration']>180))]

# Independent copy of df taken before the duration filter and the extra columns
# are applied to df further down; the modelling cells work on this copy
df_model = df.copy()

In [None]:
# Number of rows left after subtracting 1309 games.
# NOTE(review): 1309 is presumably the "between remake and surrender" count printed
# above - confirm, and prefer computing it from the mask over hard-coding it
df.shape[0]-1309

In [None]:
# Removing remakes and early surrenders: keep only games that lasted at least 15 minutes.
# The previous mask (180 < duration < 900) still kept games of 180 s or less, which
# contradicts the stated goal of removing remade matches.
MIN_GAME_DURATION = 900  # seconds
# .copy() makes df an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning
df = df[df['gameDuration'] >= MIN_GAME_DURATION].copy()

In [None]:
# Plotting the game duration distribution again, now on the filtered df
# (same steps as the first distribution cell)
from distfit import distfit


dfit = distfit(todf=True)
results = dfit.fit_transform(df['gameDuration'])
dfit.plot()


### Analyzing champions' win percentage and pick/ban rate

In [None]:
# Analyzing champions' win percentage and pick/ban rate

# Collect the pick ('id') and ban ('ban') column names of each team by substring match.
# A column mentioning 't1' goes to team 1; otherwise it goes to team 2 when it mentions 't2'.
# dict.fromkeys keeps each name once, in order of appearance.
all_columns = list(df.columns)

champs_t1 = list(dict.fromkeys(
    name for name in all_columns if 'id' in name and 't1' in name))
champs_t2 = list(dict.fromkeys(
    name for name in all_columns if 'id' in name and 't1' not in name and 't2' in name))
bans_t1 = list(dict.fromkeys(
    name for name in all_columns if 'ban' in name and 't1' in name))
bans_t2 = list(dict.fromkeys(
    name for name in all_columns if 'ban' in name and 't1' not in name and 't2' in name))

In [None]:
# Extracting picked/banned percentage and getting the relevance:
# how many times a champion is picked when it's not banned
# (number of picks) / ((total games) - (number of bans))
# In my opinion, this number would infer the best bans

pick_columns = champs_t1 + champs_t2
df['picks'] = df[pick_columns].values.tolist()
df['bans'] = df[bans_t1 + bans_t2].values.tolist()
df['comp'] = df[pick_columns].values.tolist()

# One win flag per pick, in the same order as `picks` (team 1's columns come first)
team_1_won = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
team_2_won = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
df['win/lose'] = [team_1_won if winner == 1 else team_2_won for winner in df['winner']]

# explode() leaves object columns; cast to int so the sums and the merges on 'id'
# work on real integers
df_exploded = (
    df.explode(['picks', 'bans', 'win/lose'])
      .astype({'picks': int, 'bans': int, 'win/lose': int})
)

# Name the key column explicitly: value_counts().reset_index() labels its columns
# differently before and after pandas 2.0, so merging on 'index' is not portable
pick_id = df_exploded['picks'].value_counts().rename_axis('id').reset_index(name='picks')
ban_id = df_exploded['bans'].value_counts().rename_axis('id').reset_index(name='bans')
winrate_id = df_exploded.groupby('picks')['win/lose'].sum().rename_axis('id').reset_index()

# merge() joins two frames at a time, so the three tables are chained
df_champion_stats = pick_id.merge(ban_id, on='id').merge(winrate_id, on='id')

total_games = df.shape[0]
# NOTE(review): `bans` counts every ban slot, so a champion banned by both teams in
# the same game is counted twice in the denominator below - confirm this is intended
df_champion_stats['picked or banned (%)'] = (
    (df_champion_stats['picks'] + df_champion_stats['bans']) * 100 / total_games
)
df_champion_stats['relevance (%)'] = (
    df_champion_stats['picks'] / (total_games - df_champion_stats['bans']) * 100
)

# Attach champion name and role
df_champion_stats = pd.merge(champs_df, df_champion_stats, on='id')

percent_columns = ['picked or banned (%)', 'relevance (%)']
df_champion_stats[percent_columns] = df_champion_stats[percent_columns].round(2)
df_champion_stats['winrate (%)'] = round(
    100 * df_champion_stats['win/lose'] / df_champion_stats['picks'], 2
)

df_champion_stats = df_champion_stats.sort_values('winrate (%)', ascending=False)

df_champion_stats

In [None]:
# Question: which team composition has the highest winrate? (I will base it only on
# roles, since champions would be too specific)
# Each `picks` list is split in two halves: its first five entries and the rest
df['comp_1'] = [game_picks[:5] for game_picks in df['picks']]
df['comp_2'] = [game_picks[5:] for game_picks in df['picks']]


df[['winner','comp_1','comp_2']]

In [None]:
# Merging list-valued columns with a dataframe directly is awkward, so instead:
# explode to one row per pick, merge the champion info, then rebuild the lists per game

picks_long = df[['gameId','winner','comp_1','comp_2']].explode(['comp_1','comp_2'])

# Same champion table, renamed once per team so each merge adds its own name/role columns
team_1_info = champs_df.rename(columns={'id': 'comp_1','name':'name_1','role':'role_1'})
team_2_info = champs_df.rename(columns={'id': 'comp_2','name':'name_2','role':'role_2'})

# Here comes the tricky part: getting the team comps for each gameId
df_comps = (
    picks_long
    .merge(team_1_info, on='comp_1', sort=False)
    .merge(team_2_info, on='comp_2', sort=False)
    .sort_values('gameId')
    .groupby('gameId')
    .agg(
        {'comp_1': list,
         'comp_2': list,
         'name_1': list,
         'name_2': list,
         # the per-champion role lists are wrapped in one extra list here
         'role_1': lambda roles: [roles.tolist()],
         'role_2': lambda roles: [roles.tolist()],
         'winner': 'first'
        }
    )
    .reset_index()
)

df_comps
# Note that the array is nested, so we have to undo it, for that i found a solution online

def unnest_array(array):
    """Flatten a three-level nested collection of strings into a sorted list of unique values.

    Parameters
    ----------
    array : iterable of iterables of iterables of str
        For example ``[[['Tank', 'Mage'], ['Mage']]]``.

    Returns
    -------
    list of str
        The distinct strings, sorted case-insensitively. Sorting makes the same set
        of roles compare equal regardless of the order it was collected in.
    """
    # A set removes the duplicates; sorting before de-duplicating would be wasted
    # work because a set does not keep that order
    unique_items = {item for sublist in array for sublist_2 in sublist for item in sublist_2}
    return sorted(unique_items, key=str.lower)

for team in ('1', '2'):
    # Flatten the nested role lists into a sorted list of unique roles
    df_comps['role_' + team] = df_comps['role_' + team].map(unnest_array)

for team in ('1', '2'):
    # Sorting names since pick order doesn't matter
    df_comps['name_' + team] = df_comps['name_' + team].map(sorted)


In [11]:
df_comps

NameError: name 'df_comps' is not defined

In [None]:
# Get most picked roles:
from IPython.display import display

# Lists are unhashable, so the role lists are stored as strings before counting/comparing
df_comps['role_1'] = df_comps['role_1'].astype(str)
df_comps['role_2'] = df_comps['role_2'].astype(str)

# Only the last expression of a cell is rendered automatically, so the counts were
# silently discarded before; display both results explicitly
display(pd.concat([df_comps['role_1'], df_comps['role_2']]).value_counts())

# Games in which both teams ended up with the same set of roles
display(df_comps[df_comps['role_1'] == df_comps['role_2']])

In [None]:
# Get the most picked champion compositions:
# the sorted name lists are stored as strings so identical line-ups can be counted
for name_column in ('name_1', 'name_2'):
    df_comps[name_column] = df_comps[name_column].astype(str)

all_name_comps = pd.concat([df_comps['name_1'], df_comps['name_2']])
all_name_comps.value_counts()


Note that I got a result that is pretty odd: the most picked composition was repeated only four times. That probably happens because most of the popular champions either get picked by the other team or banned.

In my personal experience, the first composition is pretty odd (I confess that I don't remember what the meta was at the time), which explains why the other teams didn't ban those champions (Vayne and Kayle are very specific champions).


So i wont even bother to determine the best composition's winrate

In [None]:
# Calculating roles winrate
# Since the same roles can be picked by both teams, I will remove those cases
df_comps_winrate = df_comps[df_comps['role_1'] != df_comps['role_2']]

# One row per team and game: the team's role composition and whether it won (1) or lost (0).
# Built in one vectorised step: the earlier row-by-row pd.concat discarded its result and
# only looked at team-1 wins, and a set is not a valid `columns` argument.
team_1_won = (df_comps_winrate['winner'] == 1).astype(int)
results = pd.concat(
    [
        pd.DataFrame({'comp': df_comps_winrate['role_1'], 'winner': team_1_won}),
        pd.DataFrame({'comp': df_comps_winrate['role_2'], 'winner': 1 - team_1_won}),
    ],
    ignore_index=True,
)
        


In [None]:
# Another approach: select the winning compositions directly (team-1 wins only)

df_comps_winrate = df_comps[df_comps['role_1'] != df_comps['role_2']]

# Role composition of team 1 in the games team 1 won; the scalar 1 is broadcast to every row.
# (Assigning [Series, 1] to two columns of an empty frame at once is not a valid
# assignment, and a set is not a valid `columns` argument.)
team_1_winning_comps = df_comps_winrate.loc[df_comps_winrate['winner'] == 1, 'role_1']
results = pd.DataFrame({'comp': team_1_winning_comps, 'winner': 1})

results

In [None]:
# For every game, record the winning composition with winner=1 and the losing one
# with winner=0. `winner % 2 + 1` gives the other team's number: 1 -> 2 and 2 -> 1.

results_data = []
game_rows = zip(df_comps_winrate['winner'],
                df_comps_winrate['role_1'],
                df_comps_winrate['role_2'])
for winning_team, roles_team_1, roles_team_2 in game_rows:
    roles_by_team = {1: roles_team_1, 2: roles_team_2}
    losing_team = winning_team % 2 + 1

    results_data.append({'comp': roles_by_team[winning_team], 'winner': 1})
    results_data.append({'comp': roles_by_team[losing_team], 'winner': 0})

results = pd.DataFrame(results_data)


In [None]:
results.groupby('comp').value_counts()

In [None]:
# Wins ('sum') and games played ('count') per role composition
comp_report = results.groupby('comp').agg({'winner': ['sum', 'count']})

wins = comp_report[('winner', 'sum')]
games_played = comp_report[('winner', 'count')]
# Win rate in percent, stored next to the two aggregates under the same top-level label
comp_report[('winner', 'winrate')] = (100 * wins / games_played).round(2)

comp_report.sort_values(('winner', 'winrate'), ascending=False)

What stands out the most to me is ['Fighter', 'Marksman', 'Tank']; it's a pretty unusual composition.

### Starting objectives analysis:
 - 1st Inhib
 - 1st Baron
 - 1st Tower
 - 1st Dragon
 - 1st Herald
 - 1st Blood

In [None]:
# Without taking 0 values out of the equation
# Select the columns by name rather than by position (previously df.iloc[:, 4:11]):
# positions shift as soon as columns are dropped from df, e.g. when re-running the notebook
objective_columns = ['firstBlood', 'firstTower', 'firstInhibitor',
                     'firstBaron', 'firstDragon', 'firstRiftHerald']
df_1st = df[['winner'] + objective_columns]

# Share of all games (in %) in which the winner is also the team recorded in the objective column
df_1stwr = pd.DataFrame({
    column + ' (%)': [round(100 * int((df['winner'] == df[column]).sum()) / df.shape[0], 2)]
    for column in objective_columns
})

df_1stwr

These low values can be explained by how often neither team takes the baron/herald, so these numbers are misleading. Fixing it now:

In [None]:
# Taking 0 values out of the equation: only games where the objective column is
# non-zero are counted, both in the numerator and in the denominator
objective_names = df_1st.columns[df_1st.columns != 'winner']
df_1stwr = pd.DataFrame(columns=objective_names + ' (%)')

for column in objective_names:
    objective_taken = df[column] != 0
    taker_won = (df['winner'] == df[column]) & objective_taken
    df_1stwr.loc[0, column + ' (%)'] = round(
        100 * int(taker_won.sum()) / int(objective_taken.sum()), 2)

df_1stwr

I have an idea of some kind of visualization to prove the importance of each objective, i will graphically show three scenarios:
1) Enemy team taking the 1st objective


2) None of the team taking it



3) The actual team taking it

In [None]:
# Win percentage of each team in three scenarios per first objective.
# The objective column holds the number of the team that took it, or 0 when neither did.
scenario_stats = {}

for column in df_1st.columns[df_1st.columns != 'winner']:
    for winner in [1, 2]:
        scenarios = [
            (' (3rd)', 3 - winner),  # the other team took the objective
            (' (2nd)', 0),           # neither team took it
            (' (1st)', winner),      # this team took it
        ]
        for suffix, taken_by in scenarios:
            games = df[df[column] == taken_by]
            wins = int((games['winner'] == winner).sum())
            # Divide by all games of the scenario (previously the 2nd scenario divided
            # the wins by themselves); a scenario with no games is reported as 0
            scenario_stats[str(winner) + column + suffix] = (
                int(round(100 * wins / games.shape[0], 0)) if games.shape[0] else 0
            )

df_1stwr = pd.DataFrame([scenario_stats])

df_1stwr.columns


In [None]:
# Win percentage of each team in three scenarios per first objective.
# The objective column holds the number of the team that took it, or 0 when neither did.
scenario_stats = {}

for column in df_1st.columns[df_1st.columns != 'winner']:
    for winner in [1, 2]:
        scenarios = [
            # 1st scenario: the team wins the game but the other team takes it.
            # Matching the other team's number keeps "neither team" games out of
            # this bucket (the previous `!= winner` test included them).
            (' (3rd)', 3 - winner),
            # 2nd scenario: none of the teams take it and the team wins the game
            (' (2nd)', 0),
            # 3rd scenario: the team wins the game and takes the objective
            (' (1st)', winner),
        ]
        for suffix, taken_by in scenarios:
            games = df[df[column] == taken_by]
            wins = int((games['winner'] == winner).sum())
            # A scenario may have no games at all (e.g. no game without a first blood);
            # report 0 instead of dividing by zero
            scenario_stats['[' + str(winner) + ']' + column + suffix] = (
                int(round(100 * wins / games.shape[0], 0)) if games.shape[0] else 0
            )

df_1stwr = pd.DataFrame([scenario_stats])

df_1stwr


Maybe this was a bit confusing, i will try to graphically show that:

In [None]:
df_1stwr

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def single_objective_analysis(objective):
    """Draw a grouped bar chart of the win percentages stored for one objective.

    Reads the notebook-level ``df_1stwr`` frame, keeps the columns whose label
    contains ``objective`` and plots the first three of them as "Team 1" and the
    last three as "Team 2", using the first row of the frame.

    Parameters
    ----------
    objective : str
        Substring used to select columns of ``df_1stwr``; also used as x-axis label.
    """
    selected_columns = [label for label in df_1stwr.columns if objective in label]
    # Values of the first row for the selected columns
    percentages = df_1stwr[selected_columns].values[0]

    scenario_labels = ["Other team's", "Neither team's", "Actual team's"]
    positions = np.arange(len(scenario_labels))
    bar_width = 0.3

    fig, ax = plt.subplots(figsize=(7, 4))

    team_1_bars = ax.bar(positions - bar_width/2, percentages[:3], bar_width, label='Team 1', alpha=0.7)
    team_2_bars = ax.bar(positions + bar_width/2, percentages[-3:], bar_width, label='Team 2', alpha=0.7)

    ax.set_ylabel('Percentage')
    ax.set_xlabel(objective)
    ax.set_xticks(positions)
    ax.set_xticklabels(scenario_labels)
    ax.legend()

    # Write the percentage on top of every bar
    for bar_pair in zip(team_1_bars, team_2_bars):
        for bar in bar_pair:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, f'{int(height)}%', ha='center', va='bottom')

    plt.show()

This graph emphasizes the importance of objectives and also alerts teams that they should take them, since the odds of winning increase dramatically when comparing neither team taking an objective with one team taking it.

In [12]:
# Creating a single cell with every graph:

# The objective name is the text between "first" and " (" in each df_1stwr column label
column_names = df_1stwr.columns
values = [name.split("first")[1].split(" (")[0] for name in column_names]
# dict.fromkeys removes duplicates while keeping first-seen order, so the plots come
# out in the same order on every run (the iteration order of a set of strings can
# change between runs)
unique_values = list(dict.fromkeys(values))

# A plain loop: the calls are made for their plots, not for their return values
for obj in unique_values:
    single_objective_analysis(obj)

NameError: name 'df_1stwr' is not defined

Putting it all in a single cell:

In [None]:
def objective_analysis(ax, objective):
    """Draw the win-percentage bars of one objective on an existing axes.

    Reads the notebook-level ``df_1stwr`` frame, keeps the columns whose label
    contains ``objective`` and plots the first three of them as "Team 1" and the
    last three as "Team 2", using the first row of the frame.

    Parameters
    ----------
    ax : matplotlib axes
        Axes the bars and labels are drawn on.
    objective : str
        Substring used to select columns of ``df_1stwr``; also used as x-axis label.
    """
    selected_columns = [label for label in df_1stwr.columns if objective in label]
    # Values of the first row for the selected columns
    percentages = df_1stwr[selected_columns].values[0]

    scenario_labels = ["Other team's", "Neither team's", "Actual team's"]
    positions = np.arange(len(scenario_labels))
    bar_width = 0.3

    team_1_bars = ax.bar(positions - bar_width/2, percentages[:3], bar_width, label='Team 1', alpha=0.7)
    team_2_bars = ax.bar(positions + bar_width/2, percentages[-3:], bar_width, label='Team 2', alpha=0.7)

    ax.set_ylabel('Percentage')
    ax.set_xlabel(objective)
    ax.set_xticks(positions)
    ax.set_xticklabels(scenario_labels)

    # Write the percentage on top of every bar
    for bar_pair in zip(team_1_bars, team_2_bars):
        for bar in bar_pair:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, f'{int(height)}%', ha='center', va='bottom')

    # Same upper limit on every subplot
    ax.set_ylim(top=100)
  

# Create the subplot grid (no separate plt.figure() call: it only produced an extra
# empty figure next to the one created by plt.subplots)
num_plots = len(unique_values)
num_cols = 2
num_rows = (num_plots + num_cols - 1) // num_cols
# squeeze=False always returns a 2-D array of axes, so a single-row grid needs no special case
fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 10), squeeze=False)
flat_axes = axes.ravel()

# Fill the subplots with the graphs
for ax, objective in zip(flat_axes, unique_values):
    objective_analysis(ax, objective)

# Hide the cells of the grid that received no plot (odd number of objectives)
for ax in flat_axes[num_plots:]:
    ax.set_visible(False)

# One legend for the whole figure, built from the first subplot's bars
handles, legend_labels = flat_axes[0].get_legend_handles_labels()
fig.legend(handles, legend_labels, loc='lower center', ncol=2)

# Adjust the layout of the subplots, leaving a strip at the bottom for the legend
plt.tight_layout(rect=(0, 0.04, 1, 1))

# Show the plot
plt.show()



OBS: Winning without taking towers/inhibs makes sense since it can be a forfeit

### Correlation between objectives, and objective vs winner (incomplete)

In [None]:
# First step is to transform the two columns into binary variables
from sklearn.metrics import matthews_corrcoef

objective_pair = ['firstTower', 'firstBlood']

# Keep only games where both objectives were taken by some team: a 0 means neither
# team took it, and it must not be merged with "team 2" by the recoding below
both_taken = df[objective_pair].ne(0).all(axis=1)

# Team 1 stays 1, team 2 becomes 0
df_binary = df.loc[both_taken, objective_pair].replace({2: 0})

# Compute the Matthews correlation between the two binary variables
correlation = matthews_corrcoef(df_binary['firstTower'], df_binary['firstBlood'])

print(correlation)

In [None]:
chi2

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Categorical columns compared pairwise: the winner and every first objective
data = df[['winner', 'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']]

# Calculating the chi-square statistic for each pair of variables
num_vars = data.shape[1]
chi2_matrix = np.zeros((num_vars, num_vars))

for i in range(num_vars):
    # The statistic does not depend on the order of the pair, so compute each pair
    # once and mirror it
    for j in range(i, num_vars):
        contingency_table = pd.crosstab(data.iloc[:, i], data.iloc[:, j])
        # Index instead of unpacking into `_`, which IPython uses for the last output
        chi2 = chi2_contingency(contingency_table)[0]
        chi2_matrix[i, j] = chi2
        chi2_matrix[j, i] = chi2

# Creating the heatmap using seaborn, labelled with the variable names so the
# cells can be read without counting positions
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(chi2_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax,
                      xticklabels=data.columns, yticklabels=data.columns)
ax.set_title('Chi-square statistic between winner and first objectives')

plt.show()


In [None]:
# Create a matrix of: winner, firstBlood, firstTower, firstInhib, firstBaron, firstDragon, firstHerald

In [None]:
data

In [None]:
# Tower kills of the winning and of the losing team of every game, stacked into one
# frame with a 0/1 `winner` flag, then correlated with that flag
team_1_won = df['winner'] == 1
team_2_won = df['winner'] == 2

# The concatenated Series has no name, so to_frame() labels its column 0
winner_towers = (
    pd.concat([df.loc[team_1_won, 't1_towerKills'], df.loc[team_2_won, 't2_towerKills']])
    .to_frame()
    .assign(winner=1)
)
loser_towers = (
    pd.concat([df.loc[team_1_won, 't2_towerKills'], df.loc[team_2_won, 't1_towerKills']])
    .to_frame()
    .assign(winner=0)
)

teste = pd.concat([winner_towers, loser_towers])

correlation = np.corrcoef(teste[0], teste['winner'])[0, 1]

print(correlation)

### Starting to model: 
1) Predict the winning team with every piece of information we have at our disposal



2) It would be fun (especially for gamblers) to predict the winner with the minimal information available. I will do it using only team picks, bans, first blood, and first tower, as these pieces of information can be seen beforehand.

Removing irrelevant columns and separating features from target (winner)

In [None]:
# Reassign instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError
df = df.drop(columns=['gameId', 'creationTime', 'seasonId'], errors='ignore')

In [None]:
# Summoner-spell columns are the ones with 'sum' in their name
summoner_columns = [column for column in df.columns if 'sum' in column]
picks_bans = champs_t1 + champs_t2 + bans_t1 + bans_t2

# Helper columns added during the analysis. comp_1/comp_2 hold Python lists and were
# previously left in `features`; errors='ignore' tolerates helpers that are absent.
helper_columns = ['picks', 'bans', 'comp', 'win/lose', 'comp_1', 'comp_2']
features = df.drop(columns=['winner'] + helper_columns + picks_bans + summoner_columns,
                   errors='ignore')
target = df['winner']


I personally don't like the following approach, since different pick orders will be interpreted differently, but I'm curious to see the difference between a model with this problem and one without it (for now, I have thought about sorting the picks).

In [None]:
# Target and features taken from df_model, the copy of df made before any filtering
target = df_model['winner']
# `summoner_columns` is the list built in the previous cell (the name `sum_columns`
# used here before is not defined anywhere)
features = df_model.drop(['seasonId','winner','gameId','creationTime']+bans_t1+bans_t2+summoner_columns, axis=1)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


def model_and_results(features, target, random_state=42):
    """Train a random forest on an 80/20 split and print its evaluation report.

    Prints the confusion matrix, accuracy, precision, recall and F1 score on the
    test split, followed by the feature importances sorted in descending order.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; its column names label the importances.
    target : array-like
        Class labels. The precision/recall/F1 calls use scikit-learn's default
        settings. NOTE(review): those defaults presumably treat label 1 as the
        positive class - confirm against the scikit-learn documentation.
    random_state : int, default 42
        Seed used for both the train/test split and the forest, so repeated calls
        print the same numbers. The split previously used 42 as well; the forest
        was unseeded.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, y_pred))

    accuracy = accuracy_score(y_test, y_pred)
    print("Acurácia:", accuracy)

    precision = precision_score(y_test, y_pred)
    print("Precisão:", precision)

    recall = recall_score(y_test, y_pred)
    print("Recall:", recall)

    f1 = f1_score(y_test, y_pred)
    print("F1-Score:", f1)

    # Build a DataFrame to inspect the feature importances, most important first
    importance_df = pd.DataFrame({'Feature': features.columns,
                                  'Importance': model.feature_importances_})
    importance_df = importance_df.sort_values('Importance', ascending=False)

    # Print the feature importances
    print('\n\n',importance_df)


Results are incredible! But let's dig a little more. I want to sort the picks:

In [None]:
df_model.copy()

In [None]:
df_model[champs_t1].apply(lambda x: x[:5])

In [None]:
# Sort each team's champion ids within every game so that pick order no longer matters.
# np.sort works row-wise on the id matrix, so no intermediate list columns are needed
# and df_model itself is left untouched.
split_1 = pd.DataFrame(np.sort(df_model[champs_t1].to_numpy(), axis=1),
                       columns=champs_t1, index=df_model.index)
split_2 = pd.DataFrame(np.sort(df_model[champs_t2].to_numpy(), axis=1),
                       columns=champs_t2, index=df_model.index)

split_sorted = pd.concat([split_1, split_2], axis=1)

# Everything that is not a feature, plus the unsorted pick columns that are replaced by
# split_sorted. errors='ignore' covers helper columns that are not in df_model (dropping
# 'picks' used to raise KeyError because df_model was copied before it was added).
non_feature_columns = (
    ['gameId', 'creationTime', 'winner', 'seasonId', 'picks', 'comp_1', 'comp_2']
    + champs_t1 + champs_t2 + bans_t1 + bans_t2 + summoner_columns
)
sorted_features = pd.concat(
    [df_model.drop(columns=non_feature_columns, errors='ignore'), split_sorted],
    axis=1,
)

In [None]:
# Make sure no helper column is left; errors='ignore' keeps the cell re-runnable
# when they have already been removed
sorted_features = sorted_features.drop(columns=['picks','comp_1','comp_2'], errors='ignore')

In [None]:
# Column list without the helper columns; errors='ignore' avoids a KeyError when
# they are already absent
sorted_features.drop(columns=['picks','comp_1','comp_2'], errors='ignore').columns

In [None]:
split_sorted.columns

In [None]:
sorted_features

In [None]:
sorted_features

In [None]:
sorted_features.columns

In [None]:
sorted_features.dtypes

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(sorted_features, target, test_size=0.2, random_state=42)

# Seed the forest as well, so the metrics of the next cell are reproducible
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
# Store the result under its own name: assigning it to `confusion_matrix` replaced the
# imported scikit-learn function, so every later call to confusion_matrix(...)
# (including the one inside model_and_results) failed
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matriz de Confusão:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("Acurácia:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precisão:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

In [None]:
sorted_win_prediction = sorted_features[champs_t1+champs_t2]

In [None]:
print('Apenas picks de cada time (em ordem)')

model_and_results(sorted_win_prediction,target)

In [None]:
print('Picks with first main objectives')
model_and_results(df_model[champs_t1+champs_t2+['firstBlood','firstTower','firstDragon','firstRiftHerald']],target)

In [None]:
# The label now names the features actually used here (only first blood and first tower)
print('Picks with first blood and first tower')

model_and_results(df_model[champs_t1+champs_t2+['firstBlood','firstTower']],target)

That's a pretty decent result. With only picks, first blood and first tower, we get 71% accuracy when predicting the winner. Let me check whether bans are relevant.

In [None]:
# The label now mentions the bans, which are part of this feature set
print('Picks and bans with first blood and first tower')
model_and_results(df_model[champs_t1+champs_t2+bans_t1+bans_t2+['firstBlood','firstTower']],target)

It isnt that relevant, but if we have the data then why not?