In [1]:
import pandas as pd
import numpy as np

economy_data = pd.read_csv('data/economy.csv')
picks_data = pd.read_csv('data/picks.csv')
players_data = pd.read_csv('data/players.csv')
results_data = pd.read_csv('data/results.csv')

  economy_data = pd.read_csv('data/economy.csv')


In [2]:
merged_data = pd.merge(results_data, picks_data, on=['date', 'match_id', 'team_1', 'team_2'])

# print(merged_data.head())

In [3]:
# verify that we have no missing/null data in columns
missing_data = merged_data.isnull().sum()
print(missing_data[missing_data > 0])

Series([], dtype: int64)


In [4]:
# find all the columns which may be problematic while training a model
object_columns = merged_data.dtypes[merged_data.dtypes == 'object'].index
print(object_columns)
for column in object_columns:
    print(f"{column}:\n{merged_data[column].head()}\n")

Index(['date', 'team_1', 'team_2', '_map', 'best_of', 't1_removed_1',
       't1_removed_2', 't1_removed_3', 't2_removed_1', 't2_removed_2',
       't2_removed_3', 't1_picked_1', 't2_picked_1', 'left_over'],
      dtype='object')
date:
0    2020-03-18
1    2020-03-18
2    2020-03-18
3    2020-03-17
4    2020-03-17
Name: date, dtype: object

team_1:
0    New England Whalers
1                Rugratz
2                Rugratz
3            Singularity
4            Singularity
Name: team_1, dtype: object

team_2:
0          Station7
1    Bad News Bears
2    Bad News Bears
3          Endpoint
4          Endpoint
Name: team_2, dtype: object

_map:
0     Inferno
1     Inferno
2     Vertigo
3    Overpass
4     Vertigo
Name: _map, dtype: object

best_of:
0    1
1    3
2    3
3    3
4    3
Name: best_of, dtype: object

t1_removed_1:
0    Mirage
1     Dust2
2     Dust2
3     Train
4     Train
Name: t1_removed_1, dtype: object

t1_removed_2:
0     Dust2
1      Nuke
2      Nuke
3    Mirage
4    Mirag

In [5]:
from sklearn.preprocessing import LabelEncoder
team_encoder = LabelEncoder()
unique_teams = list(set(np.concatenate((merged_data['team_1'].unique(), merged_data['team_2'].unique()), axis=None)))
team_encoder.fit(unique_teams)

merged_data['team_1'] = team_encoder.transform(merged_data['team_1'])
merged_data['team_2'] = team_encoder.transform(merged_data['team_2'])

In [6]:
# encode the categorical variables
# these are the string/id columns
# use one-hot encoding for _map because there aren't many unique values
# use ordinal encoding for team_1 and team_2 to avoid increasing dimensionality too much

# from sklearn.preprocessing import OrdinalEncoder

# encoder = OrdinalEncoder()
# merged_data[['team_1', 'team_2']] = encoder.fit_transform(merged_data[['team_1', 'team_2']])

# For _map column, we can use one-hot encoding as before
merged_data = pd.get_dummies(merged_data, columns=['_map', 'best_of', 't1_removed_1', 't1_removed_2', 't1_removed_3',
       't2_removed_1', 't2_removed_2', 't2_removed_3', 't1_picked_1',
       't2_picked_1', 'left_over'])

In [7]:
from datetime import datetime

def convert_date_to_ordinal(date):
    return datetime.strptime(date, "%Y-%m-%d").toordinal()

# Convert the date column to a numeric format
merged_data['date'] = merged_data['date'].apply(convert_date_to_ordinal)

In [8]:
# ensure there are no categorical variables left, which would fail our model
object_columns = merged_data.dtypes[merged_data.dtypes == 'object'].index
print(object_columns)
for column in object_columns:
    print(f"{column}:\n{merged_data[column].head()}\n")

Index([], dtype='object')


In [9]:
from sklearn.model_selection import train_test_split

# We'll try to predict the 'match_winner' column
X = merged_data.drop('match_winner', axis=1)
y = merged_data['match_winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# train the model on logistic regression and get the accuracy
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

In [11]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

print("Model accuracy: ", accuracy_score(y_test, y_pred))

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate and print ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC: {roc_auc}')

Model accuracy:  0.6557436082102989
              precision    recall  f1-score   support

           1       0.70      0.61      0.65      1470
           2       0.62      0.70      0.66      1307

    accuracy                           0.66      2777
   macro avg       0.66      0.66      0.66      2777
weighted avg       0.66      0.66      0.66      2777

ROC-AUC: 0.6584560373499055


In [12]:
# figure out which team we predicted most accurately for
y_all = model.predict(X)
data_with_preds = merged_data.copy()
data_with_preds['predictions'] = y_all
data_with_preds['team_1'] = team_encoder.inverse_transform(data_with_preds[['team_1']])
data_with_preds['team_2'] = team_encoder.inverse_transform(data_with_preds[['team_2']])

team_accuracies = {}
teams = pd.concat([data_with_preds['team_1'], data_with_preds['team_2']]).unique()

for team in teams:
    team_games = data_with_preds[(data_with_preds['team_1'] == team) | (data_with_preds['team_2'] == team)]
    if len(team_games) < 5:
        continue
    correct_predictions = team_games[team_games['match_winner'] == team_games['predictions']]
    accuracy = len(correct_predictions) / len(team_games)
    team_accuracies[team] = accuracy

best_team = max(team_accuracies, key=team_accuracies.get)
num_best_team_games = len(data_with_preds[(data_with_preds['team_1'] == best_team) | (data_with_preds['team_2'] == best_team)])
print(f'The team with the highest prediction accuracy is {best_team} with an accuracy of {team_accuracies[best_team]} over {num_best_team_games} games')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


The team with the highest prediction accuracy is ex-eUnited with an accuracy of 1.0 over 7 games
