##### Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf


# Set pandas' 'display.max_columns' option to None so wide frames are shown
# without column truncation (per the pandas options docs, None means no limit).
pd.set_option('display.max_columns', None)

##### Importing datasets

In [None]:
# One CSV per year, newest first; each file is read into its own DataFrame.
yearly_csv_paths = (
    'Data/2023.csv',
    'Data/2022.csv',
    'Data/2021.csv',
    'Data/2020.csv',
)
df_2023, df_2022, df_2021, df_2020 = map(pd.read_csv, yearly_csv_paths)

In [None]:
# Stack the yearly frames (2023 first, 2020 last) into a single frame with a
# fresh 0..n-1 index.
yearly_frames = [df_2023, df_2022, df_2021, df_2020]
df = pd.concat(yearly_frames, ignore_index=True)

### Cleaning up data for pre-processing

In [None]:
# Keep only rows flagged as complete whose position is 'team'
is_complete = df['datacompleteness'] == 'complete'
is_team_row = df['position'] == 'team'
df = df[is_complete & is_team_row]

# Discard the columns that are not used further on
cols_to_drop = [
    'datacompleteness', 'url', 'gameid', 'league', 'split', 'playoffs',
    'date', 'game', 'participantid', 'position', 'playername', 'playerid',
    'teamid', 'champion', 'firstbloodassist', 'firstbloodkill',
    'firstbloodvictim', 'dragons (type unknown)', 'damageshare',
    'earnedgoldshare', 'total cs', 'monsterkillsownjungle',
    'monsterkillsenemyjungle', 'teamname', 'year', 'patch', 'teamkills',
    'teamdeaths',
]
df = df.drop(columns=cols_to_drop)

# Missing bans become the 'UNKNOWN' label; a missing firstmidtower becomes 0
ban_columns = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5']
fill_values = {ban_col: 'UNKNOWN' for ban_col in ban_columns}
fill_values['firstmidtower'] = 0
df = df.fillna(fill_values)

# Distinct ban values in order of first appearance (ban1 rows first, then
# ban2, ... ban5), including 'UNKNOWN' when any ban was missing
champions = pd.concat([df[ban_col] for ban_col in ban_columns]).unique()

# Encode each ban value as its position in `champions`
value_to_index = dict(zip(champions, range(len(champions))))
for ban_col in ban_columns:
    df[ban_col] = df[ban_col].map(value_to_index)

# Encode the side as 0 (Blue) / 1 (Red); any other value maps to NaN
side_codes = {'Blue': 0, 'Red': 1}
df['side'] = df['side'].map(side_codes)


In [None]:
# Separate the features from the target column
X = df.drop(columns=['result'])
y = df['result']

# Split BEFORE scaling so the scaler never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 'side' and the ban columns hold integer codes rather than magnitudes, so
# they are left unscaled. The target ('result') is not part of X and is
# therefore never scaled either.
columns_to_exclude_scaling = ['side', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5']
columns_to_scale = X.columns.difference(columns_to_exclude_scaling)

# Fit the scaler on the training split only, then apply the same transform to
# the test split. Previously the scaler was fit on `df` after `X` had already
# been copied out of it, so the scaled values never reached X_train/X_test and
# the fit used every row (train and test).
# `X` and `df` are intentionally left unscaled.
scaler = MinMaxScaler()
X_train = X_train.copy()  # explicit copies: avoid writing into slices of X
X_test = X_test.copy()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [None]:
# Rank features with a gradient-boosting classifier.
# It is fit on the training split only, so the held-out rows do not influence
# which features get selected, and it is seeded so the ranking is reproducible.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Importance score per feature, aligned with the column order of X_train
feature_importance = model.feature_importances_

# Positions of the 10 largest importances, in ascending order of importance
top_indices = np.argsort(feature_importance)[-10:]

# Matching feature names and importance values
top_features = X_train.columns[top_indices]
top_importance = feature_importance[top_indices]

# Bar chart of the top 10 features
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_features, top_importance)
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Top 10 Features')
plt.show()

In [None]:
# Column names of the 10 highest-importance features (ascending importance)
top_10_features = X.columns[np.argsort(feature_importance)[-10:]].tolist()

# Restrict both splits to the selected columns
X_train, X_test = X_train[top_10_features], X_test[top_10_features]

### Building a Neural Network 

In [None]:
def build_sequential_network(input_dim=None):
    """Build and compile a fully connected binary classifier.

    The network is a stack of ReLU Dense layers (128-64-64-32-16) with
    dropout after the first four, followed by a single sigmoid unit.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_train`` is used, which matches the original
            zero-argument behaviour.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = tf.keras.Sequential([
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='relu'),
        # binary_crossentropy expects a probability in [0, 1]; a ReLU output
        # is unbounded above and can get stuck at exactly 0, so the output
        # unit must be a sigmoid.
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Build the network and train it.
# `use_multiprocessing` / `workers` were dropped: they only affect
# generator/Sequence inputs (not in-memory data) and are rejected by
# Model.fit in Keras 3.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent hold-out estimate if they are used
# to tune the model.
model = build_sequential_network()
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    shuffle=True,
    verbose=1,
)

# Plot the loss and accuracy curves for training and validation
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].plot(history.history['loss'], label='Train Loss')
ax[0].plot(history.history['val_loss'], label='Validation Loss')
ax[0].set_xlabel('Epoch')
ax[0].set_ylabel('Loss')
ax[0].set_title('Loss')
ax[0].legend()

ax[1].plot(history.history['accuracy'], label='Train Accuracy')
ax[1].plot(history.history['val_accuracy'], label='Validation Accuracy')
ax[1].set_xlabel('Epoch')
ax[1].set_ylabel('Accuracy')
ax[1].set_title('Accuracy')
ax[1].legend()

plt.show()