In [69]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense


# IMPORT DATA

In [3]:
folder_path = "data/Match_Diff"

In [23]:
data_dict = {}

# Iterate over files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        
        # Extract the key from the filename
        key = filename.replace("Match_Diff_", "").replace(".csv", "")
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Delete the first column from the DataFrame
        df = df.drop(df.columns[0], axis=1)
        
        # Add the DataFrame to the dictionary with the key
        data_dict[key] = df

# STARTS TIMBA MODEL

In [24]:
def timba(all_events_diff):

    # Get columns that start with 'monsterType'
    monster_type_cols = all_events_diff.filter(like='monsterType', axis=1).columns.tolist()

    # Get columns that start with 'towerType'
    tower_type_cols = all_events_diff.filter(like='towerType', axis=1).columns.tolist()

    # Get columns that start with 'killType'
    kill_type_cols = all_events_diff.filter(like='killType', axis=1).columns.tolist()

    # Combine the columns into a single list
    all_cols = monster_type_cols + tower_type_cols + kill_type_cols + ["target","minionsKilled","totalGold"]

    all_df = all_events_diff[all_cols]

    return all_df

data_dict_timba={}

for key, df in data_dict.items():
    
    data_dict_timba[key] = timba(data_dict[key])
    


In [50]:
def duplicates(df):
    print(f"Duplicates droped: {df.duplicated().sum()}")

    df = df.drop_duplicates()

    return df

def scale(df):
    # Standard or Robust if there are many outliers
    scaler = RobustScaler()
    
    scaled_array = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(scaled_array, columns=df.columns)
    
    return scaled_df

def preprop(df):
    
    #df = duplicates(df)
    
    df = scale(df)
    
    return pd.DataFrame(df)

In [51]:
data_preprop = {}

for key, df in data_dict_timba.items():
    
    X = data_dict_timba[key].drop(columns="target")
    
    y = data_dict_timba[key]["target"]

    X_preprop = preprop(X)
    
    data_preprop[key] = [X_preprop,y]

In [52]:
data_preprop["IRON"][0]

Unnamed: 0,monsterType_WATER_DRAGON,monsterType_FIRE_DRAGON,monsterType_RIFTHERALD,monsterType_AIR_DRAGON,monsterType_EARTH_DRAGON,monsterType_CHEMTECH_DRAGON,monsterType_HEXTECH_DRAGON,towerType_OUTER_TURRET,towerType_INNER_TURRET,killType_KILL_FIRST_BLOOD,killType_KILL_MULTI,killType_KILL_ACE,minionsKilled,totalGold
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,-0.244444,-1.026975
1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.088889,-0.542698
2,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.577778,0.505347
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.155556,0.428412
4,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,-1.266667,-1.245331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4730,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.400000,0.351476
4731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733333,0.587709
4732,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.155556,-0.245491
4733,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.711111,0.519713


# LOG MODEL

In [53]:
#Simple model to check features

for key, value in data_preprop.items():
    
    print(key)
    
    X_preprop = data_preprop[key][0]
    y = data_preprop[key][1]
    
    # Instantiate model
    log_reg = LogisticRegression(max_iter=10000)

    # Scoring on multiple folds aka Cross Validation
    scores = cross_val_score(log_reg, X_preprop, y, cv=10)
    print(f"Score:{scores.mean()}")

    # Fit model
    log_model = LogisticRegression().fit(X_preprop, y)

    # Performs Permutation
    permutation_score = permutation_importance(log_model, X_preprop, y, n_repeats=10)

    # Unstack results showing the decrease in performance after shuffling features
    importance_df = pd.DataFrame(np.vstack((X_preprop.columns,
                                            permutation_score.importances_mean)).T)
    importance_df.columns=['feature','score decrease']

    # Show the important features
    print(importance_df.sort_values(by="score decrease", ascending = False))

IRON
Score:0.7294747593687835
                        feature score decrease
13                    totalGold       0.174044
4      monsterType_EARTH_DRAGON       0.009229
12                minionsKilled       0.007202
1       monsterType_FIRE_DRAGON       0.005681
0      monsterType_WATER_DRAGON       0.005322
6    monsterType_HEXTECH_DRAGON       0.004731
3        monsterType_AIR_DRAGON       0.002196
5   monsterType_CHEMTECH_DRAGON       0.001985
2        monsterType_RIFTHERALD       0.001436
9     killType_KILL_FIRST_BLOOD       0.000106
7        towerType_OUTER_TURRET       0.000063
8        towerType_INNER_TURRET       0.000042
11            killType_KILL_ACE      -0.000021
10          killType_KILL_MULTI       -0.00076
PLATINUM
Score:0.715418917760055
                        feature score decrease
13                    totalGold       0.146082
12                minionsKilled       0.007361
1      monsterType_EARTH_DRAGON       0.005033
4       monsterType_FIRE_DRAGON       0.0049

In [68]:
#Look for the best params and export the model with these

for key, value in data_preprop.items():
    print(key)
    
    X_preprop = data_preprop[key][0]
    y = data_preprop[key][1]
    
    X_train, X_test, y_train, y_test = train_test_split(X_preprop, y, test_size=0.2, random_state=42)

    # Define the hyperparameter grid
    param_grid = {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }

    # Create a logistic regression model
    model = LogisticRegression(max_iter=5000)

    # Create a grid search object
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5
    )

    # Perform the grid search
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and best score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)
    
    # Fit the model with the best hyperparameters
    best_model = LogisticRegression(
        C=grid_search.best_params_['C'],
        penalty=grid_search.best_params_['penalty'],
        solver=grid_search.best_params_['solver'],
        max_iter=5000
    )
    best_model.fit(X_train, y_train)

    # Export the model as a pickle file. Uncoment if you want
    """
    filename = key + '_model.pkl'
    with open(filename, 'wb') as file:
        pickle.dump(best_model, file)
    print("Model exported as", filename)
    """

IRON
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.7357476220185917
PLATINUM
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.7182283722275502
SILVER
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score:  0.7126429818155621
GOLD
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score:  0.7137599820730117
BRONZE
Best Hyperparameters:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.7025020061905308
CHALLENGER
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.7233133473241978
MASTER
Best Hyperparameters:  {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best Score:  0.7052763483570029
GRANDMASTER
Best Hyperparameters:  {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Score:  0.7123748584985016
DIAMOND
Best Hyperparameters:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score:  0.70438190

# DEEP LEARNING

In [60]:
X_preprop = data_preprop["IRON"][0]
y = data_preprop["IRON"][1]
    
X_train, X_test, y_train, y_test = train_test_split(X_preprop, y, test_size=0.2, random_state=42)

In [65]:
# Create a Sequential model
model = Sequential()

# Add a dense layer with ReLU activation as the hidden layer

model.add(layers.Dense(10, activation='relu', input_dim=X_train.shape[1]))

model.add(layers.Dense(5, activation='relu'))

model.add(layers.Dense(5, activation='relu'))


# Add another dense layer with ReLU activation as the output layer
model.add(layers.Dense(1, activation='sigmoid')) 

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy', 
    metrics = 'accuracy')

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78