In [1]:
import pandas as pd
import datetime
import numpy as np
from dateutil import tz
import matplotlib.pyplot as plt
import os
import seaborn as sns
import math
import itertools
import my_transformers
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

# columns in the data frame ["brawler", "trophies", "power", "highestTotalTrophies", "totalTrophies", "exp", "highestPowerPlay", "3vs3Victories", "soloVictories", "duoVictories", "highestBrawlerTrophies"]

# Load the cleaned battle logs and keep only decided games (draws are discarded).
battle_logs = pd.read_csv("data/cleaned_data.csv")
battle_logs = battle_logs.loc[battle_logs["result"] != "draw"]

# Binary target as a plain list: 1 for "victory", 0 for anything else.
y = (battle_logs["result"] == "victory").astype(int).tolist()

# The label must not stay among the features.
battle_logs = battle_logs.drop(columns = "result")

# Reduce the frame to the columns chosen by the project's ColumnSelector.
simple_battle_logs = my_transformers.ColumnSelector().fit_transform(battle_logs)

# Hold out 20% of the games as the final test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    simple_battle_logs,
    y,
    test_size = 0.2,
    random_state = 42,
)
# Optional export of the held-out set (disabled):
# X_test.to_csv("data/X_test.csv", index = False)
# pd.Series(y_test).to_csv("data/y_test.csv", index = False)

## Developing some baseline measurements

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score

import tensorflow as tf
from tensorflow import keras

In [4]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numeric. Column order follows X_train.columns.
is_object_column = {
    col_name: simple_battle_logs[col_name].dtype == 'O'
    for col_name in X_train.columns
}
categorical_columns = [name for name, is_object in is_object_column.items() if is_object]
numeric_columns = [name for name, is_object in is_object_column.items() if not is_object]

In [35]:
# Baseline 1: this model drops all the brawler information and only takes the
# numeric (trophy) columns into account.
#
# NOTE(review): the preprocessing is fit on all of X_train *before*
# cross_val_score splits it, so the imputer/scaler statistics also see the
# held-out folds. Wrapping preprocess_pipe and each estimator in one Pipeline
# would remove that leak -- confirm the custom transformers support cloning first.

numeric_pipe = Pipeline([
    ('imputer', my_transformers.SpecialImputer()), # impute missing values by taking the median of the other players in that game
    ('simple_imputer', SimpleImputer(strategy = "median")), # if there are other missing values fill them in
    ('std_scaler', StandardScaler())
])

categorical_pipe = Pipeline([
    ('drop_categorical_columns', my_transformers.DropCategoricalColumns()) # drop the original categorical variable
])

preprocess_pipe = ColumnTransformer([
    ("numeric_pipe", numeric_pipe, numeric_columns),
    ("categorical_pipe", categorical_pipe, categorical_columns)
])

pre_X_train = preprocess_pipe.fit_transform(X_train)

models = [
    ("sgd", SGDClassifier(random_state = 42)),
    ("svc", SVC()),
    ("rf", RandomForestClassifier(random_state = 42)), # seeded like sgd so the score table is reproducible
    ("knn", KNeighborsClassifier()),
    ("lr", LogisticRegression())
]
# Hard-voting ensemble over the individual models above.
all_models = models + [("ensmble", VotingClassifier(models, voting="hard"))]

# One column of 3-fold accuracy scores per model.
results = {
    name : cross_val_score(estimator, pre_X_train, y_train, cv = 3, scoring = "accuracy")
    for name, estimator in all_models
}
df = pd.DataFrame(results)
df

Unnamed: 0,sgd,svc,rf,knn,lr,ensmble
0,0.646034,0.654172,0.653714,0.603508,0.653026,0.654172
1,0.64542,0.663762,0.658604,0.611716,0.657687,0.66147
2,0.648974,0.663075,0.663075,0.602545,0.668463,0.666858


In [5]:
# Baseline 2: in addition to the numeric columns, this model uses the
# categorical (brawler) columns, which are simply one-hot encoded.
#
# NOTE(review): as in the previous baseline, the preprocessing is fit on all of
# X_train before cross-validation, so fold statistics are not fully independent.

numeric_pipe = Pipeline([
    ('imputer', my_transformers.SpecialImputer()), # impute missing values by taking the median of the other players in that game
    ('simple_imputer', SimpleImputer(strategy = "median")), # if there are other missing values fill them in
    ('std_scaler', StandardScaler())
])

categorical_pipe = Pipeline([
    ('one_hot_encoder', my_transformers.MyOneHotEncoder()), # one hot encode the brawlers of each player
    ('drop_categorical_columns', my_transformers.DropCategoricalColumns()) # drop the original categorical variable
])

preprocess_pipe = ColumnTransformer([
    ("numeric_pipe", numeric_pipe, numeric_columns),
    ("categorical_pipe", categorical_pipe, categorical_columns)
])

pre_X_train = preprocess_pipe.fit_transform(X_train)

# max_iter is raised for the linear models on this wider feature matrix.
models = [
    ("sgd", SGDClassifier(random_state = 42, max_iter = 10000)),
    ("svc", SVC()),
    ("rf", RandomForestClassifier(random_state = 42)), # seeded like sgd so the score table is reproducible
    ("knn", KNeighborsClassifier()),
    ("lr", LogisticRegression(max_iter = 10000))
]
# Hard-voting ensemble over the individual models above.
all_models = models + [("ensmble", VotingClassifier(models, voting="hard"))]

# One column of 3-fold accuracy scores per model.
results = {
    name : cross_val_score(estimator, pre_X_train, y_train, cv = 3, scoring = "accuracy")
    for name, estimator in all_models
}
df = pd.DataFrame(results)
df

Unnamed: 0,sgd,svc,rf,knn,lr,ensmble
0,0.665864,0.674347,0.659101,0.604081,0.673544,0.672169
1,0.669494,0.678895,0.664336,0.612977,0.682907,0.684054
2,0.666743,0.6758,0.665941,0.609767,0.676487,0.681532


## Optimizing Models

In [33]:
# Artificially increase the number of training samples (72-fold according to the
# original note) with the project's DataImputer. A validation part is split off
# first, so only the training part is augmented.
X, X_val, y, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

# DataImputer receives the label as a "result" column. assign() returns a new
# frame, so nothing is written into the slice produced by train_test_split
# (the in-place column assignment raised SettingWithCopyWarning).
X = X.assign(result = y)
X, y = my_transformers.DataImputer().fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  X = pd.concat(all_Xs)


In [5]:
# Reload previously cached arrays from disk instead of recomputing them.
# NOTE(review): this rebinds X, y, pre_X_val and y_val to plain ndarrays, while
# the following cell calls preprocess_pipe.fit_transform(X) with name-based
# column selection -- confirm which of the two cells is meant to run.
X, y, pre_X_val, y_val = (
    np.array(pd.read_csv(csv_path))
    for csv_path in (
        "data/pre_X_imputed.csv",
        "data/pre_y_imputed.csv",
        "data/pre_X_val.csv",
        "data/pre_y_val.csv",
    )
)

In [34]:
# Preprocessing for the augmented training data: impute + scale the numeric
# columns, one-hot encode the brawler columns.
numeric_pipe = Pipeline(steps = [
    # impute missing values by taking the median of the other players in that game
    ('imputer', my_transformers.SpecialImputer()),
    # if there are other missing values fill them in
    ('simple_imputer', SimpleImputer(strategy = "median")),
    ('std_scaler', StandardScaler()),
])

categorical_pipe = Pipeline(steps = [
    # one hot encode the brawlers of each player
    ('one_hot_encoder', my_transformers.MyOneHotEncoder()),
    # drop the original categorical variable
    ('drop_categorical_columns', my_transformers.DropCategoricalColumns()),
])

preprocess_pipe = ColumnTransformer(transformers = [
    ("numeric_pipe", numeric_pipe, numeric_columns),
    ("categorical_pipe", categorical_pipe, categorical_columns),
])

# Fit on the training part only; the validation part is transformed with the
# already-fitted pipeline.
pre_X = preprocess_pipe.fit_transform(X)
pre_X_val = preprocess_pipe.transform(X_val)

# Labels as ndarrays for the estimators used further down.
y, y_val = np.array(y), np.array(y_val)

In [17]:
# Cache the preprocessed matrices and labels on disk.
# index = False keeps the row index out of the files, so reading them back with
# pd.read_csv yields exactly the stored columns (no extra "Unnamed: 0" column).
pd.DataFrame(pre_X).to_csv("data/pre_X.csv", index = False)
pd.DataFrame(y).to_csv("data/y.csv", index = False)
pd.DataFrame(pre_X_val).to_csv("data/pre_X_val.csv", index = False)
pd.DataFrame(y_val).to_csv("data/y_val.csv", index = False)

In [0]:
# Estimators trained on the augmented data; the random forest entry is kept
# below but disabled for this run.
models = [
    ("sgd", SGDClassifier(max_iter = 10000, random_state = 42)),
    # ("rf", RandomForestClassifier()),
    ("lr", LogisticRegression(max_iter = 10000)),
]
# Hard-voting ensemble built from the same (name, estimator) pairs.
all_models = [*models, ("ensmble", VotingClassifier(models, voting="hard"))]

# Fit every model on the full preprocessed training set; fit() hands back the
# estimator, so each pair ends up holding the fitted model.
all_trained_models = []
for name, estimator in all_models:
    all_trained_models.append((name, estimator.fit(pre_X, y)))



In [37]:
from sklearn.metrics import accuracy_score

# Validation accuracy of every fitted model as (name, accuracy) pairs.
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
all_trained_models_accuracy = [
    (name, accuracy_score(y_val, fitted_model.predict(pre_X_val)))
    for name, fitted_model in all_trained_models
]

In [38]:
# Show the (model name, validation accuracy) pairs.
all_trained_models_accuracy

[('sgd', 0.6820787160871227),
 ('lr', 0.6878104700038211),
 ('ensmble', 0.6839893007260222)]

### Neural Net

In [18]:
# Without trying to impute any more data; try to get a neural net to work as well as the SVM.
# Layers: input -> Dense(228) -> Dense(448, relu) -> Dense(1, sigmoid).
# NOTE(review): the first Dense layer is given no activation -- confirm that is intended.

# Take the input width from the preprocessed matrix instead of hard-coding it
# (228 columns in the recorded run), so the cell survives feature-set changes.
n_features = pre_X.shape[1]

model1 = keras.models.Sequential([
    keras.layers.Dense(228, input_shape = (n_features,)),
    keras.layers.Dense(448, activation = "relu"),
    # keras.layers.Dense(114, activation = "relu"),
    keras.layers.Dense(1, activation = "sigmoid")
])
# Binary classification: sigmoid output with binary cross-entropy loss.
model1.compile(loss="binary_crossentropy",
              optimizer = "sgd",
              metrics=["accuracy"])

history = model1.fit(pre_X, y, epochs = 5, validation_data = (pre_X_val, y_val))

Train on 1507392 samples, validate on 5234 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Without trying to impute any more data; try to get a neural net to work as well as the SVM.
# Layers: input -> Dense(228) -> Dense(114, relu) -> Dense(1, sigmoid).
# NOTE(review): the first Dense layer is given no activation -- confirm that is intended.

# Take the input width from the preprocessed matrix instead of hard-coding it
# (228 columns in the recorded run), so the cell survives feature-set changes.
n_features = pre_X.shape[1]

model2 = keras.models.Sequential([
    keras.layers.Dense(228, input_shape = (n_features,)),
    keras.layers.Dense(114, activation = "relu"),
    # keras.layers.Dense(114, activation = "relu"),
    keras.layers.Dense(1, activation = "sigmoid")
])
# Binary classification: sigmoid output with binary cross-entropy loss.
model2.compile(loss="binary_crossentropy",
              optimizer = "sgd",
              metrics=["accuracy"])

history = model2.fit(pre_X, y, epochs = 5, validation_data = (pre_X_val, y_val))

Train on 1507392 samples, validate on 5234 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Without trying to impute any more data; try to get a neural net to work as well as the SVM.
# Layers: input -> Dense(228) -> Dense(50, relu) -> Dense(25, relu) -> Dense(1, sigmoid).
# NOTE(review): the first Dense layer is given no activation -- confirm that is intended.

# Take the input width from the preprocessed matrix instead of hard-coding it
# (228 columns in the recorded run), so the cell survives feature-set changes.
n_features = pre_X.shape[1]

model3 = keras.models.Sequential([
    keras.layers.Dense(228, input_shape = (n_features,)),
    keras.layers.Dense(50, activation = "relu"),
    keras.layers.Dense(25, activation = "relu"),
    keras.layers.Dense(1, activation = "sigmoid")
])
# Binary classification: sigmoid output with binary cross-entropy loss.
model3.compile(loss="binary_crossentropy",
              optimizer = "sgd",
              metrics=["accuracy"])

history = model3.fit(pre_X, y, epochs = 5, validation_data = (pre_X_val, y_val))

Train on 1507392 samples, validate on 5234 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Final Model

In [45]:
# Final check on the held-out test set, using the preprocessing pipeline that
# was fitted on the training data.
pre_X_test = preprocess_pipe.transform(X_test)

# models[0] is the "sgd" entry of the most recently defined `models` list.
# NOTE(review): the validation accuracies recorded above put "lr" slightly
# ahead of "sgd" -- confirm that sgd is the intended final model.
model = models[0][1]

# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
score = accuracy_score(y_test, model.predict(pre_X_test))
score

0.681289347391617