In [1]:
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
import joblib

In [2]:
# Read the cleaned per-game table with explicit dtypes, then one-hot encode the
# categorical columns (both character picks and the stage). Dummy columns are
# named "<source column>.<category>".
game_data = pd.get_dummies(
    pd.read_csv(
        "../Data/data/clean_game_data.csv",
        dtype=dict(
            p1_id="string",
            p2_id="string",
            p1_char="string",
            p2_char="string",
            stage="string",
            p1_games_played="int32",
            p1_games_won="int32",
            p2_games_played="int32",
            p2_games_won="int32",
            p1_won="bool",
        ),
    ),
    columns=["p1_char", "p2_char", "stage"],
    prefix_sep=".",
)
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a row index saved into the CSV - confirm. Cells that select every column
# except the ids and the response would feed it to the model as a feature.
game_data

Unnamed: 0,p1_id,p2_id,p1_won,p1_games_played,p1_games_won,p2_games_played,p2_games_won,p1_char.-1,p1_char.banjokazooie,p1_char.bayonetta,...,stage.Town and City,stage.Unova Pokemon League,stage.Unova Pokémon League,stage.Venom,stage.WarioWare,stage.Wily Castle,stage.Yggdrasil's Altar,stage.Yoshi's Island,stage.Yoshi's Island (Melee),stage.Yoshi's Story
0,1472816,1075251,False,2,0,23,13,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1075251,1472816,True,23,13,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,challonge__MrRiceman,challonge__Loconotcoco,False,4,1,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Leo,1272809,True,1,1,102,49,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1034645,1302612,True,77,40,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2963854,30896,4702,False,72,51,865,555,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2963855,4702,30896,True,865,555,72,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2963856,1263104,53481,True,186,112,8,7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2963857,53481,1263104,False,8,7,186,112,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Hold out 20% of the games for testing, stratifying on the response (p1_won).
# Author's note: p1/p2 order was randomized upstream, so stratifying should not
# change much, but it is kept as good practice.
game_train, game_test = train_test_split(
    game_data,
    train_size=0.8,
    random_state=2049,
    stratify=game_data[["p1_won"]],
)

# Response vector, and every other column of the training partition.
y = game_train["p1_won"]
X = game_train.drop(columns=["p1_won"])

# 5-fold stratified split over the training partition.
game_folded = StratifiedKFold(n_splits=5).split(X, y)

Let's start by using our numerical predictors to fit a linear model. We want to use a logistic regression because we are predicting a binary class.

In [4]:
def fitmodel(model, filename, predictors, response = "p1_won", df = game_train):
    """Fit `model` on `df`, or return the fitted copy cached at `filename`.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``get_params``
        Unfitted estimator describing the model that is wanted.
    filename : str
        Path of the joblib cache. If the file exists it is loaded and no
        fitting happens; otherwise the fitted model is written there.
    predictors : list of str
        Columns of `df` used as features.
    response : str
        Column of `df` used as the target.
    df : DataFrame
        Training data. The default is the `game_train` object that existed
        when this function was defined.

    Returns
    -------
    The freshly fitted `model`, or the cached estimator when `filename` exists.
    A cached estimator is returned even if it does not match `model`; in that
    case a warning is printed and the caller is expected to delete the file.
    """
    if not os.path.isfile(filename):
        # Create the cache directory before fitting, so a long fit is never
        # lost to a failing dump.
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        # Use the frame that was passed in; the global game_train used to be
        # read here, which silently ignored the `df` argument.
        model.fit(df[predictors], df[response])

        joblib.dump(model, filename)
        return model

    # joblib files are pickles: only load caches produced by this project.
    cached = joblib.load(filename)

    # A cached estimator fitted without column names has no feature_names_in_;
    # treat that as "no known features" so it is reported as a mismatch.
    cached_features = getattr(cached, "feature_names_in_", ())

    mismatch = (
        type(model) != type(cached)
        or model.get_params() != cached.get_params()
        or set(predictors) != set(cached_features)
    )
    if mismatch:
        print ("\033[93m Warning: model mismatch. Delete the file {filename} and rerun or risk faulty models.\n \033[0m".format(filename=filename))
    return cached

def predself(model, response = "p1_won", df = game_train):
    """Print the accuracy of `model` on `df`.

    The feature columns are taken from ``model.feature_names_in_`` and the
    true labels from ``df[response]``. Nothing is returned.
    """
    features = df[model.feature_names_in_]
    score = accuracy_score(model.predict(features), df[response])
    print("Accuracy: {acc}".format(acc = score))

In [20]:
# Baseline model: logistic regression on the four numeric counters only.
#
# Character and stage columns are left out on purpose. On their own they are not
# expected to predict well; they would need interaction terms, and there would
# be so many that the fit would be very slow without being a good metric - the
# EDA showed they vary non-linearly with the other variables. This model serves
# as a reference point for what the other models should achieve.
lm_predictors = ["p1_games_played","p2_games_played","p1_games_won","p2_games_won"]

# C controls the penalty strength of the logistic regression (scikit-learn
# defines it as the inverse of regularization strength: smaller C, stronger
# penalty).
lm_grid = {"C": [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]}

# Rebuilt in each search cell rather than shared between them.
game_folded = StratifiedKFold(n_splits=5).split(X, y)

lm_grid_result = GridSearchCV(
    linear_model.LogisticRegression(),
    lm_grid,
    scoring='accuracy',
    cv=game_folded,
    n_jobs=4,
    verbose=4,
    error_score=0,
).fit(game_train[lm_predictors], game_train["p1_won"])

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [23]:
# Headline: the winning C and its mean cross-validated accuracy.
print(
    "C = {bestC}, with an accuracy of {bestScore}.\n".format(
        bestScore = lm_grid_result.best_score_,
        bestC = lm_grid_result.best_params_['C'],
    )
)

# Per-candidate table. Author's note: larger values of C seemed to raise the
# accuracy marginally, but 1000 was judged good enough.
res = pd.DataFrame(lm_grid_result.cv_results_)
print(res.loc[:, ['param_C', 'mean_test_score']])

C = 1000, with an accuracy of 0.6432539169626114.

  param_C  mean_test_score
0    1000         0.643254
1     100         0.643254
2      10         0.643254
3     1.0         0.643254
4     0.1         0.643254
5    0.01         0.643254
6   0.001         0.643253


In [37]:
# Final logistic regression at the grid-search winner (C = 1000), fitted or
# loaded from its joblib cache by fitmodel.
lm = fitmodel(
    linear_model.LogisticRegression(C = 1000),
    "models/logistic_regression.joblib",
    lm_predictors,
)

In [27]:
# Elastic-net logistic regression on the same four numeric counters.
# Two hyperparameters are tuned:
#   * C        - penalty strength, as for the plain logistic model;
#   * l1_ratio - the lasso share of the penalty: 1.0 is a pure lasso, 0.0 a pure
#                ridge, and anything in between is some sort of elastic net.
# solver and penalty are single-valued entries that pin the estimator to
# saga + elasticnet.
en_predictors = ["p1_games_played","p2_games_played","p1_games_won","p2_games_won"]

en_grid = {
    "solver": ['saga'],
    "penalty": ['elasticnet'],
    "l1_ratio": [0.0, 0.10, 0.25, 0.50, 0.75, 0.90, 1.0],
    "C": [1000, 100, 10, 1.0, 0.1, 0.01, 0.001],
}

# Rebuilt in each search cell rather than shared between them.
game_folded = StratifiedKFold(n_splits=5).split(X, y)

en_grid_result = GridSearchCV(
    linear_model.LogisticRegression(),
    en_grid,
    scoring='accuracy',
    cv=game_folded,
    n_jobs=4,
    verbose=4,
    error_score=0,
).fit(game_train[en_predictors], game_train["p1_won"])

Fitting 5 folds for each of 49 candidates, totalling 245 fits


In [28]:
# Headline: the winning (C, l1_ratio) pair and its mean cross-validated accuracy.
print(
    "C = {bestC}, l1_ratio = {bestratio}, with an accuracy of {bestScore}.\n".format(
        bestScore = en_grid_result.best_score_,
        bestratio = en_grid_result.best_params_['l1_ratio'],
        bestC = en_grid_result.best_params_['C'],
    )
)

# Mean test accuracy for every candidate in the grid.
res = pd.DataFrame(en_grid_result.cv_results_)
print(res.loc[:, ['param_l1_ratio', 'param_C', 'mean_test_score']])

C = 0.1, l1_ratio = 0.9, with an accuracy of 0.6431248622406927.

   param_l1_ratio param_C  mean_test_score
0             0.0    1000         0.643092
1             0.1    1000         0.643102
2            0.25    1000         0.643107
3             0.5    1000         0.643084
4            0.75    1000         0.643094
5             0.9    1000         0.643096
6             1.0    1000         0.643106
7             0.0     100         0.643100
8             0.1     100         0.643091
9            0.25     100         0.643104
10            0.5     100         0.643077
11           0.75     100         0.643104
12            0.9     100         0.643085
13            1.0     100         0.643101
14            0.0      10         0.643077
15            0.1      10         0.643092
16           0.25      10         0.643094
17            0.5      10         0.643100
18           0.75      10         0.643100
19            0.9      10         0.643081
20            1.0      10      

In [36]:
# Final elastic-net model at the grid-search winner (C = 0.1, l1_ratio = 0.9).
# It is fitted on en_predictors, the same list the elastic-net grid search used;
# lm_predictors was passed here before, which only worked because the two lists
# currently hold the same columns.
en = fitmodel(model = linear_model.LogisticRegression(C = 0.1, l1_ratio = 0.9, penalty = "elasticnet", solver = "saga"),
              filename = "models/elastic_net.joblib",
              predictors = en_predictors)

One thing to keep in mind is that games played and games won are relative to the size of the dataset, so it might be a good idea to use a proportion between players instead in case someone wants to use the final model on a dataset which is not of the same size. It's unclear if this will matter though.

The goal of this project is more interpretation/inference than prediction, so we can hold this thought for now.

In [8]:
# ccp_alpha is a hyperparameter of the decision tree which controls its
# cost-complexity pruning. The other hyperparameters of the tree are typically
# picked automatically.

# Every column except the response and the two player ids, in the frame's own
# column order. A set was iterated here before, which made the feature order
# depend on string hashing and therefore differ between kernel sessions.
dtc_predictors = [
    column for column in game_train.columns
    if column not in {"p1_won", "p1_id", "p2_id"}
]

dtc_grid = dict(ccp_alpha = [0.01, 0.02, 0.05, 0.10, 0.25])

game_folded = StratifiedKFold(n_splits=5).split(X,y)

dtc_grid_result = GridSearchCV(estimator = tree.DecisionTreeClassifier(),
                               param_grid = dtc_grid,
                               n_jobs = 4,
                               cv = game_folded,
                               scoring = 'accuracy',
                               error_score = 0,
                               verbose = 4
                               ).fit(game_train[dtc_predictors],game_train["p1_won"])

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [None]:
# Headline: the winning ccp_alpha and its mean cross-validated accuracy.
# (The label used to read "C = ", copied from the logistic-regression cell.)
print("ccp_alpha = {bestccp}, with an accuracy of {bestScore}.\n".format(bestccp = dtc_grid_result.best_params_['ccp_alpha'],
                                                                       bestScore = dtc_grid_result.best_score_))

# Mean test accuracy for every ccp_alpha candidate.
res = pd.DataFrame(dtc_grid_result.cv_results_)
print(res[['param_ccp_alpha','mean_test_score']])

In [97]:
# Final decision tree, fitted or loaded from its joblib cache by fitmodel.
# Predictors are every column except the response and the two player ids, kept
# in the frame's own column order; iterating a set here gave a hash-dependent
# order that changed between kernel sessions.
# NOTE(review): the tree is built with default hyperparameters, so the ccp_alpha
# grid search above is not reflected here - confirm whether that is intended.
dtc = fitmodel(model = tree.DecisionTreeClassifier(),
               filename = "models/decision_tree.joblib",
               predictors = [column for column in game_train.columns
                             if column not in {"p1_won", "p1_id", "p2_id"}])

predself(dtc)

 [0m


KeyError: '[\'p1_char_link\', \'stage_Venom\', \'p2_char_pyra\', \'p2_char_roy\', \'p1_char_rosalina\', \'p1_char_ridley\', \'p2_char_ganondorf\', \'p1_char_-1\', \'p1_char_pit\', \'p2_char_bowserjr\', \'p1_char_pikachu\', \'stage_Distant Planet\', \'stage_Town and City\', \'p2_char_mrgameandwatch\', \'p1_char_miiswordfighter\', \'stage_Brinstar Depths\', \'stage_Hyrule Castle\', \'p2_char_fox\', \'p2_char_olimar\', \'p1_char_pokemontrainer\', "stage_Luigi\'s Mansion", \'p1_char_diddykong\', \'p1_char_greninja\', \'p2_char_jigglypuff\', \'p2_char_darkpit\', \'p2_char_incineroar\', \'p2_char_piranhaplant\', \'p2_char_richter\', \'stage_WarioWare\', \'p2_char_duckhunt\', \'stage_Town & City\', \'p1_char_bowser\', \'p1_char_robin\', \'p1_char_darksamus\', \'p1_char_captainfalcon\', \'p1_char_bowserjr\', \'p1_char_inkling\', \'p1_char_steve\', \'p1_char_sephiroth\', \'stage_Wily Castle\', \'p1_char_mrgameandwatch\', \'p2_char_littlemac\', \'p1_char_cloud\', \'p1_char_isabelle\', \'p2_char_byleth\', \'p2_char_drmario\', \'p1_char_younglink\', \'stage_Boxing Ring\', \'p2_char_sheik\', \'stage_Spirit Train\', \'p2_char_ken\', \'p2_char_younglink\', \'stage_Mushroom Kingdom\', \'p1_char_simon\', \'p1_char_zelda\', \'stage_Skyloft\', \'p1_char_metaknight\', \'p1_char_darkpit\', \'p2_char_luigi\', \'p2_char_sora\', \'p1_char_littlemac\', \'p2_char_inkling\', \'p2_char_marth\', \'p2_char_cloud\', \'p2_char_minmin\', \'p1_char_ganondorf\', \'stage_Pilotwings\', \'p1_char_luigi\', \'p2_char_falco\', \'p2_char_snake\', \'p2_char_diddykong\', \'p1_char_rob\', \'stage_Mario Galaxy\', \'stage_Pokemon Stadium 2\', \'p2_char_wolf\', \'stage_Kalos Pokemon League\', \'p1_char_falco\', \'p2_char_steve\', \'p2_char_random\', \'p2_char_terry\', \'p1_char_lucas\', \'stage_Hollow Bastion\', \'stage_Small Battlefield\', \'stage_Norfair\', \'p1_char_drmario\', \'p2_char_greninja\', \'p1_char_sora\', \'p2_char_lucina\', \'p2_char_simon\', \'p2_char_chrom\', \'p2_char_kingdedede\', 
\'p1_char_mewtwo\', \'p2_char_pokemontrainer\', \'p1_char_zerosuitsamus\', \'p2_char_joker\', \'p2_char_zelda\', \'stage_Pokémon Stadium\', \'p1_char_fox\', \'p1_char_lucario\', \'stage_Mario Circuit\', "stage_Yoshi\'s Island", \'stage_Pokémon Stadium 2\', \'p1_char_samus\', \'stage_Magicant\', \'p2_char_-1\', \'p2_char_palutena\', \'p1_char_olimar\', \'p1_char_hero\', \'p2_char_kirby\', \'p2_char_robin\', \'stage_Find Mii\', \'p2_char_ridley\', \'p1_char_kingdedede\', \'stage_Frigate Orpheon\', \'p1_char_minmin\', \'p1_char_toonlink\', \'p2_char_toonlink\', \'stage_Castle Siege\', \'p1_char_iceclimbers\', \'p2_char_rosalina\', \'p2_char_pit\', \'p1_char_marth\', \'stage_Final Destination\', \'stage_New Donk City Hall\', \'stage_Super Mario Maker\', \'p1_char_random\', \'stage_Unova Pokémon League\', \'p1_char_kingkrool\', \'p2_char_kazuya\', \'p1_char_miibrawler\', \'stage_Lylat Cruise\', \'p2_char_miiswordfighter\', \'stage_Northern Cave\', \'p2_char_link\', \'p2_char_pikachu\', \'p2_char_sephiroth\', \'p2_char_miigunner\', \'p2_char_lucario\', \'p1_char_peach\', \'p2_char_rob\', \'p1_char_roy\', \'p1_char_pacman\', \'p1_char_sheik\', \'p2_char_zerosuitsamus\', \'p1_char_jigglypuff\', \'p1_char_megaman\', \'p2_char_donkeykong\', \'p1_char_ike\', \'p2_char_mario\', \'p1_char_piranhaplant\', \'stage_Unova Pokemon League\', \'p1_char_ken\', \'p1_char_richter\', \'p2_char_banjokazooie\', \'p2_char_pichu\', \'p1_char_banjokazooie\', \'p1_char_incineroar\', \'p1_char_byleth\', \'p1_char_corrin\', \'p2_char_captainfalcon\', \'p2_char_hero\', \'p1_char_yoshi\', \'stage_Fountain of Dreams\', \'stage_Mementos\', \'p1_char_wario\', \'p2_char_mewtwo\', \'stage_Smashville\', \'stage_Saffron City\', \'p1_char_villager\', \'p2_char_megaman\', \'stage_-1\', \'stage_Balloon Fight\', \'p1_char_kirby\', \'p1_char_ryu\', \'p2_char_kingkrool\', \'p2_char_shulk\', \'p2_char_corrin\', \'p1_char_chrom\', \'p2_char_miibrawler\', \'p1_char_pyra\', \'p2_char_ryu\', \'p2_char_ike\', 
\'p1_char_snake\', \'p1_char_shulk\', \'p1_char_miigunner\', \'p2_char_metaknight\', \'stage_Kalos Pokémon League\', \'stage_Dream Land\', \'p2_char_sonic\', \'stage_Battlefield\', \'p1_char_mario\', \'p1_char_terry\', \'p1_char_daisy\', "stage_Yoshi\'s Island (Melee)", \'p2_char_villager\', \'p2_char_samus\', \'p1_char_wiifittrainer\', \'stage_Mute City SNES\', \'stage_Green Greens\', \'stage_Reset Bomb Forest\', \'p1_char_ness\', \'p1_char_pichu\', \'p1_char_sonic\', \'p1_char_wolf\', \'p2_char_darksamus\', \'stage_Moray Towers\', \'p1_char_kazuya\', \'stage_Duck Hunt\', \'p2_char_isabelle\', \'p2_char_wario\', \'stage_Eldin Bridge\', \'p2_char_iceclimbers\', \'p1_char_joker\', \'p2_char_ness\', \'p1_char_lucina\', \'p1_char_palutena\', \'p2_char_lucas\', \'p1_char_duckhunt\', \'p2_char_peach\', \'p1_char_donkeykong\', \'p2_char_wiifittrainer\', "stage_Yggdrasil\'s Altar", \'p2_char_pacman\', \'p2_char_daisy\', \'stage_Corneria\', \'p1_char_bayonetta\', \'p2_char_yoshi\', \'p2_char_bowser\', "stage_Yoshi\'s Story", \'p2_char_bayonetta\'] not in index'

In [20]:
# Accuracy of the decision tree on the held-out test partition.
actual = game_test["p1_won"]
prediction = dtc.predict(game_test[dtc.feature_names_in_])

print("Accuracy: {acc}".format(acc = accuracy_score(prediction, actual)))
# TODO: cross-validation needs to be done before this step.

Accuracy: 0.6192667669862949


In [21]:
rfcfile = "models/random_forest.joblib"

# Every column except the response and the two player ids, in the frame's own
# column order (iterating a set gave a hash-dependent order that changed
# between kernel sessions).
predictors = [column for column in game_train.columns
              if column not in {"p1_won", "p1_id", "p2_id"}]
response = "p1_won"

if not os.path.isfile(rfcfile):
    # Create the cache directory before fitting, so the multi-minute fit is
    # never lost to a failing dump.
    os.makedirs(os.path.dirname(rfcfile) or ".", exist_ok=True)

    rfc = ensemble.RandomForestClassifier(verbose=3,n_jobs=4)

    x = game_train[predictors]
    y = game_train[response]

    rfc.fit(x,y)

    joblib.dump(rfc, rfcfile)

    # the above creates a very large (10gb) file, comment it out if you don't want it.
else:
    # joblib files are pickles: only load caches produced by this project.
    rfc = joblib.load(rfcfile)

# In-sample accuracy: the model is scored on the data it was trained on.
prediction = rfc.predict(game_train[rfc.feature_names_in_])
actual = game_train[response]

print("Accuracy: {acc}".format(acc = accuracy_score(prediction,actual)))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.1min


building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  8.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   14.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   55.2s finished


Accuracy: 0.968066545006573


In [22]:
# Accuracy of the random forest on the held-out test partition.
actual = game_test[response]
prediction = rfc.predict(game_test[rfc.feature_names_in_])

print("Accuracy: {acc}".format(acc = accuracy_score(prediction, actual)))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.0s


Accuracy: 0.6676394971422401


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.2s finished


In [23]:
gbcfile = "models/boosted_tree.joblib"

# Every column except the response and the two player ids, in the frame's own
# column order (iterating a set gave a hash-dependent order that changed
# between kernel sessions).
predictors = [column for column in game_train.columns
              if column not in {"p1_won", "p1_id", "p2_id"}]
response = "p1_won"

if not os.path.isfile(gbcfile):
    # Create the cache directory before fitting, so the hours-long fit is
    # never lost to a failing dump.
    os.makedirs(os.path.dirname(gbcfile) or ".", exist_ok=True)

    gbc = ensemble.GradientBoostingClassifier(verbose=3,n_estimators=1000)

    x = game_train[predictors]
    y = game_train[response]

    gbc.fit(x,y)

    joblib.dump(gbc,gbcfile)
else:
    # joblib files are pickles: only load caches produced by this project.
    gbc = joblib.load(gbcfile)

# In-sample accuracy: the model is scored on the data it was trained on.
prediction = gbc.predict(game_train[gbc.feature_names_in_])
actual = game_train[response]

print("Accuracy: {acc}".format(acc = accuracy_score(prediction,actual)))

      Iter       Train Loss   Remaining Time 
         1           1.3699          173.08m
         2           1.3559          167.11m
         3           1.3441          163.77m
         4           1.3339          162.37m
         5           1.3253          161.80m
         6           1.3177          160.94m
         7           1.3114          160.17m
         8           1.3059          159.88m
         9           1.3011          159.59m
        10           1.2968          159.46m
        11           1.2930          159.16m
        12           1.2897          159.60m
        13           1.2869          159.73m
        14           1.2842          159.98m
        15           1.2820          159.91m
        16           1.2798          159.95m
        17           1.2780          159.86m
        18           1.2761          159.32m
        19           1.2746          158.91m
        20           1.2732          158.32m
        21           1.2719          158.10m
        2

In [None]:
# Accuracy of the gradient-boosted trees on the held-out test partition.
actual = game_test[response]
prediction = gbc.predict(game_test[gbc.feature_names_in_])

print("Accuracy: {acc}".format(acc = accuracy_score(prediction, actual)))
# TODO: cross-validation should come first.

Accuracy: 0.6854507297915556
