In [18]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook, curdoc
from bokeh.themes import Theme
from bokeh.models import HoverTool
from bokeh.palettes import Spectral11
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.charts import Histogram, Donut
from bokeh.io import output_notebook
from bokeh.models.formatters import DatetimeTickFormatter as dttf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [33]:
df_clean = pd.read_json('messy_with_countries.json')
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19969 entries, 0 to 9999
Data columns (total 43 columns):
Age_at_win               14248 non-null float64
AverageWinsToDate        19969 non-null float64
CountryCode              19969 non-null object
CountryName              19969 non-null object
CumulativePrizeToDate    19969 non-null int64
CurrencyCode             19969 non-null object
CurrentHandle            19969 non-null object
DOB                      14248 non-null float64
EndDate                  19969 non-null int64
ExchangeRate             19969 non-null float64
Finals                   19969 non-null int64
GameId                   19969 non-null int64
NameFirst                19969 non-null object
NameLast                 19969 non-null object
Note                     105 non-null object
PlayerId                 19969 non-null int64
PriorWins                19969 non-null int64
Prize                    19969 non-null float64
PrizeToDate              19969 non-null int64
Pri

In [36]:
to_drop = [ 'rankplace_Cash Only', 'rankplace_Runners Up', 'rankplace_Winners',
            'CountryCode','CurrencyCode','CurrentHandle','ExchangeRate','GameId','NameFirst','NameLast','Note',
            'PlayerId','Prize','RankText','TeamPlayers','Team_On','TournamentName','team_history',
            'Team_On_Backup','DOB','EndDate','TotalUSDPrize', 'Prize_USD',
            'num_teams','date','totalsc2', 'CumulativePrizeToDate',
            'TournLength','Finals', 'WinsToDate','TTtoDate','sub-region', 'country']
test_here = ['SecondWins','AverageWinsToDate','PriorWins',]
df_dropped = df_clean.drop(to_drop, axis=1)

In [37]:
df_dropped.head()

Unnamed: 0,Age_at_win,AverageWinsToDate,CountryName,PriorWins,PrizeToDate,Rank,SecondWins,currency,region,teams
0,23.0,0.0,Ukraine,0,0,Winners,0,EUR,Europe,Unaffiliated
1,23.0,136.0,Ukraine,1,136,Winners,0,EUR,Europe,Unaffiliated
10,24.0,638.4,Ukraine,8,6384,Winners,1,EUR,Europe,Other
100,20.0,11777.4,Korea (Republic of),6,176661,Runners Up,3,USD,Asia,Old Generations
1000,17.0,816.25,Korea (Republic of),3,9795,Runners Up,1,KRW,Asia,Other


In [38]:
#prep for classification
df_dums = pd.get_dummies(data=df_dropped, columns = ['Rank','currency','teams','CountryName','region'])
df_dums['Age_at_win'] = df_dums['Age_at_win'].fillna(df_dums['Age_at_win'].mean())
df_dums['y']= df_dums['Rank_Runners Up'] + df_dums['Rank_Winners']
df_class = df_dums.drop(['Rank_Runners Up','Rank_Cash Only','Rank_Winners'], axis=1)
y_out = df_class.pop('y')
df_class.head()

Unnamed: 0,Age_at_win,AverageWinsToDate,PriorWins,PrizeToDate,SecondWins,currency_AUD,currency_CNY,currency_EUR,currency_KRW,currency_Other,...,CountryName_United Arab Emirates,CountryName_United Kingdom of Great Britain and Northern Ireland,CountryName_United States of America,CountryName_Uruguay,CountryName_Venezuela (Bolivarian Republic of),CountryName_Viet Nam,region_Americas,region_Asia,region_Europe,region_Other
0,23.0,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,23.0,136.0,1,136,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
10,24.0,638.4,8,6384,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
100,20.0,11777.4,6,176661,3,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000,17.0,816.25,3,9795,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
X, X_finaltest, y, y_final_test = train_test_split(df_class,y_out, random_state=314)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [41]:
def feature_importance(clf, X):
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    j = list(X.columns)
    # Print the feature ranking
    print("Feature ranking:")
    l = []
    for f in range(X.shape[1]):
        l.append( [j[indices[f]], importances[indices[f]]])
    return l

def standard_confusion_matrix(y_true, y_pred):
    """Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_true : ndarray - 1D
    y_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_true, y_pred)
    return np.array([[tp, fp], [fn, tn]])

def roc_curve(probabilities, labels):
    '''
    INPUT: numpy array, numpy array
    OUTPUT: list, list, list

    Take a numpy array of the predicted probabilities and a numpy array of the
    true labels.
    Return the True Positive Rates, False Positive Rates and Thresholds for the
    ROC curve.
    '''
    
    thresholds = np.sort(probabilities)

    tprs = []
    fprs = []

    num_positive_cases = sum(labels)
    num_negative_cases = len(labels) - num_positive_cases

    for threshold in thresholds:
        # With this threshold, give the prediction of each instance
        predicted_positive = probabilities >= threshold
        # Calculate the number of correctly predicted positive cases
        true_positives = np.sum(predicted_positive * labels)
        # Calculate the number of incorrectly predicted positive cases
        false_positives = np.sum(predicted_positive) - true_positives
        # Calculate the True Positive Rate
        tpr = true_positives / float(num_positive_cases)
        # Calculate the False Positive Rate
        fpr = false_positives / float(num_negative_cases)

        fprs.append(fpr)
        tprs.append(tpr)

    return tprs, fprs, thresholds.tolist()

In [42]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

probabilities = clf.predict_proba(X_test)[:, 1]
tprLR, fprLR, thresholds = roc_curve(probabilities, y_test)

#Cross-Validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Train Crossval Accuracy: " + str(scores))
print("Predicted Accuracy: "+ str(sum(scores)/5))
preds = clf.predict(X_test)
score = clf.score(X_test,y_test)
print('Test Accuracy: ' + str(score))
print(standard_confusion_matrix(y_test,preds))

Train Crossval Accuracy: [ 0.64546263  0.6571683   0.64737311  0.66028495  0.64692787]
Predicted Accuracy: 0.651443372639
Test Accuracy: 0.65811965812
[[1286  730]
 [ 550 1178]]


In [43]:
clf = GradientBoostingClassifier(learning_rate=.01, max_features=20,n_estimators=1000)
clf.fit(X_train, y_train)

#Cross-Validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Train Crossval Accuracy: " + str(scores))
print("Predicted Accuracy: "+ str(sum(scores)/5))

probabilities = clf.predict_proba(X_test)[:, 1]
tprGB, fprGB, thresholds = roc_curve(probabilities, y_test)

preds = clf.predict(X_test)
score = clf.score(X_test,y_test)
print('Test Accuracy: ' + str(score))
print(standard_confusion_matrix(y_test,preds))


feature_importance(clf, X)
    

Train Crossval Accuracy: [ 0.64813167  0.67141585  0.65182547  0.65449688  0.6349065 ]
Predicted Accuracy: 0.652155274858
Test Accuracy: 0.669604700855
[[1290  691]
 [ 546 1217]]
Feature ranking:


[['currency_EUR', 0.10752662709535504],
 ['PriorWins', 0.10490783113914073],
 ['currency_KRW', 0.10478655930586984],
 ['AverageWinsToDate', 0.083735160374083309],
 ['PrizeToDate', 0.064977986745511515],
 ['Age_at_win', 0.056882724068747198],
 ['SecondWins', 0.053262940657885449],
 ['CountryName_Germany', 0.051437513089960543],
 ['currency_Other', 0.025828885802796523],
 ['CountryName_Korea (Republic of)', 0.023255001859690805],
 ['currency_USD', 0.023019864678316539],
 ['teams_KT Rolster', 0.019774512700217264],
 ['CountryName_Taiwan, Province of China', 0.019237451137984744],
 ['currency_AUD', 0.017415749007983362],
 ['CountryName_Malaysia', 0.013384980892504677],
 ['teams_Jin Air Green Wings', 0.011902146900104072],
 ['teams_Other', 0.011522103386873527],
 ['CountryName_Spain', 0.011507416434372923],
 ['teams_Evil Geniuses', 0.010482555471409447],
 ['region_Europe', 0.00999291227651951],
 ['CountryName_Mexico', 0.0096569169264073707],
 ['teams_SK Telecom T1', 0.0094523250850759716],


In [44]:
clf = RandomForestClassifier(max_depth=10, n_jobs=750, max_features=30)
clf.fit(X_train, y_train)

#Cross-Validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Train Crossval Accuracy: " + str(scores))
print("Predicted Accuracy: "+ str(sum(scores)/5))

probabilities = clf.predict_proba(X_test)[:, 1]
tprRF, fprRF, thresholds = roc_curve(probabilities, y_test)

preds = clf.predict(X_test)
score = clf.score(X_test,y_test)
print('Test Accuracy: ' + str(score))
print(standard_confusion_matrix(y_test,preds))

feature_importance(clf, X)

Train Crossval Accuracy: [ 0.64635231  0.67097061  0.66874443  0.66740873  0.64024933]
Predicted Accuracy: 0.658745084183
Test Accuracy: 0.668002136752
[[1247  654]
 [ 589 1254]]
Feature ranking:


[['currency_KRW', 0.17005053839201217],
 ['currency_EUR', 0.10943285436689468],
 ['PriorWins', 0.10941413181004186],
 ['AverageWinsToDate', 0.10039455873444056],
 ['PrizeToDate', 0.10013778127438264],
 ['SecondWins', 0.059482503207678936],
 ['Age_at_win', 0.058458254915896291],
 ['currency_USD', 0.054504871836142221],
 ['CountryName_Germany', 0.024152968549473297],
 ['currency_Other', 0.014492870908262401],
 ['region_Europe', 0.013781988528079853],
 ['currency_CNY', 0.013518548271271502],
 ['CountryName_Taiwan, Province of China', 0.012578650948248083],
 ['CountryName_Korea (Republic of)', 0.012124341094602611],
 ['teams_Other', 0.011185902955109562],
 ['teams_KT Rolster', 0.01089028795417111],
 ['region_Asia', 0.0088937548716805821],
 ['teams_SK Telecom T1', 0.0075056807914438018],
 ['teams_Unaffiliated', 0.0071170452824445658],
 ['CountryName_France', 0.0042961288768213497],
 ['currency_AUD', 0.0042001181208246629],
 ['CountryName_Poland', 0.0040185499763094965],
 ['CountryName_Unite

In [45]:

theme = Theme(json={
    'attrs': {
        'Figure': {
            'background_fill_color': '#2F2F2F',
            'border_fill_color': '#2F2F2F',
            'outline_line_color': '#444444'
            },
        'Axis': {
            'axis_line_color': "white",
            'axis_label_text_color': "white",
            'major_label_text_color': "white",
            'major_tick_line_color': "white",
            'minor_tick_line_color': "white",
            'minor_tick_line_color': "white"
            },
        'Grid': {
            'grid_line_dash': [6, 4],
            'grid_line_alpha': .3
            },
        'Circle': {
            'fill_color': 'lightblue',
            'size': 10,
            },
        'Title': {
            'text_color': "white"
            }
        }
    })
curdoc().theme = theme
#output_notebook()
output_file=('test.html')


TOOLS = [HoverTool(tooltips=[('FPR:','@x'),('TPR','@y')])]
pal = Spectral11
p = figure(x_axis_label='False Positive Rate', y_axis_label='True Positive Rate', tools=TOOLS)
xvals = np.arange(0,1.01,.01)
yvals = np.arange(0,1.01,.01)

p.line(fprLR, tprLR, legend = 'Logistic Regression', line_width=3, line_color=pal[0])
p.line(fprGB, tprGB, legend = 'Gradient Boost', line_width=3, line_color=pal[1])
p.line(fprRF, tprRF, legend = 'Random Forest', line_width=3, line_color=pal[2])
p.line(xvals,yvals, legend = 'Random Chance', line_width=3, line_color=pal[3])
p.legend.location = 'bottom_right'
show(p)

In [46]:
print(list(X.columns))

['Age_at_win', 'AverageWinsToDate', 'PriorWins', 'PrizeToDate', 'SecondWins', 'currency_AUD', 'currency_CNY', 'currency_EUR', 'currency_KRW', 'currency_Other', 'currency_USD', 'teams_CJ Entus', 'teams_Dead Pixels', 'teams_ESC Gaming', 'teams_Evil Geniuses', 'teams_FXOpen e-Sports', 'teams_Incredible Miracle', 'teams_Invictus Gaming', 'teams_Jin Air Green Wings', 'teams_KT Rolster', 'teams_MVP', 'teams_Millenium', 'teams_Old Generations', 'teams_Other', 'teams_PSISTORM Gaming', 'teams_Prime', 'teams_ROOT Gaming', 'teams_SK Telecom T1', 'teams_SlayerS', 'teams_StarCraft II (Samsung Galaxy)', 'teams_StarTale', 'teams_Team Acer', 'teams_Team Empire', 'teams_Team Liquid', 'teams_Team SCV Life', 'teams_Unaffiliated', 'teams_mYinsanity', 'teams_mousesports', 'CountryName_Argentina', 'CountryName_Australia', 'CountryName_Austria', 'CountryName_Belarus', 'CountryName_Belgium', 'CountryName_Bolivia (Plurinational State of)', 'CountryName_Brazil', 'CountryName_Bulgaria', 'CountryName_Canada', 'Co

In [47]:
print(feature_importance(clf,X))

Feature ranking:
[['currency_KRW', 0.17005053839201217], ['currency_EUR', 0.10943285436689468], ['PriorWins', 0.10941413181004186], ['AverageWinsToDate', 0.10039455873444056], ['PrizeToDate', 0.10013778127438264], ['SecondWins', 0.059482503207678936], ['Age_at_win', 0.058458254915896291], ['currency_USD', 0.054504871836142221], ['CountryName_Germany', 0.024152968549473297], ['currency_Other', 0.014492870908262401], ['region_Europe', 0.013781988528079853], ['currency_CNY', 0.013518548271271502], ['CountryName_Taiwan, Province of China', 0.012578650948248083], ['CountryName_Korea (Republic of)', 0.012124341094602611], ['teams_Other', 0.011185902955109562], ['teams_KT Rolster', 0.01089028795417111], ['region_Asia', 0.0088937548716805821], ['teams_SK Telecom T1', 0.0075056807914438018], ['teams_Unaffiliated', 0.0071170452824445658], ['CountryName_France', 0.0042961288768213497], ['currency_AUD', 0.0042001181208246629], ['CountryName_Poland', 0.0040185499763094965], ['CountryName_United Sta