In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
#models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,\
GradientBoostingClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
##
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
import seaborn as sns
import pickle
%matplotlib inline

In [2]:
# Function to read all the datasets
def read_all(template,start,end):
    frames = [ pd.read_json(f).fillna(0) for f in [template.format(i) for i in range(start,end)] ]
    X = pd.concat(frames, ignore_index = True,sort = True)
    return X

In [3]:
template = "datasets/dataset_finalized/dataset_{:02}.json"
df = read_all(template,0,18)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87141 entries, 0 to 87140
Columns: 178 entries, Ak47_ct to t_leads
dtypes: float64(11), int64(167)
memory usage: 118.3 MB


In [5]:
colwep = ['Ak47_ct', 'Ak47_t', 'Aug_ct', 'Aug_t', 'Awp_ct', 'Awp_t', 'C4_t', 'Cz75Auto_ct',\
          'Cz75Auto_t', 'Deagle_ct', 'Deagle_t', 'DecoyGrenade_ct', 'DecoyGrenade_t', 'Flashbang_ct',\
          'Flashbang_t', 'Glock_ct', 'Glock_t', 'HeGrenade_ct', 'HeGrenade_t', 'M4a4_ct', 'M4a4_t',\
          'MolotovIncendiaryGrenade_ct', 'MolotovIncendiaryGrenade_t', 'Mp9_ct', 'Mp9_t', 'P2000_ct',\
          'P2000_t', 'P250_ct', 'P250_t', 'Sg553_ct', 'Sg553_t', 'SmokeGrenade_ct', 'SmokeGrenade_t',\
          'UspS_ct', 'UspS_t','other_heavy_ct', 'other_heavy_t', 'other_pistols_ct', \
          'other_pistols_t', 'other_rifles_ct', 'other_rifles_t', 'other_smgs_ct', 'other_smgs_t']

# colwep = ['Ak47_ct', 'Ak47_t', 'Aug_ct', 'Aug_t', 'Awp_ct', 'Awp_t', 'C4_t',\
#           'Deagle_ct', 'Deagle_t', 'Flashbang_ct',\
#           'Flashbang_t', 'Glock_ct', 'Glock_t', 'HeGrenade_ct', 'HeGrenade_t', 'M4a4_ct', 'M4a4_t',\
#           'MolotovIncendiaryGrenade_ct', 'MolotovIncendiaryGrenade_t', 'P2000_ct',\
#           'P2000_t', 'P250_ct', 'P250_t', 'Sg553_ct', 'Sg553_t', 'SmokeGrenade_ct', 'SmokeGrenade_t',\
#           'UspS_ct', 'UspS_t']

colpla = ['alive_players_ct', 'alive_players_t', 'armor_ct1_Bin_Code', 'armor_ct2_Bin_Code',\
          'armor_ct3_Bin_Code', 'armor_ct4_Bin_Code', 'armor_ct5_Bin_Code', 'armor_ct_Bin_Code',\
          'armor_t1_Bin_Code', 'armor_t2_Bin_Code', 'armor_t3_Bin_Code', 'armor_t4_Bin_Code',\
          'armor_t5_Bin_Code', 'armor_t_Bin_Code','defuse_kit_ct1', 'defuse_kit_ct2',\
          'defuse_kit_ct3', 'defuse_kit_ct4', 'defuse_kit_ct5', 'has_helmet_ct1', 'has_helmet_ct2',\
          'has_helmet_ct3', 'has_helmet_ct4', 'has_helmet_ct5', 'has_helmet_t1', 'has_helmet_t2',\
          'has_helmet_t3', 'has_helmet_t4', 'has_helmet_t5', 'health_ct1_Bin_Code',\
          'health_ct2_Bin_Code', 'health_ct3_Bin_Code', 'health_ct4_Bin_Code', 'health_ct5_Bin_Code',\
          'health_t1_Bin_Code', 'health_t2_Bin_Code', 'health_t3_Bin_Code', 'health_t4_Bin_Code',\
          'health_t5_Bin_Code','money_ct1_Bin_Code', 'money_ct2_Bin_Code', 'money_ct3_Bin_Code',\
          'money_ct4_Bin_Code', 'money_ct5_Bin_Code', 'money_ct_Bin_Code', 'money_t1_Bin_Code',\
          'money_t2_Bin_Code', 'money_t3_Bin_Code', 'money_t4_Bin_Code', 'money_t5_Bin_Code',\
          'money_t_Bin_Code']

colsta = ['current_score_ct', 'current_score_t','t_leads','round_status_BombPlanted',\
          'round_status_FreezeTime', 'round_status_Normal', 'round_status_time_left']

colkill = ['kwct_Ak47', 'kwct_Aug', 'kwct_Awp', 'kwct_C4', 'kwct_Cz75Auto', 'kwct_Deagle',\
           'kwct_Flashbang', 'kwct_Glock', 'kwct_HeGrenade', 'kwct_Knife', 'kwct_M4a4',\
           'kwct_MolotovIncendiaryGrenade', 'kwct_Mp9', 'kwct_P2000', 'kwct_P250', 'kwct_Sg553',\
           'kwct_SmokeGrenade', 'kwct_UspS', 'kwct_other_heavy', 'kwct_other_pistols',\
           'kwct_other_rifles', 'kwct_other_smgs', 'kwct_other_utils', 'kwct_other_world', 'kwt_Ak47',\
           'kwt_Aug', 'kwt_Awp', 'kwt_C4', 'kwt_Cz75Auto', 'kwt_Deagle', 'kwt_Flashbang', 'kwt_Glock',\
           'kwt_HeGrenade', 'kwt_Knife', 'kwt_M4a4', 'kwt_MolotovIncendiaryGrenade', 'kwt_Mp9',\
           'kwt_P2000', 'kwt_P250', 'kwt_Sg553', 'kwt_SmokeGrenade', 'kwt_UspS', 'kwt_other_heavy',\
           'kwt_other_pistols', 'kwt_other_rifles', 'kwt_other_smgs', 'kwt_other_utils',\
           'kwt_other_world']

colmap = ['map_de_dust2', 'map_de_inferno', 'map_de_mirage', 'map_de_nuke', 'map_de_overpass',\
          'map_de_train', 'map_de_vertigo','map_de_cache']

colpos = ['pos_bs_ct1', 'pos_bs_ct2', 'pos_bs_ct3', 'pos_bs_ct4', 'pos_bs_ct5', 'pos_bs_t1',\
          'pos_bs_t2', 'pos_bs_t3', 'pos_bs_t4', 'pos_bs_t5']

# colpos = ['pos_bs_ct1', 'pos_bs_ct2', 'pos_bs_ct3', 'pos_bs_ct4', 'pos_bs_ct5', 'pos_bs_t1',\
#           'pos_bs_t2', 'pos_bs_t3', 'pos_bs_t4', 'pos_bs_t5', 'pr_ct1','pr_ct2', 'pr_ct3',\
#           'pr_ct4', 'pr_ct5', 'pr_t1', 'pr_t2', 'pr_t3', 'pr_t4', 'pr_t5']

cols = colpla+colmap+colwep

In [6]:
Y_all = df['round_winner_t']
X_all = df.drop(columns='round_winner_t',axis=1)[cols]
X_all = StandardScaler().fit_transform(X_all) 

In [7]:
X_all.shape

(87141, 102)

In [8]:
Y_all.shape

(87141,)

In [9]:
def train_model(model,X_t,Y_t,degree):
    pr = PolynomialFeatures(degree=degree,include_bias=True)
    X_p = pr.fit_transform(X_t)
    model.fit(X_p,Y_t)
    pred_train = model.predict(X_p)
    
    return pred_train

In [10]:
def test_model(model,X_t,degree):
    pr = PolynomialFeatures(degree=degree,include_bias=True)
    X_p = pr.fit_transform(X_t)
    
    return model.predict(X_p)

def get_mse(model,Y_train,pred_train,Y_test,pred_test):
    train_mse = mean_squared_error(Y_train, pred_train)
    test_mse = mean_squared_error(Y_test, pred_test)
    print("Training MSE = {}".format(train_mse))
    print("Test MSE = {}".format(test_mse))

def train_test(model,X_train,Y_train,X_test,Y_test,degree):
    pred_train = train_model(model,X_train,Y_train,degree)
    pred_test = test_model(model,X_test,degree)
    print(pred_train)
    get_mse(model,Y_train,pred_train,Y_test,pred_test)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all)

In [12]:
MLA = [
    LinearRegression(),
    LogisticRegression(solver='sag',verbose=1),
    GaussianNB(),
#     MLPClassifier(verbose=1),
    GradientBoostingClassifier(),
#     VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)]),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    DecisionTreeClassifier(),
#     SVC(probability=True),
    KNeighborsClassifier(n_neighbors = 3),
    RandomForestClassifier(n_estimators = 100,verbose=1)
]

In [None]:
#split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
#note: this is an alternative to train_test_split
cv_split = ShuffleSplit(n_splits = 10, test_size = .20, train_size = .75, \
                                                random_state = 0 )
                    # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', \
               'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
MLA_predict = Y_all.copy()

#index through MLA and save performance to table
row_index = 0
for alg in MLA:

    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    print("Running the classification on %s" %(MLA_name))
    
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = cross_validate(alg, X_all, Y_all, cv  = cv_split,return_train_score=True)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, 
    #should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   
    #let's know the worst that can happen!
    

    #save MLA predictions - see section 6 for usage
    alg.fit(X_all, Y_all)
    MLA_predict[MLA_name] = alg.predict(X_all)
    
    row_index+=1

    
#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

Running the classification on LinearRegression
Running the classification on LogisticRegression


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 60 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 60 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 58 epochs took 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 60 epochs took 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 57 epochs took 5 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 60 epochs took 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 45 epochs took 5 seconds
Running the classification on GaussianNB


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.7s finished


Running the classification on GradientBoostingClassifier
Running the classification on BaggingClassifier
Running the classification on ExtraTreesClassifier
Running the classification on DecisionTreeClassifier
Running the classification on KNeighborsClassifier


In [None]:
degree = 1
# model = RandomForestClassifier(n_estimators = 100,verbose=1)
# model = KNeighborsClassifier(n_neighbors = 3)
# model = MLPClassifier(verbose=1,max_iter=300,hidden_layer_sizes=(150,100,10,2,))
model = SVC(probability=True,verbose=1)
train_test(model,X_train,Y_train,X_test,Y_test,degree)

[LibSVM]

In [32]:
pd.set_option('display.max_rows', None)
X_coeff = pd.DataFrame(df.columns)
X_coeff.columns = ['Feature']
X_coeff["Correlation"] = pd.Series(model.coef_[0])

X_coeff.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
2,Aug_ct,0.919642
13,Flashbang_ct,0.246007
12,DecoyGrenade_t,0.199311
90,kwct_P250,0.193878
51,defuse_kit_ct1,0.191431
11,DecoyGrenade_ct,0.174686
61,has_helmet_t1,0.159686
36,alive_players_t,0.154433
9,Deagle_ct,0.153462
35,alive_players_ct,0.148785


In [33]:
## NOTE:
#1. proximity features not working

In [25]:
RandomForestClassifier?