In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels as ols

import warnings
warnings.filterwarnings('ignore')

import time

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from statsmodels.formula.api import logit

In [2]:
# READ IN DATA
killsData = pd.read_csv('kills.csv')
roundViewData = pd.read_csv('round_view.csv')

In [3]:
%%time
matchNames = (roundViewData['file']).unique()
updatedKillsData = pd.DataFrame()

for i in range( 40 ):
    numberOfRounds = roundViewData[roundViewData['file'] == matchNames[i]].shape[0]
    roundSplit = killsData[killsData['file'] == matchNames[i]].get('seconds').diff()
    roundSplitArr = roundSplit[roundSplit <= 0].index
    startOfDifGame = killsData[killsData['file'] == matchNames[i+1]].get('seconds').diff().index[0]
    
#     print(startOfDifGame)
    
    #Iterate through round ranges(for killsData) to apply new columns     
    for j in range(len(roundSplitArr)):
        currentRoundInfo = roundViewData.iloc[j]
        
#         print(roundSplitArr[j])
#         print("VALUE OF J:", j)

        #First and Second Round Condition     
        if j == 0 and roundSplitArr[j] != 0:
            #For the first round        
            updatedKillsData = updatedKillsData.append(killsData.iloc[0:roundSplitArr[j]].assign(**{
                'Round_Type': currentRoundInfo['round_type'],
                'Winner_Side' : currentRoundInfo['winner_side'],
                'Ct_Eq_Val': currentRoundInfo['ct_eq_val'],
                'T_Eq_Val': currentRoundInfo['t_eq_val'],
                'map': currentRoundInfo['map']
            }))
            
            #Update for next Round           
            currentRoundInfo = roundViewData.iloc[j+1]
            updatedKillsData = updatedKillsData.append(killsData.iloc[roundSplitArr[j]:roundSplitArr[j+1]].assign(**{
                'Round_Type': currentRoundInfo['round_type'],
                'Winner_Side' : currentRoundInfo['winner_side'],
                'Ct_Eq_Val': currentRoundInfo['ct_eq_val'],
                'T_Eq_Val': currentRoundInfo['t_eq_val'],
                'map': currentRoundInfo['map']
            }))
      
#             print(updatedKillsData)
        
        #Last Round Condition
        elif j == (len(roundSplitArr) - 1):
            updatedKillsData = updatedKillsData.append(killsData.iloc[roundSplitArr[j]:startOfDifGame].assign(**{
                'Round_Type': currentRoundInfo['round_type'],
                'Winner_Side' : currentRoundInfo['winner_side'],
                'Ct_Eq_Val': currentRoundInfo['ct_eq_val'],
                'T_Eq_Val': currentRoundInfo['t_eq_val'],
                'map': currentRoundInfo['map']
            }))
#             print(updatedKillsData)

        #All Other Rounds      
        else:
            updatedKillsData = updatedKillsData.append(killsData.iloc[roundSplitArr[j]:roundSplitArr[j+1]].assign(**{
                'Round_Type': currentRoundInfo['round_type'],
                'Winner_Side' : currentRoundInfo['winner_side'],
                'Ct_Eq_Val': currentRoundInfo['ct_eq_val'],
                'T_Eq_Val': currentRoundInfo['t_eq_val'],
                'map': currentRoundInfo['map']
            }))
#             print(updatedKillsData)
        
#         print('=======================')


CPU times: user 25 s, sys: 195 ms, total: 25.2 s
Wall time: 25.2 s


# DATA CLEANING

In [4]:
#updatedKillsData['Winner_Side'].value_counts()
updatedKillsData.head()

Unnamed: 0.1,Unnamed: 0,file,seconds,ct_alive,t_alive,is_bomb_planted,wp_type,is_ct,Round_Type,Winner_Side,Ct_Eq_Val,T_Eq_Val,map
0,0,esea_match_13770997.dem,30.74165,5,4,0,Pistol,1,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
1,1,esea_match_13770997.dem,31.93185,4,4,0,Pistol,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
2,2,esea_match_13770997.dem,34.28094,3,4,0,Pistol,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
3,3,esea_match_13770997.dem,38.93212,3,3,0,Pistol,1,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
4,4,esea_match_13770997.dem,40.76441,2,3,0,Pistol,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass


In [5]:
ct_win = updatedKillsData.loc[updatedKillsData['Winner_Side'] == 'CounterTerrorist']

tr_win = updatedKillsData.loc[updatedKillsData['Winner_Side'] == 'Terrorist']
random_tr_sample = tr_win.sample(ct_win.shape[0])

In [6]:
equal_dataset = pd.concat([ct_win, random_tr_sample], axis = 0)

In [7]:
equal_dataset

Unnamed: 0.1,Unnamed: 0,file,seconds,ct_alive,t_alive,is_bomb_planted,wp_type,is_ct,Round_Type,Winner_Side,Ct_Eq_Val,T_Eq_Val,map
38,38,esea_match_13770997.dem,40.38855,4,5,0,Rifle,0,ECO,CounterTerrorist,5400,20550,de_overpass
39,39,esea_match_13770997.dem,43.39539,3,5,0,Rifle,0,ECO,CounterTerrorist,5400,20550,de_overpass
40,40,esea_match_13770997.dem,48.93921,3,4,0,Rifle,1,ECO,CounterTerrorist,5400,20550,de_overpass
41,41,esea_match_13770997.dem,61.76520,3,3,0,Rifle,1,ECO,CounterTerrorist,5400,20550,de_overpass
42,42,esea_match_13770997.dem,62.26636,3,2,0,Rifle,1,ECO,CounterTerrorist,5400,20550,de_overpass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,963,esea_match_13779771.dem,83.30298,4,4,0,Rifle,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
1221,1221,esea_match_13779775.dem,36.20923,4,5,0,Rifle,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
1363,1363,esea_match_13779776.dem,39.17444,4,5,0,Rifle,0,PISTOL_ROUND,Terrorist,4300,4250,de_overpass
483,483,esea_match_13779769.dem,86.11435,4,1,0,Rifle,1,PISTOL_ROUND,Terrorist,4300,4250,de_overpass


# FEATURE ENGINEERING

This is just some feature engineering so that the data is in a suitable method for the logistic regression (don't modify)

In [8]:
# WEAPONS - ONE HOT ENCODED

ohe_weapons = OneHotEncoder()
weapon_types = ohe_weapons.fit_transform(equal_dataset[['wp_type']])

weapons = pd.DataFrame(weapon_types.toarray()).reset_index(drop = True)
weapons.columns = ohe_weapons.get_feature_names()

In [9]:
# ROUND TYPE - ONE HOT ENCODED

ohe_round_type = OneHotEncoder()
round_types = ohe_round_type.fit_transform(equal_dataset[['Round_Type']])

round_type = pd.DataFrame(round_types.toarray()).reset_index(drop = True)
round_type.columns = ohe_round_type.get_feature_names()

In [10]:
equal_dataset['y'] = equal_dataset['Winner_Side'].apply(lambda x: 1 if x == 'CounterTerrorist' else 0)
equal_dataset['Is_Overpass'] = equal_dataset['map'].apply(lambda x: 1 if x == 'de_overpass' else 0)
equal_dataset = equal_dataset.drop(columns = ['Winner_Side', 'map', 'Round_Type', 'wp_type', 'seconds', 'file', 'Unnamed: 0']).reset_index(drop = True)

In [11]:
finalized_data = pd.concat([equal_dataset, weapons, round_type], axis = 1)
input_labels = np.array(finalized_data.columns)
input_labels = np.delete(input_labels, 6)
input_labels

array(['ct_alive', 't_alive', 'is_bomb_planted', 'is_ct', 'Ct_Eq_Val',
       'T_Eq_Val', 'Is_Overpass', 'x0_Equipment', 'x0_Grenade',
       'x0_Heavy', 'x0_Pistol', 'x0_Rifle', 'x0_SMG', 'x0_Sniper',
       'x0_ECO', 'x0_FORCE_BUY', 'x0_NORMAL', 'x0_PISTOL_ROUND',
       'x0_SEMI_ECO'], dtype=object)

In [28]:
# randomly shuffle the data
shuffled_data = finalized_data.sample(frac = 1, random_state = 8888).reset_index(drop = True)

# split the data into training and testing data 
# 70% of the data will be devoted to training the model
# 30 % of the data will be devoted to testing the model 

training_data = shuffled_data.loc[:7332]
testing_data = shuffled_data.loc[7333:]

In [29]:
# Let's take a look at the data we're working with at hand
training_data.head()

Unnamed: 0,ct_alive,t_alive,is_bomb_planted,is_ct,Ct_Eq_Val,T_Eq_Val,y,Is_Overpass,x0_Equipment,x0_Grenade,x0_Heavy,x0_Pistol,x0_Rifle,x0_SMG,x0_Sniper,x0_ECO,x0_FORCE_BUY,x0_NORMAL,x0_PISTOL_ROUND,x0_SEMI_ECO
0,1,1,0,0,4300,4250,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5,4,0,1,4550,3850,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,5,3,0,1,4850,27600,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,4,0,1,31400,4450,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,5,4,0,1,4300,4250,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# MODEL 1: USES ALL PREDICTORS

In [30]:
# This forms the logistic regression equation used to train our data 
eq = 'y ~ 1 + ' + ' + '.join(input_labels)
eq

'y ~ 1 + ct_alive + t_alive + is_bomb_planted + is_ct + Ct_Eq_Val + T_Eq_Val + Is_Overpass + x0_Equipment + x0_Grenade + x0_Heavy + x0_Pistol + x0_Rifle + x0_SMG + x0_Sniper + x0_ECO + x0_FORCE_BUY + x0_NORMAL + x0_PISTOL_ROUND + x0_SEMI_ECO'

In [31]:
# fit a logistic regression model using all predictors
model = logit(eq, data = training_data).fit()
# obtain a summary of the model's data
model.summary()

Optimization terminated successfully.
         Current function value: 0.049837
         Iterations 10


0,1,2,3
Dep. Variable:,y,No. Observations:,7333.0
Model:,Logit,Df Residuals:,7313.0
Method:,MLE,Df Model:,19.0
Date:,"Thu, 03 Jun 2021",Pseudo R-squ.:,0.9281
Time:,19:05:26,Log-Likelihood:,-365.45
converged:,True,LL-Null:,-5082.8
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1503,2.14e+06,-5.38e-07,1.000,-4.19e+06,4.19e+06
ct_alive,0.0078,0.108,0.072,0.943,-0.203,0.219
t_alive,-0.0063,0.095,-0.066,0.947,-0.193,0.180
is_bomb_planted,0.7573,0.377,2.007,0.045,0.018,1.497
is_ct,0.1805,0.291,0.620,0.535,-0.390,0.751
Ct_Eq_Val,0.0004,5.88e-05,6.056,0.000,0.000,0.000
T_Eq_Val,0.0002,5.62e-05,4.210,0.000,0.000,0.000
Is_Overpass,-8.4596,0.379,-22.299,0.000,-9.203,-7.716
x0_Equipment,1.9525,,,,,


In [32]:
# obtain the predicted values on the TRAINING data
logistic_output = pd.Series(model.predict())

# write a threshold so to categorized each logistic output to a prediction 
first_model_training_predictions = pd.DataFrame({'Logistic Output': logistic_output})
first_model_training_predictions['Prediction'] = (first_model_training_predictions['Logistic Output'].
                                        apply(lambda x: 1 if x >= 0.5 else 0))
first_model_training_predictions['Actual Values'] = training_data['y']

# display the training data
first_model_training_predictions.head()

Unnamed: 0,Logistic Output,Prediction,Actual Values
0,0.003279,0,0
1,0.958238,1,1
2,0.983973,1,1
3,0.999797,1,1
4,0.004865,0,0


In [33]:
# CALCULATING THE ERROR ON THE TRAINING DATA
# the proportion of correctly classified winners

training_error = (first_model_training_predictions['Prediction'] == first_model_training_predictions['Actual Values']).mean()
training_error

0.9938633574253375

In [18]:
# plot the logistic output
'''
plt.scatter(np.arange(0, first_model_training_predictions.shape[0]), 
            first_model_training_predictions['Logistic Output'], s = 9) # need to add actual values to this graph 
'''

"\nplt.scatter(np.arange(0, first_model_training_predictions.shape[0]), \n            first_model_training_predictions['Logistic Output'], s = 9) # need to add actual values to this graph \n"

In [19]:
# obtain the predicted values on the TESTING data
logistic_output = pd.Series(model.predict(testing_data))

# write a threshold so to categorized each logistic output to a prediction 
first_model_testing_predictions = pd.DataFrame({'Logistic Output': logistic_output})
first_model_testing_predictions['Prediction'] = (first_model_testing_predictions['Logistic Output'].
                                        apply(lambda x: 1 if x >= 0.5 else 0))
first_model_testing_predictions['Actual Values'] = testing_data['y']

# display the training data
first_model_training_predictions.head()

Unnamed: 0,Logistic Output,Prediction,Actual Values
0,0.003279,0,0
1,0.958238,1,1
2,0.983973,1,1
3,0.999797,1,1
4,0.004865,0,0


In [34]:
# CALCULATING THE ERROR ON THE TESTING DATA
# the proportion of correctly classified winners

testing_error = (first_model_testing_predictions['Prediction'] == first_model_testing_predictions['Actual Values']).mean()
testing_error

0.9891823098950048

# MODEL 2: USES ONLY A SUBSET OF PREDICTORS

- use the training and testing data I made 
- iterate through each number of predictors (i.e, a model with 1 predictor, a model with 2 predictors, ... up to 20)
- do kfold cross validation on the training data with each model, and get the average cross validated training error
- choose the model with the smallest kfold error 
- using that chosen model, test it on the testing data