In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler 

%matplotlib inline 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import accuracy_score, classification_report, recall_score, precision_score, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import AdaBoostClassifier 

#from xgboost import XGBClassifier 

import warnings 
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('/Users/mac/Desktop/untitled folder/train.csv')
data.shape

(8001, 27)

In [4]:
data.head(6)

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender,ID
0,4,1,B,35.515042,-0.021725,3.474766,6.797621,False,False,1.46757,...,0.705435,12.5628,2.0724,True,F,0.445318,False,UE,mens,8644
1,4,2,B,33.38264,1.114202,2.540801,2.608708,False,True,2.311931,...,3.8566,12.3544,5.1124,False,B,0.432434,False,FE,mens,1182
2,23,1,B,22.31669,-0.254046,3.533166,9.435749,False,False,3.903728,...,2.908892,13.862,1.6564,False,F,0.397538,True,FE,mens,9042
3,9,1,F,36.837309,0.766694,0.586885,3.34218,True,False,0.583745,...,0.557554,14.2596,0.1606,True,B,0.671984,True,UE,mens,1222
4,4,1,B,35.544208,0.116162,0.918725,5.499119,False,False,2.333456,...,3.945317,11.3658,1.1082,False,F,0.340411,False,W,mens,4085
5,10,1,B,39.012186,0.349889,0.485098,3.264708,True,False,0.804337,...,5.007207,13.422,2.0316,True,F,0.53395,False,UE,mens,292


In [5]:
pd.unique(data.outcome) #printing unique values 

array(['UE', 'FE', 'W'], dtype=object)

<b>Outcome variable - classes</b>
<ul>Winner – the point winning player hits a shot that is not touched by the opponent</ul>
<ul>Forced error – the point winning player hits a shot that causes the opponent to not be able to return it, i.e. a good shot that is hard to handle</ul>
<ul>Unforced error – the player attempting to return the ball makes an error on an otherwise normal looking rally shot</ul>

In [6]:
len(data.columns)

27

In [7]:
data.dtypes

rally                                   int64
serve                                   int64
hitpoint                               object
speed                                 float64
net.clearance                         float64
distance.from.sideline                float64
depth                                 float64
outside.sideline                         bool
outside.baseline                         bool
player.distance.travelled             float64
player.impact.depth                   float64
player.impact.distance.from.center    float64
player.depth                          float64
player.distance.from.center           float64
previous.speed                        float64
previous.net.clearance                float64
previous.distance.from.sideline       float64
previous.depth                        float64
opponent.depth                        float64
opponent.distance.from.center         float64
same.side                                bool
previous.hitpoint                 

In [12]:
categorical_list =['hitpoint','outside.sideline','outside.baseline','same.side','previous.hitpoint','server.is.impact.player','gender','outcome']

In [13]:
for i in data[categorical_list]:
    data[i] = data[i].astype('category')

In [14]:
data.dtypes

rally                                    int64
serve                                    int64
hitpoint                              category
speed                                  float64
net.clearance                          float64
distance.from.sideline                 float64
depth                                  float64
outside.sideline                      category
outside.baseline                      category
player.distance.travelled              float64
player.impact.depth                    float64
player.impact.distance.from.center     float64
player.depth                           float64
player.distance.from.center            float64
previous.speed                         float64
previous.net.clearance                 float64
previous.distance.from.sideline        float64
previous.depth                         float64
opponent.depth                         float64
opponent.distance.from.center          float64
same.side                             category
previous.hitp

In [15]:
data.drop(['ID'], axis=1, inplace=True)
len(data.columns)

26

In [16]:
data.describe()

Unnamed: 0,rally,serve,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
count,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0
mean,5.966004,1.3987,30.806938,0.629658,1.46763,4.421146,2.690463,11.899694,1.919544,12.253954,1.213795,28.763676,0.821562,2.19342,4.218717,12.61681,2.367952,0.549988
std,3.548182,0.489661,7.298917,0.982504,1.108697,3.144965,1.713136,2.788231,1.205449,2.039085,0.964364,6.47747,0.674663,1.038942,2.052946,2.075401,1.313927,0.186788
min,3.0,1.0,5.176078,-0.998184,0.000497,0.003135,0.0,2.156,0.0002,1.3898,0.0004,8.449117,0.028865,0.000164,0.000467,2.1612,0.0002,0.003201
25%,3.0,1.0,26.77029,-0.027092,0.5395,1.641161,1.444233,11.2214,0.9424,11.3742,0.5518,24.033218,0.404815,1.354458,2.733674,12.0824,1.3522,0.432164
50%,5.0,1.0,32.41769,0.44587,1.210847,3.860266,2.360894,12.6918,1.8294,12.5516,0.9838,29.793417,0.658382,2.168822,4.126864,12.9016,2.332,0.507559
75%,7.0,2.0,35.681431,0.970844,2.215955,7.029345,3.565853,13.553,2.7452,13.498,1.5966,33.581003,1.021397,3.022677,5.595515,13.7128,3.259,0.624135
max,38.0,2.0,55.052795,12.815893,7.569757,11.886069,14.480546,18.1256,7.7462,18.7458,9.3526,54.207506,6.730275,4.114361,9.997963,20.211,6.8526,1.635257


In [17]:
data.describe(include=['category'])

Unnamed: 0,hitpoint,outside.sideline,outside.baseline,same.side,previous.hitpoint,server.is.impact.player,outcome,gender
count,8001,8001,8001,8001,8001,8001,8001,8001
unique,4,2,2,2,4,2,3,2
top,F,False,False,False,F,True,UE,mens
freq,4402,6500,6380,6036,3684,4670,3501,4005


In [18]:
data.isnull().sum()

rally                                 0
serve                                 0
hitpoint                              0
speed                                 0
net.clearance                         0
distance.from.sideline                0
depth                                 0
outside.sideline                      0
outside.baseline                      0
player.distance.travelled             0
player.impact.depth                   0
player.impact.distance.from.center    0
player.depth                          0
player.distance.from.center           0
previous.speed                        0
previous.net.clearance                0
previous.distance.from.sideline       0
previous.depth                        0
opponent.depth                        0
opponent.distance.from.center         0
same.side                             0
previous.hitpoint                     0
previous.time.to.net                  0
server.is.impact.player               0
outcome                               0


In [19]:
data.columns

Index(['rally', 'serve', 'hitpoint', 'speed', 'net.clearance',
       'distance.from.sideline', 'depth', 'outside.sideline',
       'outside.baseline', 'player.distance.travelled', 'player.impact.depth',
       'player.impact.distance.from.center', 'player.depth',
       'player.distance.from.center', 'previous.speed',
       'previous.net.clearance', 'previous.distance.from.sideline',
       'previous.depth', 'opponent.depth', 'opponent.distance.from.center',
       'same.side', 'previous.hitpoint', 'previous.time.to.net',
       'server.is.impact.player', 'outcome', 'gender'],
      dtype='object')

In [20]:
numeric_list = ['rally','serve','speed','net.clearance','distance.from.sideline','depth','player.distance.travelled','player.impact.depth','player.impact.distance.from.center','player.depth','player.distance.from.center','previous.speed','previous.net.clearance','previous.distance.from.sideline','previous.depth','opponent.depth','opponent.distance.from.center','previous.time.to.net']
numeric_data = data[numeric_list]

In [21]:
categorical_list = ["hitpoint","outside.sideline",
                    "outside.baseline","same.side","previous.hitpoint","server.is.impact.player","gender"]

In [22]:
from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()  ##Use LE best for target
for i in categorical_list:  ####uses indices to count unlike dummies
    le1.fit(data[i])        ####however follows alphabetic order
    data[i] = le1.transform(data[i])

In [23]:
data.head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender
0,4,1,0,35.515042,-0.021725,3.474766,6.797621,0,0,1.46757,...,2.449182,0.705435,12.5628,2.0724,1,1,0.445318,0,UE,0
1,4,2,0,33.38264,1.114202,2.540801,2.608708,0,1,2.311931,...,0.583291,3.8566,12.3544,5.1124,0,0,0.432434,0,FE,0
2,23,1,0,22.31669,-0.254046,3.533166,9.435749,0,0,3.903728,...,1.11525,2.908892,13.862,1.6564,0,1,0.397538,1,FE,0
3,9,1,1,36.837309,0.766694,0.586885,3.34218,1,0,0.583745,...,3.256695,0.557554,14.2596,0.1606,1,0,0.671984,1,UE,0
4,4,1,0,35.544208,0.116162,0.918725,5.499119,0,0,2.333456,...,1.431146,3.945317,11.3658,1.1082,0,1,0.340411,0,W,0


In [24]:
# Divide into train and test
y=data["outcome"]
X=data.drop('outcome', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6400, 25)
(1601, 25)
(6400,)
(1601,)


In [25]:
y_train.value_counts()

UE    2792
W     2131
FE    1477
Name: outcome, dtype: int64

In [26]:
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 

In [27]:
X_train.dtypes

rally                                   int64
serve                                   int64
hitpoint                                int64
speed                                 float64
net.clearance                         float64
distance.from.sideline                float64
depth                                 float64
outside.sideline                        int64
outside.baseline                        int64
player.distance.travelled             float64
player.impact.depth                   float64
player.impact.distance.from.center    float64
player.depth                          float64
player.distance.from.center           float64
previous.speed                        float64
previous.net.clearance                float64
previous.distance.from.sideline       float64
previous.depth                        float64
opponent.depth                        float64
opponent.distance.from.center         float64
same.side                               int64
previous.hitpoint                 

In [28]:
continuous_cols=['rally','serve','speed','net.clearance','distance.from.sideline','depth','player.distance.travelled','player.impact.depth','player.impact.distance.from.center','player.depth','player.distance.from.center','previous.speed','previous.net.clearance','previous.distance.from.sideline','previous.depth','opponent.depth','opponent.distance.from.center','previous.time.to.net']
categorical_cols=["hitpoint","outside.sideline",
                    "outside.baseline","same.side","previous.hitpoint","server.is.impact.player","gender"]

In [29]:
scaler = StandardScaler()
scaler.fit(X_train[continuous_cols])

X_train_num = pd.DataFrame(scaler.transform(X_train[continuous_cols]), columns=continuous_cols)
X_test_num = pd.DataFrame(scaler.transform(X_test[continuous_cols]), columns=continuous_cols)

In [30]:
X_train_num.head()

Unnamed: 0,rally,serve,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
0,0.284001,1.223152,0.270615,0.002334,-0.551396,-0.802488,-0.604128,0.476466,-1.026313,-0.246157,-1.238808,0.926002,-0.23931,0.884331,-0.680614,0.66732,-1.112562,-0.372215
1,0.284001,1.223152,1.423615,-1.101974,0.563017,1.990905,-0.59616,-2.040272,-1.359122,-2.178633,-0.343632,-1.4457,2.632303,-2.010304,-1.172311,1.12062,0.421476,-1.289968
2,-0.835732,-0.81756,1.784595,0.387496,0.015511,-0.696868,-1.123388,-0.186634,-0.542728,-0.710333,-0.391389,0.135949,-0.46982,1.830155,1.473939,2.198187,-1.035213,0.790395
3,-0.835732,-0.81756,-0.892546,-0.794785,1.004401,1.191173,-0.983148,0.194731,-1.363776,-0.359815,-0.662629,1.121188,-0.361474,1.342447,-1.70061,0.002965,1.520349,-0.796737
4,-0.275865,1.223152,1.313516,-0.897582,1.719093,1.246119,-1.367636,0.485582,0.15597,0.610416,0.704737,0.737485,-0.332466,1.649208,-0.484559,1.523882,0.243329,0.099646


In [31]:
print(X_train.hitpoint.value_counts())
print(X_train['outside.sideline'].value_counts())

1    3527
0    2419
2     345
3     109
Name: hitpoint, dtype: int64
0    5190
1    1210
Name: outside.sideline, dtype: int64


In [32]:
ohe = OneHotEncoder()

ohe.fit(X_train[categorical_cols])

columns_ohe = list(ohe.get_feature_names())
print(columns_ohe)

['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x1_0', 'x1_1', 'x2_0', 'x2_1', 'x3_0', 'x3_1', 'x4_0', 'x4_1', 'x4_2', 'x4_3', 'x5_0', 'x5_1', 'x6_0', 'x6_1']


In [33]:
X_train_cat = ohe.transform(X_train[categorical_cols])
X_test_cat = ohe.transform(X_test[categorical_cols])

In [34]:
X_train_cat = pd.DataFrame(X_train_cat.todense(), columns=columns_ohe)
X_test_cat = pd.DataFrame(X_test_cat.todense(), columns=columns_ohe)

In [35]:
X_train_cat.head()

Unnamed: 0,x0_0,x0_1,x0_2,x0_3,x1_0,x1_1,x2_0,x2_1,x3_0,x3_1,x4_0,x4_1,x4_2,x4_3,x5_0,x5_1,x6_0,x6_1
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [36]:
# X_train_num = X_train.drop(columns=categorical_cols, axis=1)
# X_test_num = X_test.drop(columns=categorical_cols, axis=1)

In [38]:
X_train = pd.concat([X_train_cat, X_train_num], axis=1)
X_test = pd.concat([X_test_cat, X_test_num], axis=1)

In [39]:
X_train.head()

Unnamed: 0,x0_0,x0_1,x0_2,x0_3,x1_0,x1_1,x2_0,x2_1,x3_0,x3_1,...,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,-1.026313,-0.246157,-1.238808,0.926002,-0.23931,0.884331,-0.680614,0.66732,-1.112562,-0.372215
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,-1.359122,-2.178633,-0.343632,-1.4457,2.632303,-2.010304,-1.172311,1.12062,0.421476,-1.289968
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.542728,-0.710333,-0.391389,0.135949,-0.46982,1.830155,1.473939,2.198187,-1.035213,0.790395
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-1.363776,-0.359815,-0.662629,1.121188,-0.361474,1.342447,-1.70061,0.002965,1.520349,-0.796737
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.15597,0.610416,0.704737,0.737485,-0.332466,1.649208,-0.484559,1.523882,0.243329,0.099646


In [40]:
print(type(X_train))
print(type(X_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [41]:
X_train = X_train.values
X_test = X_test.values

print(type(X_train))
print(type(X_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [42]:
#Create adaboost-decision tree classifier object 

Adaboost_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600, learning_rate=1)

In [43]:
%time Adaboost_model.fit(X_train,y_train)

CPU times: user 19.4 s, sys: 41.3 ms, total: 19.4 s
Wall time: 19.4 s


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [44]:
y_preds = Adaboost_model.predict(X_test)

In [45]:
print(accuracy_score(y_test,y_preds))

0.8307307932542161


In [46]:
param_grid = {'n_estimators' : [100, 150, 200],
              'learning_rate' : [0.1, 0.5, 0.9]}

Adaboost_model_clf = GridSearchCV(AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2)), param_grid, n_jobs=-1)

In [47]:
%time Adaboost_model_clf.fit(X_train, y_train)

CPU times: user 5.07 s, sys: 101 ms, total: 5.17 s
Wall time: 46.6 s


GridSearchCV(cv=None, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=2,
                                                                                max_features=None,
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
 

In [48]:
best_ada_model = Adaboost_model_clf.best_estimator_
print(Adaboost_model_clf.best_score_, Adaboost_model_clf.best_params_)

0.8535937499999999 {'learning_rate': 0.1, 'n_estimators': 150}


In [49]:
y_pred_test = best_ada_model.predict(X_test)

In [50]:
print(accuracy_score(y_test,y_pred_test))

0.8600874453466584


In [51]:
print(confusion_matrix(y_test, y_pred_test))

[[223 108  10]
 [ 49 627  33]
 [ 10  14 527]]


<b>Building the Gradient Boosting Classifier</b>

In [52]:
GBM_model = GradientBoostingClassifier(n_estimators=50,
                                      learning_rate=0.3,
                                      subsample=0.8)

In [53]:
%time GBM_model.fit(X=X_train, y=y_train)

CPU times: user 4.04 s, sys: 10.6 ms, total: 4.05 s
Wall time: 4.06 s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.3, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [54]:
y_pred = GBM_model.predict(X_test)

In [55]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred,digits=4))

0.8663335415365396
              precision    recall  f1-score   support

          FE     0.7568    0.7390    0.7478       341
          UE     0.8691    0.8618    0.8654       709
           W     0.9274    0.9510    0.9391       551

    accuracy                         0.8663      1601
   macro avg     0.8511    0.8506    0.8508      1601
weighted avg     0.8653    0.8663    0.8657      1601



In [56]:
#Model in use 

GBM = GradientBoostingClassifier()
#use a grid over parameters of interest 
param_grid = {'n_estimators':[100,150],
              'max_depth':[5,10],
              'learning_rate': [0.1,0.2]}
CV_GBM = GridSearchCV(estimator=GBM, param_grid=param_grid,cv=5)

In [57]:
%time CV_GBM.fit(X=X_train,y=y_train)

CPU times: user 13min 47s, sys: 1.44 s, total: 13min 49s
Wall time: 13min 50s


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [58]:
# Find best model 
best_gbm_model = CV_GBM.best_estimator_
print(CV_GBM.best_score_,CV_GBM.best_params_)

0.8737499999999999 {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150}


In [59]:
print(confusion_matrix(y_test,y_pred_test))

[[223 108  10]
 [ 49 627  33]
 [ 10  14 527]]
