In [32]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import pygraphviz

In [33]:
df = pd.read_csv('../data/exploded-serves.csv')
df = df[df['Direction'] != 'Fault']

In [34]:
X = df.reset_index(drop=True).drop(columns=['Direction', 'Unnamed: 0'])
print(X['GamesReturner'])
y = df['Direction']

0        0
1        0
2        0
3        0
4        0
        ..
65639    3
65640    3
65641    3
65642    3
65643    3
Name: GamesReturner, Length: 65644, dtype: int64


#### Encode X

In [35]:
categorical_columns = ['Surface', 'FullNameServer', 'HandReturner'] + [word for word in X.columns if word.startswith('previous_directions')]
print(categorical_columns)

['Surface', 'FullNameServer', 'HandReturner', 'previous_directions0', 'previous_directions1', 'previous_directions2', 'previous_directions3', 'previous_directions4']


In [36]:
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True, dtype='bool')

In [37]:
dict_for_replacing = {
    '0': 0,
    '15': 1,
    '30': 2,
    '40': 3,
    'AD': 4,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8,
    '9': 9,
    '10': 10,
    '11': 11,
    '12': 12,
    '13': 13,
    '14': 14,
    '15': 15
}

X['PointsReturner'] = X['PointsReturner'].replace(to_replace=dict_for_replacing)
X['PointsServer'] = X['PointsServer'].replace(to_replace=dict_for_replacing)

#### Encode Y

In [38]:
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

## Model

In [53]:
display(X.dtypes)

PointsServer                         int64
PointsReturner                       int64
GamesServer                          int64
GamesReturner                        int64
SetsServer                           int64
SetsReturner                         int64
IsFirstServe                          bool
previous_point_server_wins0          int64
previous_point_server_wins1          int64
previous_point_server_wins2          int64
previous_point_server_wins3          int64
previous_point_server_wins4          int64
previous_rally_length0               int64
previous_rally_length1               int64
previous_rally_length2               int64
previous_rally_length3               int64
previous_rally_length4               int64
previous_serve_is_first0              bool
previous_serve_is_first1              bool
previous_serve_is_first2              bool
previous_serve_is_first3              bool
previous_serve_is_first4              bool
Surface_Clay                          bool
Surface_Gra

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=0.2)
simple_rf = RandomForestClassifier(max_depth=1, max_features=5, n_estimators=1200)
simple_rf.fit(X_train, y_train)
y_pred = simple_rf.predict(X_test)
y_train_pred = simple_rf.predict(X_train)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy test:", test_accuracy)

train_accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy train:", train_accuracy)

Accuracy test: 0.4302688704394851
Accuracy train: 0.43359040274207367


In [50]:
# Export the first three decision trees from the forest

for i in range(3):
    tree = simple_rf.estimators_[i]
    dotfile = open(f"../dtree{i}.dot", 'w')
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,                                 
                               impurity=False, 
                               proportion=True,
                               out_file=dotfile)
    dotfile.close()    

In [41]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, encoded_y)
dummy_clf.predict(X)
print(dummy_clf.score(X, encoded_y))

0.4329260861617208


In [42]:
param_dist = {'n_estimators': randint(150,1000),
              'max_depth': randint(1,20),
              'max_features': randint(5,10)}

# Create a random forest classifier
rf = RandomForestClassifier()

# Use random search to find the best hyperparameters
rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=15, 
                                 cv=10)

rand_search.fit(X_train, y_train)
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

y_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy test:", test_accuracy)

Best hyperparameters: {'max_depth': 14, 'max_features': 9, 'n_estimators': 931}
Accuracy test: 0.4768070683220352


In [44]:
y_pred_train = best_rf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy train:", train_accuracy)

Accuracy train: 0.8418356660001904
