# First Serve Djokovic
### A Model is trained, which can predict the direction of a First Serve of Djokovic (46,82% accuracy)
### A 2nd Model is trained, which will can predict the Error Type/Ace of a first serve of Djokovic
### To get a First serve, both models need to be used 

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# import data
df = pd.read_csv(r"C:\Users\carlo\TrainingsTool\Tennis\ML_Stuff\df_djoker_ready_to_split.csv", encoding="unicode_escape", low_memory=False)
df

Unnamed: 0.1,Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0,1,2,...,44,45,46,47,48,49,50,51,52,53
0,0,0-0,0,0,0,0,ND,"4n,",4,f18,...,,,,,,,,,,
1,1,0-15,0,0,0,0,ND,"4n,",6,f2n#,...,,,,,,,,,,
2,2,15-15,0,0,0,0,ND,"4*,",,,...,,,,,,,,,,
3,3,30-15,0,0,0,0,ND,"4n,",5,b3n@,...,,,,,,,,,,
4,4,40-15,0,0,0,0,ND,4,r28,f1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19319,19319,0-15,5,4,1,1,ND,6,f38,b3,...,,,,,,,,,,
19320,19320,0-30,5,4,1,1,ND,"6w,",5,b39,...,,,,,,,,,,
19321,19321,15-30,5,4,1,1,ND,4,b39,b3,...,,,,,,,,,,
19322,19322,30-30,5,4,1,1,ND,"4d,",4,f19,...,,,,,,,,,,


In [3]:
# drop redundant index
df = df.drop("Unnamed: 0", axis = 1)
df

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0,1,2,3,...,44,45,46,47,48,49,50,51,52,53
0,0-0,0,0,0,0,ND,"4n,",4,f18,f3,...,,,,,,,,,,
1,0-15,0,0,0,0,ND,"4n,",6,f2n#,,...,,,,,,,,,,
2,15-15,0,0,0,0,ND,"4*,",,,,...,,,,,,,,,,
3,30-15,0,0,0,0,ND,"4n,",5,b3n@,,...,,,,,,,,,,
4,40-15,0,0,0,0,ND,4,r28,f1,r2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19319,0-15,5,4,1,1,ND,6,f38,b3,b3,...,,,,,,,,,,
19320,0-30,5,4,1,1,ND,"6w,",5,b39,b2,...,,,,,,,,,,
19321,15-30,5,4,1,1,ND,4,b39,b3,b3n@,...,,,,,,,,,,
19322,30-30,5,4,1,1,ND,"4d,",4,f19,f3,...,,,,,,,,,,


In [4]:
# drop all columns starting from the "1" Column because we only want to predict 1st serve of djokovic
df = df.iloc[:, : 7]
df

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0
0,0-0,0,0,0,0,ND,"4n,"
1,0-15,0,0,0,0,ND,"4n,"
2,15-15,0,0,0,0,ND,"4*,"
3,30-15,0,0,0,0,ND,"4n,"
4,40-15,0,0,0,0,ND,4
...,...,...,...,...,...,...,...
19319,0-15,5,4,1,1,ND,6
19320,0-30,5,4,1,1,ND,"6w,"
19321,15-30,5,4,1,1,ND,4
19322,30-30,5,4,1,1,ND,"4d,"


In [5]:
# drop all rows where Djokovic is not serving
df_1 = df[df.Serving == "ND"]
df_1

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0
0,0-0,0,0,0,0,ND,"4n,"
1,0-15,0,0,0,0,ND,"4n,"
2,15-15,0,0,0,0,ND,"4*,"
3,30-15,0,0,0,0,ND,"4n,"
4,40-15,0,0,0,0,ND,4
...,...,...,...,...,...,...,...
19319,0-15,5,4,1,1,ND,6
19320,0-30,5,4,1,1,ND,"6w,"
19321,15-30,5,4,1,1,ND,4
19322,30-30,5,4,1,1,ND,"4d,"


In [6]:
# now that df_1 only has djocovic serving, we can drop the serving column because is does not add anything to the data anymore
df_1 = df_1.drop("Serving", axis = 1)
df_1

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,0
0,0-0,0,0,0,0,"4n,"
1,0-15,0,0,0,0,"4n,"
2,15-15,0,0,0,0,"4*,"
3,30-15,0,0,0,0,"4n,"
4,40-15,0,0,0,0,4
...,...,...,...,...,...,...
19319,0-15,5,4,1,1,6
19320,0-30,5,4,1,1,"6w,"
19321,15-30,5,4,1,1,4
19322,30-30,5,4,1,1,"4d,"


In [7]:
# seperate the Pts column to create pts_A and pts_B, splitting at the "-" seperator
df_1[["Pts_A", "Pts_B"]] = df_1.Pts.str.split("-", expand = True)
df_1

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,0,Pts_A,Pts_B
0,0-0,0,0,0,0,"4n,",0,0
1,0-15,0,0,0,0,"4n,",0,15
2,15-15,0,0,0,0,"4*,",15,15
3,30-15,0,0,0,0,"4n,",30,15
4,40-15,0,0,0,0,4,40,15
...,...,...,...,...,...,...,...,...
19319,0-15,5,4,1,1,6,0,15
19320,0-30,5,4,1,1,"6w,",0,30
19321,15-30,5,4,1,1,4,15,30
19322,30-30,5,4,1,1,"4d,",30,30


In [8]:
# drop the now redundant "Pts" Column
df_1 = df_1.drop("Pts", axis = 1)
df_1

Unnamed: 0,Gm1,Gm2,Set1,Set2,0,Pts_A,Pts_B
0,0,0,0,0,"4n,",0,0
1,0,0,0,0,"4n,",0,15
2,0,0,0,0,"4*,",15,15
3,0,0,0,0,"4n,",30,15
4,0,0,0,0,4,40,15
...,...,...,...,...,...,...,...
19319,5,4,1,1,6,0,15
19320,5,4,1,1,"6w,",0,30
19321,5,4,1,1,4,15,30
19322,5,4,1,1,"4d,",30,30


In [9]:
# rearrange the columns in the dataframe
df_1 = df_1[["Pts_A", "Pts_B", "Gm1", "Gm2", "Set1", "Set2", "0"]]
df_1

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,"4n,"
1,0,15,0,0,0,0,"4n,"
2,15,15,0,0,0,0,"4*,"
3,30,15,0,0,0,0,"4n,"
4,40,15,0,0,0,0,4
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6
19320,0,30,5,4,1,1,"6w,"
19321,15,30,5,4,1,1,4
19322,30,30,5,4,1,1,"4d,"


In [10]:
# Now we need to translate "AD" in Pts_A and Pts_B to "50"
columns = ["Pts_A", "Pts_B"]
for i in columns:
    df_1[str(i)] = df_1[str(i)].str.replace('AD', '50')

df_1

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,"4n,"
1,0,15,0,0,0,0,"4n,"
2,15,15,0,0,0,0,"4*,"
3,30,15,0,0,0,0,"4n,"
4,40,15,0,0,0,0,4
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6
19320,0,30,5,4,1,1,"6w,"
19321,15,30,5,4,1,1,4
19322,30,30,5,4,1,1,"4d,"


In [11]:
# dropping the "," because it does not add any information
df_1["0"] = df_1["0"].str.replace(",", "")
df_1

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,4n
1,0,15,0,0,0,0,4n
2,15,15,0,0,0,0,4*
3,30,15,0,0,0,0,4n
4,40,15,0,0,0,0,4
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6
19320,0,30,5,4,1,1,6w
19321,15,30,5,4,1,1,4
19322,30,30,5,4,1,1,4d


In [12]:
# seperating all chars in the column "0" into multiple columns (1 for each char) translating them and putting them back together
#df_1.head(50)
df_test = df_1
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,4n
1,0,15,0,0,0,0,4n
2,15,15,0,0,0,0,4*
3,30,15,0,0,0,0,4n
4,40,15,0,0,0,0,4
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6
19320,0,30,5,4,1,1,6w
19321,15,30,5,4,1,1,4
19322,30,30,5,4,1,1,4d


In [13]:
# filling in a space to seperate the chars
df_test["0"] = df_test["0"].str.replace("4", "4-")
df_test["0"] = df_test["0"].str.replace("5", "5-")
df_test["0"] = df_test["0"].str.replace("6", "6-")
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,4-n
1,0,15,0,0,0,0,4-n
2,15,15,0,0,0,0,4-*
3,30,15,0,0,0,0,4-n
4,40,15,0,0,0,0,4-
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6-
19320,0,30,5,4,1,1,6-w
19321,15,30,5,4,1,1,4-
19322,30,30,5,4,1,1,4-d


In [14]:
df_test["0"] = df_test["0"].str.replace(" ", "")
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0
0,0,0,0,0,0,0,4-n
1,0,15,0,0,0,0,4-n
2,15,15,0,0,0,0,4-*
3,30,15,0,0,0,0,4-n
4,40,15,0,0,0,0,4-
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6-
19320,0,30,5,4,1,1,6-w
19321,15,30,5,4,1,1,4-
19322,30,30,5,4,1,1,4-d


In [15]:
# seperating on the "-" 
df_test[["0_dir", "0_spec"]] = df_test["0"].str.split("-", expand = True)
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0,0_dir,0_spec
0,0,0,0,0,0,0,4-n,4,n
1,0,15,0,0,0,0,4-n,4,n
2,15,15,0,0,0,0,4-*,4,*
3,30,15,0,0,0,0,4-n,4,n
4,40,15,0,0,0,0,4-,4,
...,...,...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6-,6,
19320,0,30,5,4,1,1,6-w,6,w
19321,15,30,5,4,1,1,4-,4,
19322,30,30,5,4,1,1,4-d,4,d


In [16]:
df_test = df_test.drop("0", axis=1)
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0_dir,0_spec
0,0,0,0,0,0,0,4,n
1,0,15,0,0,0,0,4,n
2,15,15,0,0,0,0,4,*
3,30,15,0,0,0,0,4,n
4,40,15,0,0,0,0,4,
...,...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6,
19320,0,30,5,4,1,1,6,w
19321,15,30,5,4,1,1,4,
19322,30,30,5,4,1,1,4,d


In [17]:
df_test["0_spec"] = df_test["0_spec"].str.replace("n", "110")
df_test["0_spec"] = df_test["0_spec"].str.replace("w", "119")
df_test["0_spec"] = df_test["0_spec"].str.replace("d", "100")
df_test["0_spec"] = df_test["0_spec"].str.replace("x", "120")
df_test["0_spec"] = df_test["0_spec"].str.replace("*", "42")
df_test["0_spec"] = df_test["0_spec"].str.replace("+", "43")
df_test["0_spec"] = df_test["0_spec"].str.replace("#", "35")
df_test["0_spec"] = df_test["0_spec"].str.replace("C", "67")
df_test

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0_dir,0_spec
0,0,0,0,0,0,0,4,110
1,0,15,0,0,0,0,4,110
2,15,15,0,0,0,0,4,42
3,30,15,0,0,0,0,4,110
4,40,15,0,0,0,0,4,
...,...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6,
19320,0,30,5,4,1,1,6,119
19321,15,30,5,4,1,1,4,
19322,30,30,5,4,1,1,4,100


In [18]:
df_test["0_dir"] = df_test["0_dir"].str.replace("c", "99")
df_test["0_dir"] = df_test["0_dir"].str.replace("g", "103")

In [19]:
df_test.to_csv("first_serve_pred_djoko_data.csv")

In [20]:
df_direction_pred_data = df_test.drop("0_spec", axis=1)
df_direction_pred_data

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0_dir
0,0,0,0,0,0,0,4
1,0,15,0,0,0,0,4
2,15,15,0,0,0,0,4
3,30,15,0,0,0,0,4
4,40,15,0,0,0,0,4
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,6
19320,0,30,5,4,1,1,6
19321,15,30,5,4,1,1,4
19322,30,30,5,4,1,1,4


In [21]:
df_specs_pred_data = df_test.drop("0_dir", axis=1)
df_specs_pred_data

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2,0_spec
0,0,0,0,0,0,0,110
1,0,15,0,0,0,0,110
2,15,15,0,0,0,0,42
3,30,15,0,0,0,0,110
4,40,15,0,0,0,0,
...,...,...,...,...,...,...,...
19319,0,15,5,4,1,1,
19320,0,30,5,4,1,1,119
19321,15,30,5,4,1,1,
19322,30,30,5,4,1,1,100


In [22]:
# now we create the model to predict the first serve direction of djokovic
X = df_direction_pred_data.drop("0_dir", axis=1)
y = df_direction_pred_data["0_dir"]

In [23]:
X

Unnamed: 0,Pts_A,Pts_B,Gm1,Gm2,Set1,Set2
0,0,0,0,0,0,0
1,0,15,0,0,0,0
2,15,15,0,0,0,0
3,30,15,0,0,0,0
4,40,15,0,0,0,0
...,...,...,...,...,...,...
19319,0,15,5,4,1,1
19320,0,30,5,4,1,1
19321,15,30,5,4,1,1
19322,30,30,5,4,1,1


In [24]:
clf = RandomForestClassifier(n_estimators=30)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [26]:
clf.fit(X_train, y_train)

In [27]:
y_preds = clf.predict(X_test)
y_preds

array(['6', '4', '6', ..., '6', '6', '4'], dtype=object)

In [28]:
y_test

9243     6
17243    4
3910     6
872      4
10480    4
        ..
12492    4
4549     4
1829     6
13911    4
14730    4
Name: 0_dir, Length: 2813, dtype: object

In [29]:
clf.score(X_test, y_test)

0.44472093849982225

In [30]:
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 44.79%

Trying model with 20 estimators...
Model accuracy on test set: 44.76%

Trying model with 30 estimators...


Model accuracy on test set: 44.47%

Trying model with 40 estimators...
Model accuracy on test set: 45.08%

Trying model with 50 estimators...
Model accuracy on test set: 44.97%

Trying model with 60 estimators...
Model accuracy on test set: 44.26%

Trying model with 70 estimators...
Model accuracy on test set: 44.05%

Trying model with 80 estimators...
Model accuracy on test set: 44.54%

Trying model with 90 estimators...
Model accuracy on test set: 44.33%

