In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [32]:
# Load the complete point-by-point CSV into memory.
# The location is kept in one constant so it is easy to spot and change.
POINTS_CSV_PATH = r"C:\Users\carlo\TrainingsTool\Tennis\Datasets\charting-m-points-from-2017-new.csv"
points_m_from_2017 = pd.read_csv(
    POINTS_CSV_PATH,
    encoding="unicode_escape",
    low_memory=False,
)

In [33]:
# Preview the first five rows of a handful of columns.
preview_columns = ["Pts", "Gm1", "Gm2", "Set1", "Set2", "Serving"]
points_m_from_2017[preview_columns].head()

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving
0,0-0,0,0,0,0,MC
1,0-15,0,0,0,0,MC
2,15-15,0,0,0,0,MC
3,15-30,0,0,0,0,MC
4,30-30,0,0,0,0,MC


In [34]:
# Keep only the points whose match_id mentions the player of interest.
is_djokovic_match = points_m_from_2017["match_id"].str.contains("Novak_Djokovic", case=True)
djokovic_points_from_2017 = points_m_from_2017.loc[is_djokovic_match]


In [6]:
# Number of points that survived the player filter.
djokovic_points_from_2017.shape[0]

19324

In [35]:
# Narrow the filtered points down to the score columns plus the two rally strings.
relevant_columns = ["Pts", "Gm1", "Gm2", "Set1", "Set2", "Serving", "1st", "2nd"]
test_df = djokovic_points_from_2017.loc[:, relevant_columns]
test_df.shape[0]

19324

In [36]:
# Cast both rally columns to str so they can be concatenated afterwards
# (a missing value becomes the text "nan").
test_df = test_df.astype(dict.fromkeys(["1st", "2nd"], str))
test_df.shape[0]

19324

In [37]:
# Combine the two rally strings into a single "1st,2nd" column, comma-separated.
# Both columns were cast to str in the previous step, so element-wise string
# concatenation yields exactly what a row-wise ",".join would, without calling
# a Python lambda once per row.
test_df["1st,2nd"] = test_df["1st"] + "," + test_df["2nd"]
test_df.head(5)

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,1st,2nd,"1st,2nd"
637,0-0,0,0,0,0,ND,4n,4f18f3s2f1f3b3n@,"4n,4f18f3s2f1f3b3n@"
638,0-15,0,0,0,0,ND,4n,6f2n#,"4n,6f2n#"
639,15-15,0,0,0,0,ND,4*,,"4*,nan"
640,30-15,0,0,0,0,ND,4n,5b3n@,"4n,5b3n@"
641,40-15,0,0,0,0,ND,4r28f1r2f3b3f2f1f1f3b3b3b1r2n#,,"4r28f1r2f3b3f2f1f1f3b3b3b1r2n#,nan"


In [40]:
# The combined "1st,2nd" column replaces the two raw rally columns.
# Re-assigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run once the columns are gone;
# the in-place version raised a KeyError on the second execution.
test_df = test_df.drop(columns=["1st", "2nd"], errors="ignore")
len(test_df)


19324

In [11]:
# Pull the combined rally column out of test_df.
# Selecting a single column yields a pandas Series, not a DataFrame.
ralley_df = test_df.loc[:, "1st,2nd"]
print(ralley_df)
type(ralley_df)

637                                    4n,4f18f3s2f1f3b3n@
638                                               4n,6f2n#
639                                                 4*,nan
640                                               4n,5b3n@
641                     4r28f1r2f3b3f2f1f1f3b3b3b1r2n#,nan
                                ...                       
295158        6f38b3b3b2f3b3s3f3b3f3b3f2f1f1f2s3s3b1n@,nan
295159           6w,5b39b2b2f3b3b3s3b3b3b3b3s3b3b1f1f1f2n@
295160                                      4b39b3b3n@,nan
295161                   4d,4f19f3b3b3b1f2b3s3f3b3b3b2b3n@
295162    6n,5f39b2f3b1f1f1f2b3b2b1f1f1f2b3b3b1f1f3s3f+-1*
Name: 1st,2nd, Length: 19324, dtype: object


pandas.core.series.Series

In [12]:
# Start from an empty frame that will collect the per-rally shot lists.
shot_list_dataframe = pd.DataFrame()

shot_list_dataframe.head()

In [16]:
# Split every rally string into one token per shot so that each shot can later
# become its own column to make predictions on.
#
# Tokenising rules:
#   * a character listed in `possible_shots` starts a new shot token,
#   * the "," between the 1st and 2nd string closes the current token
#     (the comma stays attached to that token),
#   * every other character is appended to the token currently being built.

possible_shots = ["f", "b", "r", "s", "v", "z", "o", "p", "y", "l", "m", "h", "i", "j", "k", "t", "u"]
second_serve = ","


def split_rally_into_shots(rally, shot_letters=possible_shots, separator=second_serve):
    """Break one combined rally string into its list of shot tokens.

    Parameters
    ----------
    rally : str
        Combined "1st,2nd" string of a single point, e.g. "4n,6f2n#".
    shot_letters : collection of str
        Characters that mark the beginning of a new shot.
    separator : str
        Character that separates the first string from the second one.

    Returns
    -------
    list of str
        The tokens in playing order. A trailing "nan" token (missing second
        string) is removed; if more than one token remains, the separator is
        stripped from the end of the last one.
    """
    shots = []
    current = ""
    for char in rally:
        if char == separator:
            # The separator ends the current token and is kept as part of it.
            shots.append(current + separator)
            current = ""
        elif char in shot_letters:
            # A shot letter ends the previous token and opens the next one.
            shots.append(current)
            current = char
        else:
            current += char
    # The token still being built when the string ends belongs to the rally too.
    shots.append(current)

    # "nan" is only the string form of a missing second string, not a shot.
    if shots[-1] == "nan":
        shots.pop()
    # Check the length first and use endswith() so an empty last token cannot
    # raise an IndexError.
    if len(shots) > 1 and shots[-1].endswith(separator):
        shots[-1] = shots[-1].rstrip(separator)
    return shots


shot_lists = [split_rally_into_shots(rally) for rally in ralley_df]

# Length of the longest rally in the dataset (0 when there are no rallies).
len_of_longest_ralley = max((len(shots) for shots in shot_lists), default=0)

# Build the table with a single concat: one column per rally, one row per shot
# position, shorter rallies padded with NaN. Growing the frame inside the loop
# was quadratic, and because it appended to the existing frame, re-running the
# cell duplicated every column.
if shot_lists:
    shot_list_dataframe = pd.concat(
        [pd.Series(shots) for shots in shot_lists],
        axis=1,
        ignore_index=True,
    )
else:
    shot_list_dataframe = pd.DataFrame()
shot_list_dataframe.head(5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19314,19315,19316,19317,19318,19319,19320,19321,19322,19323
0,"4n,","4n,","4*,","4n,",4,"4n,","4n,","6n,","4d,","6d,",...,"4d,","6n,","4n,",5,6,6,"6w,",4,"4d,","6n,"
1,4,6,,5,r28,6,5,5,6,5,...,4,6,5,b1d#,s18,f38,5,b39,4,5
2,f18,f2n#,,b3n@,f1,b37,b38,f28,f38,b3w@,...,s3n#,b37,b18*,,f+1,b3,b39,b3,f19,f39
3,f3,,,,r2,b2,s3,f3,f3,,...,,b3,,,l2,b3,b2,b3n@,f3,b2
4,s2,,,,f3,f3,b3,s2,b2,,...,,b1,,,o=1d@,b2,b2,,b3,f3


In [15]:
# Read the shot table back from disk (presumably an earlier export of the
# frame built above -- TODO confirm where the CSV was written).
SHOT_LIST_CSV_PATH = r"C:\Users\carlo\TrainingsTool\Tennis\ML_Stuff\shot_list_dataframe.csv"
shot_list_dataframe = pd.read_csv(
    SHOT_LIST_CSV_PATH,
    encoding="unicode_escape",
    low_memory=False,
)
shot_list_dataframe.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,19314,19315,19316,19317,19318,19319,19320,19321,19322,19323
0,0,"4n,","4n,","4*,","4n,",4,"4n,","4n,","6n,","4d,",...,"4d,","6n,","4n,",5,6,6,"6w,",4,"4d,","6n,"
1,1,4,6,,5,r28,6,5,5,6,...,4,6,5,b1d#,s18,f38,5,b39,4,5
2,2,f18,f2n#,,b3n@,f1,b37,b38,f28,f38,...,s3n#,b37,b18*,,f+1,b3,b39,b3,f19,f39


In [20]:
# Turn the shot table around: one row per rally, one column per shot position,
# so it can be placed next to test_df row by row.
# A table reloaded from CSV carries an extra "Unnamed: 0" column (it looks like
# the index that was saved along with the data). Remove that column by name
# instead of blindly dropping the first transposed row, so a table that was
# built in memory, which has no such column, does not lose its first rally.
shot_list_df_transposed = (
    shot_list_dataframe
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .transpose()
)
shot_list_df_transposed.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,"4n,",4,f18,f3,s2,f1,f3,b3n@,,,...,,,,,,,,,,
1,"4n,",6,f2n#,,,,,,,,...,,,,,,,,,,
2,"4*,",,,,,,,,,,...,,,,,,,,,,
3,"4n,",5,b3n@,,,,,,,,...,,,,,,,,,,
4,4,r28,f1,r2,f3,b3,f2,f1,f1,f3,...,,,,,,,,,,


In [68]:
# Sanity check before joining the two frames:
# they must have the same number of rows and be of the same type.
same_length = len(test_df) == len(shot_list_df_transposed)
same_length and type(test_df) == type(shot_list_df_transposed)


pandas.core.frame.DataFrame

In [104]:
# Give both frames a fresh 0..n-1 index so that a column-wise concat pairs
# their rows by position rather than by the old labels.
test_df = test_df.reset_index(drop=True)
shot_list_df_transposed = shot_list_df_transposed.reset_index(drop=True)

In [105]:
# Trial run of the join on the first 20 rows of each frame.
test_df_1, test_df_2 = test_df.head(20), shot_list_df_transposed.head(20)
test_df_3 = pd.concat([test_df_1, test_df_2], axis=1)
test_df_3

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,"1st,2nd",0,1,2,...,44,45,46,47,48,49,50,51,52,53
0,0-0,0,0,0,0,ND,"4n,4f18f3s2f1f3b3n@","4n,",4,f18,...,,,,,,,,,,
1,0-15,0,0,0,0,ND,"4n,6f2n#","4n,",6,f2n#,...,,,,,,,,,,
2,15-15,0,0,0,0,ND,"4*,nan","4*,",,,...,,,,,,,,,,
3,30-15,0,0,0,0,ND,"4n,5b3n@","4n,",5,b3n@,...,,,,,,,,,,
4,40-15,0,0,0,0,ND,"4r28f1r2f3b3f2f1f1f3b3b3b1r2n#,nan",4,r28,f1,...,,,,,,,,,,
5,0-0,1,0,0,0,CR,"4n,6b37b2f3s2f1f1f3w@","4n,",6,b37,...,,,,,,,,,,
6,15-0,1,0,0,0,CR,"4n,5b38s3b3b3b3b3b1*","4n,",5,b38,...,,,,,,,,,,
7,15-15,1,0,0,0,CR,"6n,5f28f3s2f+3b2v1d@","6n,",5,f28,...,,,,,,,,,,
8,15-30,1,0,0,0,CR,"4d,6f38f3b2f1f2f3d@","4d,",6,f38,...,,,,,,,,,,
9,15-40,1,0,0,0,CR,"6d,5b3w@","6d,",5,b3w@,...,,,,,,,,,,


In [107]:
# Put the per-shot columns next to the score columns; the raw "1st,2nd"
# string is dropped because its content now lives in the shot columns.
df_ready = (
    pd.concat([test_df, shot_list_df_transposed], axis=1)
    .drop(columns=["1st,2nd"])
)
df_ready

Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0,1,2,3,...,44,45,46,47,48,49,50,51,52,53
0,0-0,0,0,0,0,ND,"4n,",4,f18,f3,...,,,,,,,,,,
1,0-15,0,0,0,0,ND,"4n,",6,f2n#,,...,,,,,,,,,,
2,15-15,0,0,0,0,ND,"4*,",,,,...,,,,,,,,,,
3,30-15,0,0,0,0,ND,"4n,",5,b3n@,,...,,,,,,,,,,
4,40-15,0,0,0,0,ND,4,r28,f1,r2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19319,0-15,5,4,1,1,ND,6,f38,b3,b3,...,,,,,,,,,,
19320,0-30,5,4,1,1,ND,"6w,",5,b39,b2,...,,,,,,,,,,
19321,15-30,5,4,1,1,ND,4,b39,b3,b3n@,...,,,,,,,,,,
19322,30-30,5,4,1,1,ND,"4d,",4,f19,f3,...,,,,,,,,,,


In [115]:
# The dataset is prepared; next it has to be split into the frames the models
# will be trained on.
# TODO: first split the whole table into a return dataset and a serve dataset.
# TODO: from those two datasets, take all rallies of the same length as separate datasets.
# Note to future self: start with a set for 2 or 3 shots and look at the accuracy of that.
DF_ALL_CSV_PATH = r"C:\Users\carlo\TrainingsTool\Tennis\ML_Stuff\df_djoker_ready_to_split.csv"
df_all = pd.read_csv(DF_ALL_CSV_PATH, encoding="unicode_escape", low_memory=False)
df_all


Unnamed: 0.1,Unnamed: 0,Pts,Gm1,Gm2,Set1,Set2,Serving,0,1,2,...,44,45,46,47,48,49,50,51,52,53
0,0,0-0,0,0,0,0,ND,"4n,",4,f18,...,,,,,,,,,,
1,1,0-15,0,0,0,0,ND,"4n,",6,f2n#,...,,,,,,,,,,
2,2,15-15,0,0,0,0,ND,"4*,",,,...,,,,,,,,,,
3,3,30-15,0,0,0,0,ND,"4n,",5,b3n@,...,,,,,,,,,,
4,4,40-15,0,0,0,0,ND,4,r28,f1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19319,19319,0-15,5,4,1,1,ND,6,f38,b3,...,,,,,,,,,,
19320,19320,0-30,5,4,1,1,ND,"6w,",5,b39,...,,,,,,,,,,
19321,19321,15-30,5,4,1,1,ND,4,b39,b3,...,,,,,,,,,,
19322,19322,30-30,5,4,1,1,ND,"4d,",4,f19,...,,,,,,,,,,
