In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate

In [2]:
def add_data_info():
    df1 = pd.read_csv('https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/master/charting-m-points-from-2017.csv', encoding= 'unicode_escape', low_memory=False)
    df2 = pd.read_csv('raw_data/charting-m-matches.csv', encoding= 'unicode_escape', low_memory=False)
    
    #get separated values from m-matches list of games - based on matchid
    df = pd.merge(df1,df2[['match_id','Date','Tournament','Round','Surface','Player 1','Player 2']], on=['match_id'], how='left')
    
    #add the condition of "Grand Slam" - 5 or 3 sets
    conditions = df['Tournament'].isin(["Wimbledon", "Australian Open", "US Open", "Roland Garros"])
    values = [1,0]

    # Create a new column  based on the conditions and values
    df['Grand Slam'] = pd.Series(pd.NA)
    df.loc[conditions, 'Grand Slam'] = values[0]
    df.loc[~conditions, 'Grand Slam'] = values[1]
    
    # save the original dataset with enriched data for future features
    df.to_csv('raw_data/charting-m-points-from-2017-enriched.csv', index=False)

In [3]:
def filter_player(df,opponent):
    
    # Read the dataset into a pandas DataFrame - from enriched file
    df = pd.read_csv(df, encoding= 'unicode_escape', low_memory=False)
    
    ## filtra pelo  oponente
    df = df[(df["Player 1"].str.contains(opponent)) | (df["Player 2"].str.contains(opponent))].copy()
    
    ##define se está sacando e se ganhou o ponto
    df['i_serve'] = np.where(((df['Svr'] == 1) & (df['Player 1'] == opponent )) | ((df['Svr'] == 2) & (df['Player 2'] == opponent)), 0, 1)
    df['i_win'] = np.where(((df['isSvrWinner'] == df['i_serve'])), 1, 0)
    df['is_second_service'] = np.where(((df['2nd'].isnull())), 0, 1)
    
    ## direção do saque
    # primeiro caractere do segundo saque
    df['dir_srv'] = df.loc[:, '2nd'].str[0]
    ## completa com o primeiro caractere do primeiro saque quando ele entrou
    df['dir_srv'].fillna(df.loc[:, '1st'].str[0], inplace=True)

    
    ##reseta o indice
    df.reset_index(inplace=True)
    
    ## salva arquivo e retorna dataset da funcao
    csv_file = "raw_data/" + opponent + "_points.csv"
    df.to_csv(csv_file, index=False)
    return df

In [114]:
add_data_info()

In [115]:
df = filter_player('raw_data/charting-m-points-from-2017-enriched.csv','Novak Djokovic')

In [116]:
df = df[df['dir_srv'].isin(['4','5','6'])]
df['rallyCount']=pd.to_numeric(df['rallyCount'])
df['dir_srv']=pd.to_numeric(df['dir_srv'])

In [154]:
features = ['Surface',
            'Round',
            'Grand Slam',
            'i_serve',
            'is_second_service',
            'dir_srv',
            'rallyCount',
            'isUnforced',
            'isForced'
            ]
X = df[features]
y = df['i_win']

In [155]:
feat_categorical = ['Surface','Round','dir_srv']

In [156]:
X['isUnforced'] = X['isUnforced'].replace({True: 1, False: 0})
X['isForced'] = X['isForced'].replace({True: 1, False: 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['isUnforced'] = X['isUnforced'].replace({True: 1, False: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['isForced'] = X['isForced'].replace({True: 1, False: 0})


In [139]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [157]:
preproc_categorical_baseline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"))

preproc_numerical_baseline = make_pipeline(
    MinMaxScaler())

In [158]:
preproc_baseline = make_column_transformer(
    (preproc_numerical_baseline, ['rallyCount']),
    (preproc_categorical_baseline, feat_categorical),
    remainder="drop")

preproc_baseline

In [159]:
pipe_baseline = make_pipeline(preproc_baseline, LogisticRegression())
pipe_baseline

In [160]:
score_baseline = cross_val_score(pipe_baseline, X, y, cv=5, scoring='accuracy').mean()
score_baseline

0.5169323309339731

In [127]:
X['rallyCount']

0         6
1         1
2         1
3         1
4        13
         ..
19319    18
19320    17
19321     3
19322    13
19323    21
Name: rallyCount, Length: 18751, dtype: int64

In [131]:
features_tmp = [
            'Grand Slam',
            'i_serve',
            'is_second_service',
            'dir_srv',
            'rallyCount'
            # 'isUnforced',
            # 'isForced',
            # 'isUnret'
            ]
X_tmp = df[features_tmp]
y_tmp = df['i_win']

In [132]:
model = LogisticRegression(max_iter=1000)
cv_results = cross_validate(model, X_tmp, y_tmp, cv=5)
accuracy = cv_results['test_score'].mean()
accuracy

0.640498441304541

In [150]:
df

0        1
1        0
2        0
3        0
4        0
        ..
19319    1
19320    0
19321    0
19322    0
19323    0
Name: i_win, Length: 18751, dtype: int64