In [1]:
import pandas as pd
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score

def indexCoeff0ByCourt(nameCourt):
    switcher={
            'Clay':2,
            'Grass':3,
            'Hard':4,
            'I.hard':5
            }
    return switcher.get(nameCourt)

class TournamentStats:
    def __init__(self,id,dateT, name, speed, court,level,location
                 , sumActual, sumExp, nbMatches, coeff0Regression, coeffRegAceFor,coeffRegAceVs):
        self.id=id
        self.dateT = dateT
        self.name = name
        self.speed = speed
        self.court = court
        self.level=level
        self.location=location
        self.sumActual=sumActual
        self.sumExp=sumExp
        self.nbMatches=nbMatches
        self.coeff0Regression=coeff0Regression
        self.coeffRegAceFor=coeffRegAceFor
        self.coeffRegAceVs=coeffRegAceVs
    def __repr__(self):
        return '{},{},{},{},{},{},{},{},{},{},{},{},{}'.format(self.name,self.id,round(self.speed,2),
                                    self.dateT, self.court, self.level, self.location,
                                    round(self.sumActual,2), round(self.sumExp,2), self.nbMatches,
                                    round(self.coeff0Regression,2),round(self.coeffRegAceFor,2)
                                    ,round(self.coeffRegAceVs,2))
    def __cmp__(self, other):
        if hasattr(other, 'speed'):
            return self.speed.__cmp__(other.speed)
    def isCourt(self, courtName):
        return (courtName==self.court)
    @staticmethod
    def FilterByCourtName(list, courtName):
        return filter(isCourt(courtName), list)

def getTrnsSpeeds(listTrnIds, dataFromFile, coeffsRegression, coeff0Regression):
"""
Calculate the Speed of a Tournament (difference between Actual Aces and Expectes Aces)
"""
    listTrn=[]
    for trnId in listTrnIds:
        strClauseTrn = 'TrnId=='+str(trnId)+' & '
        dfMatchesForTrn=dataFromFile.copy(deep=True)
        dfMatchesForTrn.dropna()
        dfMatchesForTrn = dfMatchesForTrn.replace('NaN',-1).query(strClauseTrn + 'IsValidData>0 & IsEnoughData>0 & AceRatePlayer>=0 & AvgAceRateP>0  & AvgAceRateOpp>0')
        courtCoeff0Reg = 0
        #print('Coeff0 for Court=' + str(courtCoeff0Reg))
        #surface not taken into account here!! else add courtCoeff0Reg below
        expectedAce = (courtCoeff0Reg+coeff0Regression+coeffsRegression[0]*dfMatchesForTrn['AvgAceRateP']+coeffsRegression[1]*dfMatchesForTrn['AvgAceRateOpp'])
        expectedAce= [max(t, 0.01) for t in expectedAce]  #no negative values
        #diff=df['AceRatePlayer']-expectedAce
        #diff= max(diff, 0.01)
        sumActual=sum(dfMatchesForTrn['AceRatePlayer'])
        sumExp=sum(expectedAce)
        #print(df.iloc[0,2]+':'+str(statistics.mean(diff)))
        #print('Sum Ace Rates:'+str(sumActual))
        #print('Sum Exp Rates:'+str(sumExp))
        #print(df.iloc[0,0] + df.iloc[0,2]+'Diff:'+str(sumActual/sumExp))
        trn=TournamentStats(dfMatchesForTrn.iloc[0,0], dfMatchesForTrn.iloc[0,1], dfMatchesForTrn.iloc[0,2]
                            , sumActual/sumExp, dfMatchesForTrn.iloc[0,5]
                            , dfMatchesForTrn.iloc[0,3], dfMatchesForTrn.iloc[0,4], sumActual, sumExp
                            , len(dfMatchesForTrn), coeff0Regression, coeffsRegression[0],coeffsRegression[1])
        #print(trn)
        listTrn.append(trn)
    #listTrnClay=[t for t in listTrn if (x.isCourt('Clay'))]
    #listTrnClay=filter(lambda x: x.isCourt('Clay'), listTrn)
    #list(listTrnClay)
    #listTrnClay.sort(key=lambda x: x.speed, reverse=True)
    #sorted(listTrnClay, key=lambda x: x.speed, reverse=True)
    listTrn.sort(key=lambda x: x.name, reverse=True)
    print(*listTrn, sep="\n")
    #listTrNonClay=list(filter(lambda x: not x.isCourt('Clay'), listTrn))
    #sorted(listTrNonClay, key=lambda x: x.speed, reverse=True)
    #print(*listTrNonClay, sep="\n")

def displayAllMatches(trnId, dataFromFile, coeffsRegression, coeff0Regression):
    df=dataFromFile.copy(deep=True)
    df.dropna()
    df = df.replace('NaN',-1).query('TrnId=='+trnId+' & IsValidData>0 & IsEnoughData>0 & AceRatePlayer>0 & AvgAceRateP>0  & AvgAceRateOpp>0')     
    print(df)
    for index, row in df.iterrows():
        print(row['Player']+' in '+ row['P1'] + '/' + row['P2'])
        courtCoeff0Reg = 0
        expectedAce = (courtCoeff0Reg+coeff0Regression+coeffsRegression[0]*row['AvgAceRateP']+coeffsRegression[1]*row['AvgAceRateOpp'])
        expectedAce= [max(t, 0.01) for t in expectedAce]  #no negative values
        print('Res=' + str(row['AceRatePlayer']) + ' - Expected=' + str(round(expectedAce,2)) + ' From ' + str(row['AvgAceRateP']) +' vs '  + str(row['AvgAceRateOpp']))
        diff=(row['AceRatePlayer']-expectedAce)
        #diff= min(max(diff, 0.01),1.99)
        print(diff)

IndentationError: expected an indented block (<ipython-input-1-337dc7ee27d3>, line 53)

In [None]:
#LOAD DATA
dataFromFile = pd.read_csv('./Data/ATP_all_matches.csv')    
loops=0
while (loops<1):
    loops+=1
    indexCourt = '0'#input("Enter Surface Id(NonClay=0,Clay=1,Hard=2,Indoor=3,Grass=4): ")
    strClauseCourt=''
    if (indexCourt =='1'):
        strClauseCourt = 'CourtId=="Clay" & '
    if (indexCourt =='0'):
        strClauseCourt = 'CourtId!="Clay" & '
    if (indexCourt =='2'):
        strClauseCourt = 'CourtId=="Hard" & '
    if (indexCourt =='3'):
        strClauseCourt = 'CourtId=="I.hard" & '
    if (indexCourt =='4'):
        strClauseCourt = 'CourtId=="Grass" & '
    dfMatchesForTrn=dataFromFile.copy(deep=True)
    dfMatchesForTrn.dropna()
    dfMatchesForTrn = dfMatchesForTrn.replace('NaN',-1).query(strClauseCourt + 'IsValidData >0 & IsEnoughData>0 & AceRatePlayer>0 & AvgAceRateP>0  & AvgAceRateOpp>0')
    #print(dfMatchesForTrn)
    
    #dfMatchesForTrn=dfMatchesForTrn[(dfMatchesForTrn['AvgAceRateP']>=0) & (dfMatchesForTrn['AvgAceRateP']<5)]
    print('nb matches: ' + str(len(dfMatchesForTrn)))
    X=dfMatchesForTrn[["AvgAceRateP","AvgAceRateOpp","TrnSpeed2"]]
    X["AvgAceRateP"]=X.AvgAceRateP/100
    X["AvgAceRateOpp"]=X.AvgAceRateOpp/100
    Y=dfMatchesForTrn.AceRatePlayer
    #X=pd.concat[poly2d.fit_transform(X.AvgAceRateP),X.AvgAceRateOpp]


# EXPLORATORY DATA ANALYSIS

In [None]:
#PLOT
X.hist()
X.describe()    

In [None]:
Y.hist()
Y.describe()    

In [None]:
import seaborn as sns
sns.pairplot(X, diag_kind="kde")

# ANALYSIS of the couples: AvgAceRateP, AvgAceRateOpp
WHAT is the median of AceRatePlayer for each couple of features:
STEPS:
- round the values so they can be groupped
- make a pivot table indexed by AvgAceRateP,AvgAceRateOpp
- keep only when the number of rows is a minimum (20)

In [None]:
dfMatchesForTrn2=dfMatchesForTrn[["AceRatePlayer","AvgAceRateP","AvgAceRateOpp"]]
#round the values
#dfMatchesForTrn2.AceRatePlayer=dfMatchesForTrn2.AceRatePlayer.round()
dfMatchesForTrn2.AvgAceRateP=dfMatchesForTrn2.AvgAceRateP.round()
dfMatchesForTrn2.AvgAceRateOpp=dfMatchesForTrn2.AvgAceRateOpp.round()
#Make a pivot table indexed by the differents couples of features
dfpivot_aces=pd.pivot_table(dfMatchesForTrn2,index=["AvgAceRateP","AvgAceRateOpp"],aggfunc=["median", "count"])
#Keep only when the number of rows is a minimum
dfpivot_aces=dfpivot_aces[dfpivot_aces["count"]["AceRatePlayer"]>20]
dfpivot_aces=dfpivot_aces.reset_index()
dfpivot_aces[dfpivot_aces.AvgAceRateOpp==5].head(22)

- We knew the average of aces is about 8.0

HYPOTHESIS
- So when AvgAceRateOpp=8 we should see that AceRatePlayer=AvgAceRateP (because the 2nd feature is at the median and shouldnt impact much because the level of the opponent is really neutral)

CONLUSION
- Indeed for most couples we've got (AvgAceRateP,8)=AceRatePlayer

# PLOTS

In [None]:
s1=[len( (dfMatchesForTrn[ (dfMatchesForTrn.AvgAceRateP.round()==round(valX)) & (dfMatchesForTrn.AceRatePlayer==round(valY))]) )/5 for valX,valY in zip(dfMatchesForTrn.AvgAceRateP,dfMatchesForTrn.AceRatePlayer)]
s2=[len( (dfMatchesForTrn[ (dfMatchesForTrn.AvgAceRateOpp.round()==round(valX)) & (dfMatchesForTrn.AceRatePlayer==round(valY))]) )/5 for valX,valY in zip(dfMatchesForTrn.AvgAceRateOpp,dfMatchesForTrn.AceRatePlayer)]


In [None]:
plt.scatter(X.AvgAceRateP.round(), Y.round(), color='blue',s=s1)
plt.xlabel('AvgAceRateP')
plt.ylabel('ActualAceRateP')
plt.show()
plt.scatter(X.AvgAceRateOpp.round(), Y.round(), color='red',s=s2)
plt.xlabel('AvgAceRateOpp')
plt.ylabel('ActualAceRateP')
plt.show()

In [None]:
plt.scatter(Y_test, y_pred, color = 'blue',s=1)
plt.xlabel('True Values [aces]')
plt.ylabel('Predictions [aces]')
plt.xlim([0,15])
plt.ylim([0,15])
plt.plot([0,15], [0,15])
f=X.AvgAceRateP*coeffsRegression[0]+X.AvgAceRateOpp*coeffsRegression[1]+X.TrnSpeed2*coeffsRegression[2]+lm.intercept_
print(f.iloc[0])
#plt.plot(X.AvgAceRateP,f, '-r')
#plt.plot(train_x, coeffsRegression[0]*train_x + regr.intercept_[0], '-r')

# LINEAR REGRESSION

EVALUATION
we compare the actual values and predicted values to calculate the accuracy of a regression model. Evaluation metrics provide a key role in the development of a model, as it provides insight to areas that require improvement.

METRICS:
- Mean absolute error: It is the mean of the absolute value of the errors. This is the easiest of the metrics to understand since it’s just average error.
- Mean Squared Error (MSE): Mean Squared Error (MSE) is the mean of the squared error. It’s more popular than Mean absolute error because the focus is geared more towards large errors. This is due to the squared term exponentially increasing larger errors in comparison to smaller ones.
- Root Mean Squared Error (RMSE): R-squared is not error, but is a popular metric for accuracy of your model. It represents how close the data are to the fitted regression line. The higher the R-squared, the better the model fits your data. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).

In [None]:
from sklearn.metrics import r2_score

def print_metrics(Y_test, y_pred):    
    print('Mean absolute error: %.2f' % np.mean(np.absolute(y_pred - Y_test)))
    print('Residual sum of squares (MSE): %.2f' % np.mean((y_pred - Y_test) **2 ))
    print('R2-score: %.2f' % r2_score(Y_test, y_pred))

In [None]:
X=dfMatchesForTrn[["AvgAceRateP","AvgAceRateOpp","TrnSpeed2"]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=125)
lm=LinearRegression()
lm.fit(X_train, Y_train)
print(X.iloc[0,:])
#RESULTS
print('RESULTS FOR COURT:' + indexCourt)
coeffsLRegression=lm.coef_
print(coeffsLRegression)
#print(pd.DataFrame({'estimated coeff':coeffsRegression}, index=X_train.columns))
coeff0LRegression= lm.intercept_
print(coeff0LRegression)
y_pred=lm.predict(X_test)
print_metrics(Y_test, y_pred)

# POLYNOMIAL REGRESSION

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly=dfMatchesForTrn[["AvgAceRateP","AvgAceRateOpp","TrnSpeed2"]]
X_poly = poly.fit_transform(X_poly)
print(X_poly[0])

X_train, X_test, Y_train, Y_test = train_test_split(X_poly, Y, test_size=0.25, random_state=125)
lm=LinearRegression()
lm.fit(X_train, Y_train)

#RESULTS
print('RESULTS FOR COURT:' + indexCourt)
coeffsRegression=lm.coef_
print('COEFFS AceP(1) AceOpp(2) Trn(3) AceP**2(4) AceP*AceOpp(5) AceP*Trn(6) AceOpp**2(7) AceOpp*Trn(8) Trn**2(9)\n'+ str(coeffsRegression))
#print(pd.DataFrame({'estimated coeff':coeffsRegression}, index=X_train.columns))
coeff0Regression= lm.intercept_
print(coeff0Regression)
y_pred=lm.predict(X_test)

print_metrics(Y_test, y_pred)

Let's apply the regression coefficients on our table dfpivot_aces
We'll apply TrnSpeed=1 as it s the mean value

In [None]:
trnSpeed=1
dfpivot_aces["expAces_reg"] = [(coeff0LRegression+ace*coeffsLRegression[0]+acevs*coeffsLRegression[1]+trnSpeed*coeffsLRegression[2]) for (ace, acevs) in zip(dfpivot_aces.AvgAceRateP, dfpivot_aces.AvgAceRateOpp)]
dfpivot_aces["expAces_poly"] = [(coeff0Regression+ace*coeffsRegression[0]+acevs*coeffsRegression[1]+trnSpeed*coeffsRegression[2]+ace**2*coeffsRegression[3]+ace*acevs*coeffsRegression[4]+ace*trnSpeed*coeffsRegression[5]+acevs**2*coeffsRegression[6]+acevs*trnSpeed*coeffsRegression[7]+trnSpeed**2*coeffsRegression[8]) for (ace, acevs) in zip(dfpivot_aces.AvgAceRateP, dfpivot_aces.AvgAceRateOpp)]
dfpivot_aces["expAces_reg_gap"] = (dfpivot_aces["median"]["AceRatePlayer"]-dfpivot_aces["expAces_reg"]).abs()
dfpivot_aces["expAces_poly_gap"] = (dfpivot_aces["median"]["AceRatePlayer"]-dfpivot_aces["expAces_poly"]).abs()
dfpivot_aces.iloc[81:100,:]

# Application of Decision Tree regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=125)

dt_regressor = DecisionTreeRegressor(random_state = 125)
dt_regressor.fit(X_train,Y_train)
#Predicting using test set 
y_pred = dt_regressor.predict(X_test)
mae=metrics.mean_absolute_error(Y_test, y_pred)
mse=metrics.mean_squared_error(Y_test, y_pred)
# Printing the metrics
print('Suppport Vector Regression Accuracy: ', dt_regressor.score(X_test,Y_test))
print('R2 square:',metrics.r2_score(Y_test, y_pred))
print('MAE: ', mae)
print('MSE: ', mse)

# Application of Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators = 30 ,  random_state = 0)
rf_regressor.fit(X_train,Y_train)
#Predicting the SalePrices using test set 
y_pred = rf_regressor.predict(X_test)
mae=metrics.mean_absolute_error(Y_test, y_pred)
mse=metrics.mean_squared_error(Y_test, y_pred)
# Printing the metrics
print('Suppport Vector Regression Accuracy: ', rf_regressor.score(X_test,Y_test))
print('R2 square:',metrics.r2_score(Y_test, y_pred))
print('MAE: ', mae)
print('MSE: ', mse)

# Application of Support Vector Regression

In [None]:
from sklearn.svm import SVR
regressor= SVR(kernel='rbf')
regressor.fit(X_train,Y_train)
y_pred_svm=regressor.predict(X_test)
#y_pred_svm = cross_val_predict(regressor, x, y)
mae=metrics.mean_absolute_error(Y_test, y_pred_svm)
mse=metrics.mean_squared_error(Y_test, y_pred_svm)
# Printing the metrics
print('Suppport Vector Regression Accuracy: ', regressor.score(X_test,Y_test))
print('R2 square:',metrics.r2_score(Y_test, y_pred_svm))
print('MAE: ', mae)
print('MSE: ', mse)

# Application of Deep Learning using Keras library
Here is the deep learning model mentioned. A sequential model has been used. The model has been created as a function named build_model so that we can call it anytime it is required in the process. The model has two connected hidden layers with a Rectified Linear Unit (relu) function and an output layer with a linear function.

The hidden layers have 12 and 8 neurons respectively with all the 8 input variables. Mean Squared Error is the loss function here as it is the most common loss function in case of regression problems.

In [None]:
#TODO
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model():
  model = keras.Sequential([
    layers.Dense(12,kernel_initializer='normal', activation='relu', input_shape=[len(train_dataset.keys())]),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='linear')
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model
model = build_model()


In [None]:
model.summary()