# Predicting results

## Imports

In [1]:
import mysql.connector as sql
import pandas as pd
import numpy as np
from datetime import date, timedelta


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from IPython.display import display, HTML
from tqdm import tqdm

## Definitions

In [2]:
from_year=2016
to_year=2018
streak_days_long=100
streak_days_short=20

In [3]:
def get_prev_matches_numbers(date_start, date_end, team_id, df):
    dfteam_previous = df[((df["team_home_id"]==team_id) | (df["team_away_id"]==team_id)) & (df["kickoff_time"]<date_end) & (df["kickoff_time"]>date_start)].copy()
    if len(dfteam_previous) == 0:
        win_rate = 0.
        score_diff_avg = 0.
    else:
        dfteam_previous["score_team"] = dfteam_previous.apply(lambda x: x["score_home"] if x["team_home_id"]==team_id else x["score_away"], axis=1)
        dfteam_previous["score_oponent"] = dfteam_previous.apply(lambda x: x["score_away"] if x["team_home_id"]==team_id else x["score_home"], axis=1)
        dfteam_previous["won"] = dfteam_previous["score_team"] > dfteam_previous["score_oponent"]
        dfteam_previous["score_diff"] = dfteam_previous["score_team"] - dfteam_previous["score_oponent"]
        win_rate = float(dfteam_previous["won"].sum()) / len(dfteam_previous)
        score_diff_avg = dfteam_previous["score_diff"].mean()
    return win_rate, score_diff_avg

In [4]:
def calculate_variables_last_X_train(df_games, last_X_days):
    # for each game add results of last ones
    win_rate_home_list = []
    win_rate_away_list = []
    score_diff_avg_home_list = []
    score_diff_avg_away_list = []
    for i, row in tqdm(df_games.iterrows()):
        date_end = row["kickoff_time"]
        date_start = date_end - timedelta(days=last_X_days)
        # For the home team
        team_id = row["team_home_id"]
        # print("Home team:", team_id)
        win_rate_home, score_diff_avg_home = get_prev_matches_numbers(date_start, date_end, team_id, df_games)
        win_rate_home_list += [win_rate_home]
        score_diff_avg_home_list += [score_diff_avg_home]
        # For the away team
        team_id = row["team_away_id"]
        # print("Away team:", team_id)
        win_rate_away, score_diff_avg_away = get_prev_matches_numbers(date_start, date_end, team_id, df_games)
        win_rate_away_list += [win_rate_away]
        score_diff_avg_away_list += [score_diff_avg_away]

    # add historical to df
    df_games["win_rate_home_last"+str(last_X_days)] = win_rate_home_list
    df_games["score_diff_avg_home_last"+str(last_X_days)] = score_diff_avg_home_list
    df_games["win_rate_away_last"+str(last_X_days)] = win_rate_away_list
    df_games["score_diff_avg_away_last"+str(last_X_days)] = score_diff_avg_away_list

    return df_games

## Loading data from db

In [5]:
#DB connection
db_connection = sql.connect(host='localhost', port=3306, database='acb', user='root', password='root')
df_games = pd.read_sql('SELECT * FROM game where game.season >= {}'.format(from_year), con=db_connection)

#print(df_games.shape)
#print(df_games.columns)
#df_games.head()

## Correlations

In [6]:
#correlation matrix
#corrmat = df_games.corr()
#f, ax = plt.subplots(figsize=(20,18))
#sns.heatmap(corrmat, square=True, cmap=sns.diverging_palette(240, 10, as_cmap=True))
#plt.show()

In [7]:
#cols_aux = [c for c in df_games.columns if (c.startswith("score") and not c.endswith("extra"))]
#print(cols_aux)
#sns.set(style="ticks", color_codes=True)
#sns.pairplot(df_games[cols_aux])
#plt.show()

## Preprocessing

In [8]:
# cols to keep
cols_to_keep = ['team_home_id', 'team_away_id', 'season', 'journey', 'kickoff_time',
                'score_home', 'score_away', 'referee_1']
cols_to_del = [c for c in df_games.columns if c not in cols_to_keep]
df_games.drop(cols_to_del, axis=1, inplace=True)

# check if nulls
#print("Number of nulls in df:", df_games.isnull().sum().max())

# checking amount of times a home team won
#win_home = df_games["score_away"] < df_games["score_home"]
#print("Home Team Win percentage: {0:.1f}%".format(100 * win_home.values.sum() / len(win_home)))

# create score difference feature
df_games["score_difference"] = df_games["score_home"] - df_games["score_away"]
#print("Mean of score difference:", df_games["score_difference"].mean())

df_games=calculate_variables_last_X_train(df_games, streak_days_long)
df_games=calculate_variables_last_X_train(df_games, streak_days_short)
df_games.head()


793it [00:07, 103.34it/s]
793it [00:06, 130.16it/s]


Unnamed: 0,team_home_id,team_away_id,season,journey,score_home,score_away,kickoff_time,referee_1,score_difference,win_rate_home_last100,score_diff_avg_home_last100,win_rate_away_last100,score_diff_avg_away_last100,win_rate_home_last20,score_diff_avg_home_last20,win_rate_away_last20,score_diff_avg_away_last20
0,1,9,2018,1,88,73,2018-09-30 19:15:00,Jiménez Trujillo,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18,1,2018,2,70,88,2018-10-05 20:45:00,Antonio Conde,-18,0.0,-13.0,1.0,15.0,0.0,-13.0,1.0,15.0
2,3,2,2018,12,78,73,2018-12-16 19:30:00,Carlos Peruga,5,0.727273,6.0,0.909091,13.181818,0.0,-7.0,1.0,4.0
3,17,10,2018,12,83,68,2018-12-16 19:30:00,M.A. Pérez Pérez,15,0.272727,-4.454545,0.545455,-3.090909,0.0,-24.0,1.0,7.0
4,4,16,2018,12,76,87,2018-12-15 20:30:00,Daniel Hierrezuelo,-11,0.454545,-3.636364,0.181818,-10.545455,0.0,-19.0,0.0,-14.0


In [9]:
"""df_games_inverse = df_games.copy()
for c in df_games_inverse.columns:
    if "home" in c:
        df_games_inverse[c.replace("home", "away")] = df_games[c]
    if "away" in c:
        df_games_inverse[c.replace("away", "home")] = df_games[c]
df_games_inverse["score_difference"] = -df_games["score_difference"]        
df_games_inverse.head()"""

'df_games_inverse = df_games.copy()\nfor c in df_games_inverse.columns:\n    if "home" in c:\n        df_games_inverse[c.replace("home", "away")] = df_games[c]\n    if "away" in c:\n        df_games_inverse[c.replace("away", "home")] = df_games[c]\ndf_games_inverse["score_difference"] = -df_games["score_difference"]        \ndf_games_inverse.head()'

## Regression

In [10]:
#df_final = pd.concat([df_games, df_games_inverse], ignore_index = True)
df_final = df_games.copy()
df_final.drop(["score_home", "score_away", "journey", "kickoff_time", "referee_1"], axis=1, inplace=True)
#df_final = df_final[["win_rate_home", "score_diff_avg_home", "win_rate_away", "score_diff_avg_away", "season", "score_difference"]]

# Years for train // Years for test (generally 2016-2017 and 2018)
train = df_final[df_final["season"]<to_year]
test = df_final[df_final["season"]>=to_year]

print("Rows for training:", len(train))
print("Rows for test:", len(test))

X_train = train.drop(['score_difference', "season"], axis=1)
y_train = train['score_difference']
X_test = test.drop(['score_difference', "season"], axis=1)
y_test = test['score_difference'].values
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# Train algorithm
print(X_train.columns)
model = RandomForestRegressor(n_estimators=1000, random_state=20, max_depth=None, n_jobs=-1)
model.fit(X_train, y_train)

#print(model.feature_importances_)

# Estimating an interval
preds_estimators = {}
for i in range(len(model.estimators_)):
    e = model.estimators_[i]
    preds_estimators["Est" + str(i).zfill(2)] = e.predict(X_test)
df_preds_estimators = pd.DataFrame(data=preds_estimators)

"""
# Plotting the intervals and the real value
axes = df_preds_estimators.transpose().plot.box(figsize=(20, 20), showfliers=True,
                                                flierprops=dict(marker='+', color='lightblue'),
                                                color={'whiskers': 'lightblue', 'caps': 'lightblue',
                                                       'medians': 'lightblue', 'boxes': 'lightblue'})
axes.plot(y_test, color='red', marker='o', linestyle=' ')
axes.set_title("Predictions of the score difference for each game and real result")
axes.set_xlabel("Game")
axes.set_ylabel("Score difference")
axes.legend([Line2D([0], [0], color='lightblue', marker='o', linestyle=' '),
             Line2D([0], [0], color='red', marker='o', linestyle=' ')], ['Preds', 'Real'])
plt.show()

# Some conclusions about the intervals
df_preds_estimators["Upper"] = df_preds_estimators.apply(
    lambda x: np.percentile(x, 75) + 1.5 * (np.percentile(x, 75) - np.percentile(x, 25)), axis=1)
df_preds_estimators["Lower"] = df_preds_estimators.apply(
    lambda x: np.percentile(x, 25) - 1.5 * (np.percentile(x, 75) - np.percentile(x, 25)), axis=1)
df_preds_estimators["Real"] = y_test

mean_aux = df_preds_estimators.apply(lambda x: 1 if x.mean() >= 0 else 0, axis=1)
print("Percentage of the times the mean was positive:", mean_aux.mean())
median_aux=df_preds_estimators.apply(lambda x: 1 if np.percentile(x, 50)>=0 else 0, axis=1)
print("Percentage of the times the median was positive:", median_aux.mean())

df_preds_estimators["score_in_range?"] = (df_preds_estimators["Real"] >= df_preds_estimators["Lower"]) \
                                         & (df_preds_estimators["Real"] <= df_preds_estimators["Upper"])
print("Percentage of the times the score was in the given interval: {}%".format(
    100 * float(df_preds_estimators["score_in_range?"].sum()) / len(df_preds_estimators)))
"""

# Predicting score directly
y_pred = model.predict(X_test)

d = {'Real': y_test, 'Pred': y_pred}
df_res = pd.DataFrame(data=d)
df_res["winner_correct?"] = np.sign(df_res["Real"]) == np.sign(df_res["Pred"])

print("Mean absolute error: {} points".format(metrics.mean_absolute_error(y_test, y_pred)))
print("Percentage of the times the winner was correct: {}%".format(
    float(df_res["winner_correct?"].sum()) / len(df_res)))

Rows for training: 622
Rows for test: 171
Index(['team_home_id', 'team_away_id', 'win_rate_home_last100',
       'score_diff_avg_home_last100', 'win_rate_away_last100',
       'score_diff_avg_away_last100', 'win_rate_home_last20',
       'score_diff_avg_home_last20', 'win_rate_away_last20',
       'score_diff_avg_away_last20'],
      dtype='object')
Mean absolute error: 10.66528835978836 points
Percentage of the times the winner was correct: 0.672514619883041%


## Classification

In [11]:
#df_final = pd.concat([df_games, df_games_inverse], ignore_index = True)
df_final = df_games.copy()
df_final["winner"] = df_final["score_difference"] > 0 
df_final.drop(["score_home", "score_away", "journey", "kickoff_time", "referee_1", "score_difference"], axis=1, inplace=True)
#df_final = df_final[["win_rate_home", "score_diff_avg_home", "win_rate_away", "score_diff_avg_away", "season", "score_difference"]]

# Years for train // Years for test (generally 2016-2017 and 2018)
train = df_final[df_final["season"]<to_year]
test = df_final[df_final["season"]>=to_year]

print("Rows for training:", len(train))
print("Rows for test:", len(test))

X_train = train.drop(['winner', "season"], axis=1)
y_train = train['winner']
X_test = test.drop(['winner', "season"], axis=1)
y_test = test['winner'].values
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# Train algorithm
print(X_train.columns)
model = RandomForestClassifier(n_estimators=1000, random_state=20, max_depth=None, n_jobs=-1)
model.fit(X_train, y_train)

#print(model.feature_importances_)

# Predicting score directly
y_pred = model.predict(X_test)

d = {'Real': y_test, 'Pred': y_pred}
df_res = pd.DataFrame(data=d)
df_res["winner_correct?"] = df_res["Real"] == df_res["Pred"]

print("Accuracy: {} points".format(df_res["winner_correct?"].mean()))

Rows for training: 622
Rows for test: 171
Index(['team_home_id', 'team_away_id', 'win_rate_home_last100',
       'score_diff_avg_home_last100', 'win_rate_away_last100',
       'score_diff_avg_away_last100', 'win_rate_home_last20',
       'score_diff_avg_home_last20', 'win_rate_away_last20',
       'score_diff_avg_away_last20'],
      dtype='object')
Accuracy: 0.6549707602339181 points
