In [16]:
import numpy as np
import pandas as pd
from sklearn import model_selection
import matplotlib.pyplot as plt
import lifelines
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, r2_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [2]:
#importing data
players=pd.read_csv("data/players.csv")
players=players[["player_id", "pretty_name", "date_of_birth", "position", "sub_position"]]

appearances=pd.read_csv("data/appearances.csv")
appearances=appearances[["player_id", "game_id", "league_id", "goals", "assists", "minutes_played"]]
appearances=appearances[(appearances["league_id"] == "L1") | (appearances["league_id"] == "ES1") | (appearances["league_id"] == "FR1") | (appearances["league_id"] == "GB1") | (appearances["league_id"] == "IT1")]

games=pd.read_csv("data/games.csv")
games=games[["game_id", "season", "date"]]
games=games[games["season"]>2014]
appearances.head(1)

Unnamed: 0,player_id,game_id,league_id,goals,assists,minutes_played
1,84938,2581152,L1,0,0,90


In [3]:
#adding games data to appearances

appearances=appearances.merge(games, on="game_id", how="inner")
appearances.head(2)

Unnamed: 0,player_id,game_id,league_id,goals,assists,minutes_played,season,date
0,84938,2581152,L1,0,0,90,2015,2015-08-16
1,50512,2581152,L1,1,1,90,2015,2015-08-16


In [4]:
#creating each player final club for the season. That will be considered his season's club

last_game=pd.DataFrame(appearances.groupby(["player_id", "season"])["date"].max()).reset_index()
players_league=appearances.merge(last_game, on=["player_id", "season", "date"], how="inner")[["player_id", "season", 'league_id', 'date']]
players_league.head(3)

Unnamed: 0,player_id,season,league_id,date
0,167189,2015,L1,2015-08-29
1,85867,2015,L1,2015-09-19
2,263685,2015,L1,2015-10-31


In [5]:
#creating the player season stats

stats=appearances.groupby(["player_id", "season"])["goals", "assists", "minutes_played"].sum().reset_index()
stats.head(4)

  stats=appearances.groupby(["player_id", "season"])["goals", "assists", "minutes_played"].sum().reset_index()


Unnamed: 0,player_id,season,goals,assists,minutes_played
0,10,2015,7,7,1360
1,26,2015,0,0,90
2,26,2016,0,0,630
3,26,2017,0,0,91


In [6]:
#now, to create the career average stats (biggest issue here, with the small history. Life = since 15/16 season only)
#starting with loop to aggregate previous results

player_id=[]
season=[]
goals_pre=[]
assists_pre=[]
minutes_played=[]

for player in np.unique(stats.player_id):
    for year in np.unique(stats.season):
        df_temp=stats[(stats["player_id"]==player) & (stats["season"]<year)]
        if df_temp.shape[0]==0:
            continue
        player_id.append(player)
        season.append(year)
        goals_pre.append(df_temp.groupby("player_id")["goals"].sum().values[0]/len(np.unique(df_temp.season)))
        assists_pre.append(df_temp.groupby("player_id")["assists"].sum().values[0]/len(np.unique(df_temp.season)))
        minutes_played.append(df_temp.groupby("player_id")["minutes_played"].sum().values[0]/len(np.unique(df_temp.season)))




In [8]:
past_stats=pd.DataFrame(player_id, columns=['player_id'])
past_stats["season"]=season
past_stats["goals_pre"]=goals_pre
past_stats["assists_pre"]=assists_pre
past_stats["minutes_pre"]=minutes_played
past_stats.head(10)

Unnamed: 0,player_id,season,goals_pre,assists_pre,minutes_pre
0,10,2016,7.0,7.0,1360.0
1,10,2017,7.0,7.0,1360.0
2,10,2018,7.0,7.0,1360.0
3,10,2019,7.0,7.0,1360.0
4,10,2020,7.0,7.0,1360.0
5,10,2021,7.0,7.0,1360.0
6,26,2016,0.0,0.0,90.0
7,26,2017,0.0,0.0,360.0
8,26,2018,0.0,0.0,270.333333
9,26,2019,0.0,0.0,270.333333


In [9]:
stats['season_last'] = stats.groupby(['player_id'])['season'].shift()
stats['goals_last'] = stats.groupby(['player_id'])['goals'].shift()
stats['assists_last'] = stats.groupby(['player_id'])['assists'].shift()
stats['minutes_last'] = stats.groupby(['player_id'])['minutes_played'].shift()

for column in ['season_last', 'goals_last', 'assists_last', 'minutes_last']:
    stats[column]=np.where(stats['season']-stats['season_last']==1, stats[column], 0)
stats

Unnamed: 0,player_id,season,goals,assists,minutes_played,season_last,goals_last,assists_last,minutes_last
0,10,2015,7,7,1360,0.0,0.0,0.0,0.0
1,26,2015,0,0,90,0.0,0.0,0.0,0.0
2,26,2016,0,0,630,2015.0,0.0,0.0,90.0
3,26,2017,0,0,91,2016.0,0.0,0.0,630.0
4,80,2016,0,0,270,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
17143,855607,2021,0,0,11,0.0,0.0,0.0,0.0
17144,862880,2020,0,0,42,0.0,0.0,0.0,0.0
17145,882772,2020,0,0,34,0.0,0.0,0.0,0.0
17146,894205,2020,0,0,29,0.0,0.0,0.0,0.0


In [14]:
#join to the past stats to the stats, and them to the player data

total_stats=stats.merge(past_stats, on=['player_id', 'season'], how='inner')
abt=total_stats.merge(players_league, on=['player_id', 'season'])
abt=abt.merge(players, on=['player_id'])
abt['date']=pd.to_datetime(abt['date'])
abt['date_of_birth']=pd.to_datetime(abt['date_of_birth'])
abt['age_dif']=abt['date']-abt['date_of_birth']
abt['age']=[round(x.days/365,1) for x in abt['age_dif']]
abt=abt.drop('age_dif', axis=1)
abt['sub_position']=abt['sub_position'].fillna(0)
abt['next_season_played']=abt.groupby(['player_id'])['season'].shift(-1)
abt['next_season_played']=np.where(abt['next_season_played']>0, 1, 0)
abt=abt.dropna()
abt=abt[abt["season"]>2015]
abt=abt[abt["season"]<2021]


In [15]:
#trying linear regressor
scaler=preprocessing.StandardScaler()

X=abt[abt["season"]<2020].drop(["player_id", "season", "season_last", "date", "pretty_name", "date_of_birth", "next_season_played"], axis=1)
X = pd.get_dummies(X, columns=["league_id", "position", "sub_position"], drop_first=True)
X=pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
y=abt[abt["season"]<2020]["next_season_played"]
X_test=abt[abt["season"]==2020].drop(["player_id", "season", "season_last", "date", "pretty_name", "date_of_birth", "next_season_played"], axis=1)
X_test = pd.get_dummies(X_test, columns=["league_id", "position", "sub_position"], drop_first=True)
X_test=pd.DataFrame(scaler.transform(X_test), columns=X.columns)
y_test=abt[abt["season"]==2020]["next_season_played"]


reg=LogisticRegression()
reg.fit(X,y)
print('train accuracy=', accuracy_score(y, reg.predict(X)))
print('test accuracy=', accuracy_score(y_test, reg.predict(X_test)))

train accuracy= 0.8335075797177208
test accuracy= 0.6054389312977099


In [17]:
#trying random forest

forest=RandomForestClassifier()
forest.fit(X,y)
print('train accuracy=', accuracy_score(y, forest.predict(X)))
print('test accuracy=', accuracy_score(y_test, forest.predict(X_test)))

train accuracy= 0.9998693152117094
test accuracy= 0.6111641221374046


In [107]:
pd.DataFrame(y_test).groupby("next_season_played").size()


next_season_played
0     954
1    1142
dtype: int64