In [1]:
import pandas as pd 
from useful_funcs import *

# Getting dfs

In [2]:
gw1_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2024-25/gws/gw1.csv"
gw1_df = pd.read_csv(gw1_url, index_col=0)

In [3]:
gw_df_list = []
for i in range(1, 39):
    gw_url = f"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2024-25/gws/gw{i}.csv"
    gw_df = pd.read_csv(gw_url, index_col=0)
    gw_df['gw'] = i
    gw_df_list.append(gw_df)

In [4]:
gw_df = pd.concat(gw_df_list)

# Cleaning up dataframe

In [5]:
gw_df.columns

Index(['position', 'team', 'xP', 'assists', 'bonus', 'bps', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'modified',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'starts', 'team_a_score',
       'team_h_score', 'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
       'gw', 'mng_clean_sheets', 'mng_draw', 'mng_goals_scored', 'mng_loss',
       'mng_underdog_draw', 'mng_underdog_win', 'mng_win'],
      dtype='object')

In [6]:
gw_df['team_goals'] = gw_df.apply(lambda row: get_team_goals(row['was_home'], row['team_h_score'], row['team_a_score']), axis=1)
gw_df['opponent_goals'] = gw_df.apply(lambda row: get_opponent_goals(row['was_home'], row['team_h_score'], row['team_a_score']), axis=1)
gw_df['team_points'] = gw_df.apply(lambda row: get_team_points(row['was_home'], row['team_h_score'], row['team_a_score']), axis=1)
gw_df['opponent_points'] = gw_df['team_points'].apply(get_opponent_points)

In [8]:
gw_df['full_name'] = [clean_name(idx) for idx in gw_df.index]

In [53]:
gw_df.columns

Index(['position', 'team', 'xP', 'assists', 'bonus', 'bps', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'modified',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'starts', 'team_a_score',
       'team_h_score', 'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
       'gw', 'mng_clean_sheets', 'mng_draw', 'mng_goals_scored', 'mng_loss',
       'mng_underdog_draw', 'mng_underdog_win', 'mng_win', 'team_goals',
       'opponent_goals', 'team_points', 'opponent_points', 'full_name'],
      dtype='object')

In [83]:
simple_gw_df = gw_df[['assists', 'bonus', 'bps', 'clean_sheets', 'goals_conceded', 'goals_scored',
                      'influence', 'creativity', 'threat', 'ict_index', 'team_goals', 'opponent_goals', 'minutes',
                        'gw', 'full_name', 'total_points', 'position']]

In [84]:
simple_gw_df_reset_index = simple_gw_df.reset_index(drop=True)

In [85]:
# Ensure the DataFrame is sorted by 'full_name' and 'gw'
simple_gw_df_reset_index = simple_gw_df_reset_index.sort_values(['full_name', 'gw'])

# List of columns to apply EWMA to
# value_cols = ['influence', 'creativity', 'threat', 'ict_index', 'team_goals', 'opponent_goals', 'minutes']
value_cols = ['assists', 'bonus', 'bps', 'clean_sheets', 'goals_conceded', 'goals_scored',
                      'influence', 'creativity', 'threat', 'ict_index', 'team_goals', 'opponent_goals', 'minutes']

# Apply EWMA within each group
ewma_simple_gw_df = (
    simple_gw_df_reset_index
    .groupby('full_name', group_keys=False)
    [value_cols]
    .apply(lambda x: x.ewm(alpha=0.3, adjust=False).mean())
)

ewma_simple_gw_df = simple_gw_df_reset_index[['full_name', 'gw', 'total_points', 'position']].join(ewma_simple_gw_df)

In [92]:
player_gt_0_points = ewma_simple_gw_df.groupby('full_name').sum().query('total_points>0').index

In [99]:
ewma_simple_gw_df_f1 = ewma_simple_gw_df.query('full_name in @player_gt_0_points')
ewma_simple_gw_df_f2 = ewma_simple_gw_df_f1.query('gw>9')

Unnamed: 0,full_name,gw,total_points,position,assists,bonus,bps,clean_sheets,goals_conceded,goals_scored,influence,creativity,threat,ict_index,team_goals,opponent_goals,minutes
6534,adam_armstrong,10,7,FWD,0.000,0.30000,9.966125,0.30000,0.235675,0.30000,11.313186,9.231271,15.367983,3.593058,0.844057,1.463862,44.453984
7212,adam_armstrong,11,2,FWD,0.000,0.21000,7.276288,0.21000,0.764973,0.21000,8.939230,7.991890,13.457588,3.055141,0.590840,1.624703,58.117789
7896,adam_armstrong,12,7,FWD,0.300,0.44700,14.993401,0.14700,1.135481,0.44700,21.797461,9.794323,20.820312,5.258598,1.013588,2.037292,63.482452
8586,adam_armstrong,13,5,FWD,0.510,0.31290,15.295381,0.10290,1.094837,0.31290,16.938223,10.756026,19.974218,4.791019,1.009512,1.726105,71.437716
9279,adam_armstrong,14,1,FWD,0.357,0.21903,11.606767,0.07203,1.666386,0.21903,11.856756,8.129218,15.181953,3.473713,1.006658,2.708273,68.306402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23843,youssef_ramalho_chermiti,34,1,FWD,0.000,0.00000,0.616932,0.00000,0.000000,0.00000,0.110926,0.075126,2.609262,0.276052,0.474484,1.081822,8.642751
24507,youssef_ramalho_chermiti,35,1,FWD,0.000,0.00000,1.331852,0.00000,0.000000,0.00000,0.137648,0.082588,1.826483,0.193237,0.932139,1.357275,6.349926
25308,youssef_ramalho_chermiti,36,0,FWD,0.000,0.00000,0.932297,0.00000,0.000000,0.00000,0.096354,0.057812,1.278538,0.135266,1.552497,1.250093,4.444948
26109,youssef_ramalho_chermiti,37,0,FWD,0.000,0.00000,0.652608,0.00000,0.000000,0.00000,0.067448,0.040468,0.894977,0.094686,1.686748,0.875065,3.111464


# Trying some models

## Random forest

In [130]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [111]:
ewma_simple_gw_df_f2_fwd = ewma_simple_gw_df_f2.query('position=="FWD"').set_index(['full_name', 'gw'])
X = ewma_simple_gw_df_f2_fwd[['assists', 'bonus', 'bps', 'clean_sheets', 'goals_conceded',
                              'goals_scored', 'influence', 'creativity', 'threat', 'ict_index',
                              'team_goals', 'opponent_goals', 'minutes']]
y = ewma_simple_gw_df_f2_fwd['total_points']

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [136]:
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [137]:
y_pred = model.predict(X_test)

In [138]:
from sklearn.metrics import accuracy_score

print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.6585


## xGBoost

In [140]:
from xgboost import XGBClassifier

model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17], got [-2 -1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 16 17]

In [141]:
y_train

full_name                 gw
liam_delap                28     2
niclas_fullkrug           35     2
taiwo_awoniyi             31     0
rasmus_hojlund            15     8
dominic_solanke_mitchell  38     7
                                ..
michail_antonio           19     0
odsonne_edouard           38     0
jean_philippe_mateta      22    13
remy_rees_dottin          22     1
michail_antonio           15     0
Name: total_points, Length: 1153, dtype: int64