In [46]:
import pandas as pd
from useful_funcs2 import *

In [47]:
# Wrangle data
def import_data_from_vastaav(year, n_gws):
    """Download gameweeks 1..n_gws of one season and stack them into one frame.

    Parameters
    ----------
    year : int
        Closing year of the season as used in the folder name; ``25``
        selects the ``2024-25`` folder.
    n_gws : int
        Number of gameweek files to read, starting at ``gw1.csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of every requested file, with an added ``gw`` column holding the
        file's gameweek number and a fresh integer index.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``n_gws`` is below 1 (nothing to concatenate).
    """
    season = f'20{year-1}-{year}'
    base_url = f"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/{season}/gws"

    def _read_gameweek(number):
        # The first CSV column is read as the index and restored as a regular
        # column by the reset_index() on the concatenated result.
        frame = pd.read_csv(f"{base_url}/gw{number}.csv", index_col=0)
        frame['gw'] = number
        return frame

    frames = [_read_gameweek(number) for number in range(1, n_gws + 1)]
    return pd.concat(frames).reset_index()

In [48]:
def add_team_data(gw_df):
    """Add team-level result columns derived from each row's match score.

    Adds ``team_goals``, ``opponent_goals`` and ``team_points`` by passing
    ``was_home``, ``team_h_score`` and ``team_a_score`` of every row to the
    matching ``get_*`` helper, then ``opponent_points`` by applying
    ``get_opponent_points`` to ``team_points``.

    The frame is modified in place and also returned.
    """
    def _from_result(scorer):
        # Row-wise evaluation of one helper over the three score columns.
        return gw_df.apply(
            lambda match: scorer(match['was_home'], match['team_h_score'], match['team_a_score']),
            axis=1,
        )

    gw_df['team_goals'] = _from_result(get_team_goals)
    gw_df['opponent_goals'] = _from_result(get_opponent_goals)
    gw_df['team_points'] = _from_result(get_team_points)
    gw_df['opponent_points'] = gw_df['team_points'].apply(get_opponent_points)
    return gw_df

In [49]:
def combine_names(first_name, second_name):
    """Join a first and a second name with a single underscore."""
    separator = '_'
    return first_name + separator + second_name

def clean_name(name):
    """Normalise a player name into a lowercase, underscore-separated key.

    Surrounding whitespace is removed first, then spaces and hyphens become
    underscores, the text is passed through ``unidecode.unidecode`` and the
    result is lower-cased.

    Parameters
    ----------
    name : str
        Raw player name.

    Returns
    -------
    str
        The normalised name.
    """
    # Strip before replacing: once spaces have been turned into underscores a
    # later strip() can no longer remove leading/trailing padding.
    name = name.strip()
    name = name.replace(" ", "_")
    name = name.replace("-", "_")
    name = unidecode.unidecode(name)
    # Kept from the original: trims any whitespace left after transliteration.
    return name.strip().lower()

In [50]:
def ewma(gw_df, groupby_col, value_cols, alpha, rename_dict, remerge_cols):
    """Exponentially weighted moving average of ``value_cols`` within each group.

    Parameters
    ----------
    gw_df : pandas.DataFrame
        Input rows; must contain ``groupby_col``, ``gw`` and every column
        named in ``value_cols`` and ``remerge_cols``.
    groupby_col : str
        Column that identifies one series (the function sorts by this
        column and then by ``gw``).
    value_cols : list of str
        Columns to smooth.
    alpha : float
        Smoothing factor passed to ``DataFrame.ewm`` with ``adjust=False``.
    rename_dict : dict
        Column renames applied to the smoothed columns before joining.
    remerge_cols : list of str
        Unsmoothed columns of the sorted input to place in front of the
        smoothed ones. They must not share a name with a smoothed column
        after renaming, otherwise the join raises.

    Returns
    -------
    pandas.DataFrame
        ``remerge_cols`` followed by the smoothed columns, in sorted row order
        with a fresh integer index.
    """
    # reset_index() gives the sorted rows a 0..n-1 index that the join below
    # aligns on.
    ordered = gw_df.sort_values([groupby_col, 'gw']).reset_index()

    def _smooth(group):
        return group.ewm(alpha=alpha, adjust=False).mean()

    smoothed = ordered.groupby(groupby_col, group_keys=False)[value_cols].apply(_smooth)
    smoothed = smoothed.rename(columns=rename_dict)
    return ordered[remerge_cols].join(smoothed)

In [51]:
def get_teams_df(gw_df):
    """Reduce the input rows to one row per (team, gw).

    Keeps ``team``, ``gw``, ``team_goals`` and ``team_points`` and, for every
    (team, gw) pair, takes the first non-null value of each remaining column.
    """
    team_columns = ['team', 'gw', 'team_goals', 'team_points']
    per_team_gw = gw_df[team_columns].groupby(['team', 'gw'], as_index=False)
    return per_team_gw.first()

In [52]:
def get_teamcodes(year):
    """Download the season's ``teams.csv`` and return an ``id`` -> ``name`` dict.

    ``year`` is the closing year used in the season folder name; ``25``
    selects the ``2024-25`` folder.
    """
    season = f'20{year-1}-{year}'
    source = f'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/{season}/teams.csv'
    team_table = pd.read_csv(source)
    return dict(zip(team_table['id'], team_table['name']))

In [53]:
def merge_ewma_dfs(ewma_gw_df_players, ewma_gw_df_teams, year):
    """Attach team-level EWMA columns for a player's own team and opponent.

    Steps:
    1. Map the numeric ``opponent_team`` id to a name with ``get_teamcodes(year)``
       and store it as ``opponent_team_name`` (written onto the players frame
       that was passed in).
    2. Left-join the team frame on ``team`` and ``gw``.
    3. Left-join the team frame again on ``opponent_team_name`` and ``gw``,
       with its EWMA columns renamed to ``*_nw_opponent``.

    NOTE(review): step 3 joins on the row's own ``gw``, so the ``*_nw_opponent``
    columns describe the opponent recorded on that row; confirm this is the
    opponent the downstream features are meant to describe.
    """
    id_to_name = get_teamcodes(year)
    ewma_gw_df_players['opponent_team_name'] = ewma_gw_df_players['opponent_team'].map(id_to_name)

    with_own_team = ewma_gw_df_players.merge(ewma_gw_df_teams, on=['team', 'gw'], how='left')

    opponent_columns = {
        'team': 'opponent_team_name',
        'ewma_team_goals': 'ewma_team_goals_nw_opponent',
        'ewma_team_points': 'ewma_team_points_nw_opponent',
    }
    opponent_view = ewma_gw_df_teams.rename(columns=opponent_columns)
    return with_own_team.merge(opponent_view, on=['opponent_team_name', 'gw'], how='left')

In [54]:
def lag_feature(ewma_gw_df, feature):
    """Return, for every row, the player's value of ``feature`` on their next row.

    Rows are ordered by ``full_name`` and ``gw`` and shifted by -1 inside each
    ``full_name`` group, so a player's last row gets a missing value. The
    returned Series keeps the input's index labels, which lets it be assigned
    straight back onto the input frame.
    """
    ordered = ewma_gw_df.sort_values(['full_name', 'gw'])
    per_player = ordered.groupby('full_name')[feature]
    return per_player.shift(-1)

In [55]:
def get_ewma_df(year, gw, ewma_alpha):
    """Build the merged player/team EWMA table for gameweeks 1..gw of a season.

    Downloads the gameweek files, adds the team result columns and the cleaned
    ``full_name`` key, smooths the player columns per ``full_name`` and the
    team columns per ``team`` with the same ``ewma_alpha``, and merges both
    results with ``merge_ewma_dfs``.

    Parameters
    ----------
    year : int
        Season identifier forwarded to the download helpers.
    gw : int
        Last gameweek to include.
    ewma_alpha : float
        Smoothing factor used for both the player and the team EWMA.
    """
    player_value_cols = [
        'xP', 'assists', 'bonus', 'bps',
        'clean_sheets', 'creativity', 'expected_assists',
        'expected_goal_involvements', 'expected_goals',
        'expected_goals_conceded', 'goals_conceded', 'goals_scored',
        'ict_index', 'influence', 'minutes',
        'own_goals', 'penalties_missed', 'penalties_saved',
        'red_cards', 'saves', 'starts',
        'threat', 'total_points', 'transfers_balance',
        'transfers_in', 'transfers_out', 'value', 'yellow_cards',
    ]
    # The raw total_points stays available next to its smoothed version.
    player_keep_cols = ['full_name', 'gw', 'total_points', 'position', 'team', 'opponent_team']
    player_renames = {'total_points': 'ewma_total_points'}

    team_value_cols = ['team_goals', 'team_points']
    team_keep_cols = ['team', 'gw']
    team_renames = {'team_goals': 'ewma_team_goals', 'team_points': 'ewma_team_points'}

    matches = add_team_data(import_data_from_vastaav(year, gw))
    matches['full_name'] = matches['name'].apply(clean_name)

    players_smoothed = ewma(matches, 'full_name', player_value_cols, ewma_alpha, player_renames, player_keep_cols)
    teams_smoothed = ewma(get_teams_df(matches), 'team', team_value_cols, ewma_alpha, team_renames, team_keep_cols)

    return merge_ewma_dfs(players_smoothed, teams_smoothed, year)


In [56]:
def lag_data_for_training(merged_ewma_df):
    """Add the next-row target and opponent columns used for training.

    Writes ``total_points_nw`` (from ``total_points``) and ``opponent_nw``
    (from ``opponent_team_name``) onto the frame that was passed in, each
    produced by ``lag_feature``, and returns that same frame.
    """
    next_row_columns = (
        ('total_points_nw', 'total_points'),
        ('opponent_nw', 'opponent_team_name'),
    )
    for target_col, source_col in next_row_columns:
        merged_ewma_df[target_col] = lag_feature(merged_ewma_df, source_col)
    return merged_ewma_df

In [57]:
# Step-by-step run of the pipeline: load gameweeks 1-38 of the 2024-25 folder
# and add the team result columns.
gw_df = import_data_from_vastaav(25, 38)
gw_df = add_team_data(gw_df)

In [58]:
# Cleaned player name; used below as the per-player grouping key.
gw_df['full_name'] = gw_df['name'].apply(clean_name)

In [59]:
# EWMA smoothing factor for the step-by-step cells below. get_prediction_df
# further down also reads this notebook-level name.
alpha = 0.4

In [60]:
# Player columns to smooth; total_points is smoothed into ewma_total_points,
# while the raw total_points is carried along through merge_cols_players.
player_value_cols = ['xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'minutes',
       'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'saves', 'starts',
       'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'yellow_cards']
merge_cols_players = ['full_name', 'gw', 'total_points', 'position','team','opponent_team']
ewma_gw_df_players = ewma(gw_df, 'full_name', player_value_cols, alpha, {'total_points': 'ewma_total_points'}, merge_cols_players)


In [61]:
# One row per (team, gw), then the same EWMA applied per team.
gw_df_teams = get_teams_df(gw_df)
team_value_cols = ['team_goals', 'team_points']
merge_cols_teams = ['team', 'gw']
ewma_gw_df_teams = ewma(gw_df_teams, 'team', team_value_cols, alpha, {'team_goals': 'ewma_team_goals',
                                                                        'team_points': 'ewma_team_points'}, merge_cols_teams)

In [62]:
# Join the player EWMA table with the team EWMA table (own team and opponent).
merged_ewma_df = merge_ewma_dfs(ewma_gw_df_players, ewma_gw_df_teams, 25)

In [63]:
# Same two assignments as lag_data_for_training: next-row points and opponent.
merged_ewma_df['total_points_nw'] = lag_feature(merged_ewma_df, 'total_points')
merged_ewma_df['opponent_nw'] = lag_feature(merged_ewma_df, 'opponent_team_name')

In [64]:
# from wrangle_data import *
import pandas as pd

In [65]:
# Rebuild the table through the wrapper functions. Note that this reuses the
# name gw_df for the merged EWMA table and smooths with 0.3, not the
# notebook-level alpha of 0.4 used by the step-by-step cells.
gw_df = get_ewma_df(25, 38, 0.3)
# Rows without a next-row target (each player's last row) cannot be trained on.
training_df = lag_data_for_training(gw_df).dropna(subset=['total_points_nw'])

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


def evaluate_model(X, y, model):
    """Score a fitted model on ``X`` against the true values ``y``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` computed from ``model.predict(X)``.
    """
    predicted = model.predict(X)
    return (
        mean_absolute_error(y, predicted),
        root_mean_squared_error(y, predicted),
        r2_score(y, predicted),
    )

def create_model(training_df, features, model, test):
    """Fit one independent copy of ``model`` per playing position.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Training rows; must contain ``position``, ``total_points_nw`` and
        every column named in ``features``.
    features : list of str
        Input columns.
    model : estimator
        Unfitted template exposing ``fit`` (and ``predict`` when
        ``test`` is True). It is copied for each position and never fitted
        itself.
    test : bool
        When ``True``, hold out 30% of each position's rows
        (``random_state=42``) and report the RMSE on them; otherwise fit on
        all rows.

    Returns
    -------
    tuple of dict
        ``(model_dict, rmse_dict)`` keyed by ``'GK'``, ``'DEF'``, ``'MID'``,
        ``'FWD'``. ``rmse_dict`` values are ``None`` when ``test`` is not True.
    """
    import copy  # local import so the function does not depend on another cell

    model_dict = {}
    rmse_dict = {}
    for pos in ['GK', 'DEF', 'MID', 'FWD']:
        # Each position needs its own estimator object. Refitting one shared
        # instance would leave every dictionary entry pointing at the same
        # object, i.e. at whichever position was fitted last.
        pos_model = copy.deepcopy(model)

        training_df_pos = training_df.query('position==@pos')
        X = training_df_pos[features]
        y = training_df_pos['total_points_nw']

        if test is True:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
            model_dict[pos] = pos_model.fit(X_train, y_train)
            _, rmse, _ = evaluate_model(X_test, y_test, model_dict[pos])
            rmse_dict[pos] = rmse
        else:
            model_dict[pos] = pos_model.fit(X, y)
            rmse_dict[pos] = None
    return model_dict, rmse_dict

def predict_scores(prediction_df, features, model_dict):
    """Fill ``predicted_points`` using the model that matches each row's position.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Rows to score; must contain ``position`` and every column in
        ``features``. The frame is modified in place.
    features : list of str
        Input columns handed to ``predict``.
    model_dict : dict
        Fitted models keyed by position (``'GK'``, ``'DEF'``, ``'MID'``,
        ``'FWD'``). A key is only required for positions present in
        ``prediction_df``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``predicted_points`` set for rows whose position
        is one of the four handled values; other rows keep a missing value.
    """
    # Make sure the column exists even if no position below has any rows.
    if 'predicted_points' not in prediction_df.columns:
        prediction_df['predicted_points'] = float('nan')

    for pos in ['GK', 'DEF', 'MID', 'FWD']:
        is_pos = prediction_df['position'] == pos
        # Skip positions without rows: predicting on an empty frame is an
        # error for scikit-learn estimators.
        if not is_pos.any():
            continue
        X_pred = prediction_df.loc[is_pos, features]
        prediction_df.loc[is_pos, 'predicted_points'] = model_dict[pos].predict(X_pred)
    return prediction_df

In [None]:
# Model inputs: per-gameweek player columns plus the smoothed player, own-team
# and opponent columns produced by get_ewma_df.
features = [
        'assists', 'bonus', 'bps', 'clean_sheets', 'goals_conceded',
        'goals_scored', 'influence', 'creativity', 'threat', 'ict_index',
        'minutes', 'ewma_total_points', 'ewma_team_goals', 'ewma_team_points',
        'ewma_team_goals_nw_opponent', 'ewma_team_points_nw_opponent']

# Players whose target sums to more than zero over the training rows. Only the
# target column is aggregated: summing the whole frame would also try to
# aggregate the text columns, which is slow and depends on the pandas version.
non_zero_players = (
    training_df.groupby('full_name')['total_points_nw']
    .sum()
    .loc[lambda total: total > 0]
    .index
)
# Gameweek 1 rows are excluded along with players that never score.
training_df_f = training_df.query('gw>1 and full_name in @non_zero_players')

In [83]:
# Hold-out evaluation: per-position RMSE of a linear regression on a 30% test split.
model_dict, rmse_dict = create_model(training_df_f, features, LinearRegression(), test=True)
print(rmse_dict)

{'GK': 2.0962160119369093, 'DEF': 2.139696670607732, 'MID': 2.2385495433720943, 'FWD': 2.736262449919245}


In [86]:
# Same call as the previous cell. The recorded output differs, so training_df_f
# presumably changed between the two runs - TODO confirm which run is current.
model_dict, rmse_dict = create_model(training_df_f, features, LinearRegression(), test=True)
print(rmse_dict)

{'GK': 2.261820030265231, 'DEF': 2.172126651267967, 'MID': 2.2920581079354476, 'FWD': 2.456604004742539}


In [None]:
import requests
# Season (closing year) and gameweek to predict. NOTE(review): the call to
# get_prediction_df further down passes literal values rather than these names.
year = 26
gw = 5

def get_prediction_df(year, gw, ewma_alpha=None):
    """Build the feature rows used to predict gameweek ``gw``.

    Takes each player's EWMA features as of gameweek ``gw - 1``, looks up the
    opponent of the player's team in gameweek ``gw`` from the FPL fixtures
    endpoint, and attaches that opponent's team EWMA columns as
    ``ewma_team_goals_nw_opponent`` / ``ewma_team_points_nw_opponent``.

    Parameters
    ----------
    year : int
        Season identifier forwarded to ``get_ewma_df`` and ``get_teamcodes``.
    gw : int
        Gameweek to predict; features come from gameweek ``gw - 1``.
    ewma_alpha : float, optional
        Smoothing factor for the EWMA features. Pass the value the model was
        trained with. When omitted, the notebook-level ``alpha`` is used, as
        before.

    Returns
    -------
    pandas.DataFrame
        One row per player row of gameweek ``gw - 1`` whose team has a fixture
        in gameweek ``gw`` and whose opponent appears in the gameweek
        ``gw - 1`` data (the final merge is an inner join).

    Raises
    ------
    requests.HTTPError
        When the fixtures endpoint answers with an error status.
    """
    if ewma_alpha is None:
        ewma_alpha = alpha  # notebook-level default

    previous_gw = gw - 1
    prediction_df = get_ewma_df(year, previous_gw, ewma_alpha).drop(
        ['ewma_team_goals_nw_opponent', 'ewma_team_points_nw_opponent'], axis=1)
    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the query result's parent.
    prediction_df = prediction_df.query(f'gw=={previous_gw}').copy()

    # Team ids must be decoded with the same season that was requested.
    teamcode_dict = get_teamcodes(year)

    fixtures_url = 'https://fantasy.premierleague.com/api/fixtures/'
    response = requests.get(fixtures_url, timeout=30)
    response.raise_for_status()
    fixtures_df = pd.json_normalize(response.json())

    fixtures_df_gw = fixtures_df.query(f'event=={gw}')
    h_teams = fixtures_df_gw['team_h'].map(teamcode_dict)
    a_teams = fixtures_df_gw['team_a'].map(teamcode_dict)
    # team -> opponent in both directions. A team listed in several fixtures
    # of this gameweek keeps only one opponent (later entries overwrite).
    fixture_dict = {**dict(zip(h_teams, a_teams)), **dict(zip(a_teams, h_teams))}

    prediction_df['team_name_nw_opponent'] = prediction_df['team'].map(fixture_dict)

    # One row of team-level EWMA values per team, joined onto the opponent name.
    opp_team_df = prediction_df[['team', 'ewma_team_goals', 'ewma_team_points']].groupby('team').first().reset_index()
    prediction_df = prediction_df.merge(opp_team_df, left_on='team_name_nw_opponent', right_on='team', suffixes=('', '_nw_opponent'))

    return prediction_df

In [None]:
# Final fit on all training rows (no hold-out), then score gameweek 5 of season 26.
model_dict, rmse_dict = create_model(training_df_f, features, LinearRegression(), test=False)
prediction_df = get_prediction_df(26, 5)
# dropna() returns a new frame, so predict_scores writes predicted_points onto
# that copy and prediction_df itself is left without the column.
pred_df = predict_scores(prediction_df.dropna(), features, model_dict)