### Import some stuff

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load some FPL data

In [2]:
# NOTE: the raw name format differs between seasons, so this may need changing for older years.
def process_name(name):
    """Normalise a raw player name to a lowercase, space-separated string.

    Underscores are replaced by spaces and the result is lower-cased. When the
    name has more than one underscore-separated part and the last part parses
    as an integer (a trailing numeric id), that last part is dropped.

    Args:
        name: Raw name string, e.g. ``'First_Second_123'`` or ``'First_Second'``.

    Returns:
        The cleaned name, e.g. ``'first second'``.
    """
    parts = name.split('_')
    if len(parts) > 1:
        try:
            int(parts[-1])  # Only used to test whether the last part is a number
        except ValueError:
            # Last part is not numeric, so it belongs to the name: keep it
            pass
        else:
            parts = parts[:-1]
    return ' '.join(parts).lower()

In [3]:
def get_team_names_dict(year, fpl_dir, encoding):
    """Build a mapping from team id to team name for one season.

    For the seasons '2022-23' and '2021-22' the mapping is read from that
    season's own ``teams.csv`` (columns ``id`` and ``name``). For every other
    season it is read from ``data/master_team_list.csv`` (columns ``season``,
    ``team`` and ``team_name``), filtered to the requested season.

    Args:
        year: Season label in the form used by the data directory, e.g. '2021-22'.
        fpl_dir: Root directory of the data repository, including a trailing
            path separator (paths are built by string concatenation).
        encoding: Text encoding used when reading the per-season ``teams.csv``.
            It is not applied to the master team list.

    Returns:
        dict mapping team id -> team name. Empty if the master list has no
        rows for ``year``.
    """
    # Recent seasons ship their own teams file. The label must match the
    # 'YYYY-YY' form used for the season directories ('2021-22', not '2021-2022').
    if year in ('2022-23', '2021-22'):
        df_teams_tmp = pd.read_csv(fpl_dir+'data/'+year+'/teams.csv', encoding=encoding)
        return dict(zip(df_teams_tmp['id'], df_teams_tmp['name']))

    # Older seasons: look the season up in the shared master list
    df_teams = pd.read_csv(fpl_dir+'data/master_team_list.csv')
    df_season = df_teams[df_teams['season']==year]
    return dict(zip(df_season['team'], df_season['team_name']))


In [4]:
def get_player_positions_dict(year, fpl_dir, encoding):
    """Build a mapping from raw player name to position label for one season.

    Only the seasons '2019-20', '2018-19', '2017-18' and '2016-17' are looked
    up (from that season's ``players_raw.csv``); for any other season ``None``
    is returned and no file is read.

    The dictionary key mirrors the per-season name format:
    ``first_second_id`` for '2019-20' and '2018-19', and ``first_second``
    (no id) for '2017-18' and '2016-17'.

    Args:
        year: Season label, e.g. '2018-19'.
        fpl_dir: Root directory of the data repository, including a trailing
            path separator.
        encoding: Accepted for signature parity with the other loaders but not
            used here; ``players_raw.csv`` is always read as 'utf_8'.

    Returns:
        dict mapping name key -> one of 'GKP', 'DEF', 'MID', 'FWD', or ``None``
        for seasons outside the list above.
    """
    # No lookup for any other season
    if year not in ['2019-20', '2018-19','2017-18','2016-17']:
        return None

    # element_type code (as a string) -> position label
    position_codes = {'1':'GKP',
                      '2':'DEF',
                      '3':'MID',
                      '4':'FWD'}

    raw_players = pd.read_csv(fpl_dir+'data/'+year+'/players_raw.csv', encoding='utf_8')

    # The two earliest seasons use names without the trailing player id
    key_has_id = year in ['2019-20','2018-19']

    positions = {}
    rows = zip(raw_players['first_name'],
               raw_players['second_name'],
               raw_players['id'],
               raw_players['element_type'])
    for first, second, player_id, element_type in rows:
        position = position_codes[str(element_type)]
        key = first+'_'+second
        if key_has_id:
            key = key+'_'+str(player_id)
        positions[key] = position

    return positions




In [131]:
def load_data(fpl_dir = '/Users/dominicbates/Documents/Github/Fantasy-Premier-League/'):
    """Load and combine the per-gameweek player data for all configured seasons.

    For each season the merged gameweek file (``data/<season>/gws/merged_gw.csv``)
    is read, then:

    * a ``season`` column is added;
    * ``opponent_name`` is derived from ``opponent_team`` via
      ``get_team_names_dict`` (an unknown team id raises ``KeyError``);
    * ``position`` is filled from ``get_player_positions_dict`` when that
      returns a mapping (names missing from the mapping get ``None``);
      otherwise the ``position`` column already present in the file is used;
    * the position label 'GK' is normalised to 'GKP';
    * ``selected_weight`` is ``selected`` divided by that season's mean
      ``selected``.

    The seasons are concatenated, sorted, given a ``name_cleaned`` column (see
    ``process_name``) and every row containing a null in any column is dropped.
    Progress is printed to stdout.

    Args:
        fpl_dir: Root directory of the data repository, including a trailing
            path separator.

    Returns:
        pandas.DataFrame with the selected columns plus ``name_cleaned`` and a
        fresh 0..n-1 index.
    """
    # Set cols
    print('\nLoading data...')
    cols = ['season','GW','name','position','opponent_name','opponent_team', 'kickoff_time', 'was_home', 'selected','selected_weight','minutes','total_points','saves','bonus','clean_sheets','goals_conceded','goals_scored','assists','red_cards','yellow_cards']

    # Loop through all years
    all_years = ['2022-23', '2021-22', '2020-21','2019-20', '2018-19','2017-18','2016-17']
    print('Loading data for years:',all_years)

    # Collect one frame per season and concatenate once at the end
    season_frames = []
    for year in all_years:
        print('... Processing year:',year)

        # Set encoding to avoid errors
        if year in ['2018-19', '2017-18', '2016-17']:
            encoding = 'latin-1'
        else:
            encoding = 'utf_8'

        # Load season data
        df_tmp = pd.read_csv(fpl_dir+'data/'+year+'/gws/merged_gw.csv', encoding=encoding)
        df_tmp['season'] = year

        # Get team names (list comprehension so an unknown id fails loudly)
        id_to_team = get_team_names_dict(year, fpl_dir, encoding)
        df_tmp['opponent_name'] = [id_to_team[team] for team in df_tmp['opponent_team']]

        # Get positions; dict.get gives None for unknown names in O(1) per row
        name_to_pos = get_player_positions_dict(year, fpl_dir, encoding)
        if name_to_pos is not None:
            df_tmp['position'] = [name_to_pos.get(name) for name in df_tmp['name']]
        # Seasons without a mapping rely on the file's own 'position' column
        df_tmp.loc[(df_tmp['position'] == 'GK'),'position'] = 'GKP'

        # Set selected weight (relative to this season's mean ownership)
        df_tmp['selected_weight'] = df_tmp['selected'] / df_tmp['selected'].mean()

        # Keep only the final column set
        season_frames.append(df_tmp[cols])

    df_all = pd.concat(season_frames)

    # Clean up final dataframe
    df_all = df_all.sort_values(['season','GW','opponent_name','kickoff_time','name'])
    df_all['name_cleaned'] = [process_name(name) for name in df_all['name']]

    print('\nDropping Nulls')
    print('... Size:',len(df_all))
    m_nonulls = pd.notnull(df_all).all(axis=1)
    df_all = df_all[m_nonulls].reset_index(drop=True)
    print('... New Size:',len(df_all))
    print('...',(m_nonulls==0).sum(),'rows dropped')
    print('\nData loaded!')
    return df_all


### Create training data

In [132]:
# bins = [[0,1], [1,2], [2,3], [3,4],  [4,5], [5,10], [10,20]]

# features = {
#     'position':{'name':'f|position|',
#                 'bins':bins[0]},
#     'total_points':{'name':'f|points|',
#               'bins':bins},
#     'minutes':{'name':'f|mins|',
#               'bins':bins},
#     'goals':{'name':'f|goals|',
#               'bins':bins},
#     'assists':{'name':'f|goals|',
#               'bins':bins},
#     'clean_sheets':{'name':'f|clean_sheets|',
#               'bins':bins},
#     'bonus':{'name':'f|bonus|',
#               'bins':bins},
#     'was_home':{'name':'f|home|',
#               'bins':bins},
#     'time_horizon':{'name':'f|time_horizon|',
#                     'bins':bins[0:5]},
#     'season_start':{'name':'f|season_start|',
#                      'bins':bins},
#     'fixture_m1':{'name':'f|fixture_m1|',
#                     'bins':bins[0:5]},
#     'fixture_m2':{'name':'f|fixture_m2|',
#                     'bins':bins[0:5]},
#     'fixture_m3':{'name':'f|fixture_m3|',
#                     'bins':bins[0:5]},
#     'fixture_m4':{'name':'f|fixture_m4|',
#                     'bins':bins[0:5]},
#     'fixture_m5':{'name':'f|fixture_m5|',
#                     'bins':bins[0:5]}
#     # were they around then?
# }

In [133]:
        
def process_row(df_all, n, bins, binned_features, vals):
    """Append the features for row ``n`` of ``df_all`` to the lists in ``vals``.

    ``df_all`` is read positionally, so the rows of one player must be
    contiguous and in chronological order for the windows to make sense.

    For every ``[a, b]`` in ``bins`` the window is the rows ``n-b`` up to (not
    including) ``n-a``. A window is usable when row ``n-b`` exists and has the
    same ``name_cleaned`` as row ``n``. For a usable window the mean of each
    binned feature is stored, ``player_exists`` is 1.0, and ``current_season``
    is 1.0 only if row ``n-b`` has the same season as row ``n``. Otherwise all
    of these are stored as 0.0.

    Args:
        df_all: DataFrame with at least ``position``, ``was_home``,
            ``name_cleaned``, ``season`` and the columns in ``binned_features``.
        n: Positional index of the row to process.
        bins: Iterable of ``[a, b]`` pairs describing the look-back windows.
        binned_features: Column names to average over each window.
        vals: dict of feature name -> list; must already contain every key
            this function appends to.

    Returns:
        The same ``vals`` dict, with one value appended to each touched list.
    """
    # Create current features
    vals['f|current|position|'].append(df_all['position'].iloc[n])
    vals['f|current|is_home'].append(df_all['was_home'].iloc[n])

    # Loop through binned features
    for bin_range in bins:
        # Window limits: rows [window_first, window_stop)
        window_stop = n-bin_range[0]
        window_first = n-bin_range[1]
        player = df_all['name_cleaned'].iloc[n]
        row_season = df_all['season'].iloc[n]
        suffix = '|'+str(bin_range[0])+'_to_'+str(bin_range[1])

        # Window is usable only if it starts inside the dataframe on the same player
        usable = (window_first>=0) and (df_all['name_cleaned'].iloc[window_first] == player)

        if not usable:
            # No player in data for this bin, so set vals to 0
            for col in binned_features+['current_season']:
                vals['f|'+col+suffix].append(0.0)
            vals['f|player_exists'+suffix].append(0.0)
            continue

        # Bin available: store the window average of each feature
        vals['f|player_exists'+suffix].append(1.0)
        for col in binned_features:
            window_mean = df_all[col].iloc[window_first:window_stop].mean()
            vals['f|'+col+suffix].append(window_mean)

        # Flag whether the oldest row of the window is in the same season
        same_season = df_all['season'].iloc[window_first] == row_season
        vals['f|current_season'+suffix].append(1.0 if same_season else 0.0)
    return vals





In [134]:

def process_features(df, bins=None):
    '''
    Add windowed ("binned") history features and current-row features to df.

    The frame is sorted by player, season and gameweek so that each player's
    rows are contiguous and chronological, then process_row is called for
    every row of that sorted frame and the resulting lists are attached as
    new columns.

    Parameters
    ----------
    df : DataFrame containing 'name_cleaned', 'season', 'GW', 'position',
         'was_home' and the binned feature columns listed below.
    bins : list of [a, b] look-back windows, counted in rows before the
           current one. [0,1] means just the last game (not the current
           week); [0,2] would be the last two games. Defaults to
           [[0,1], [1,2], [2,3], [3,4], [4,5], [5,10], [10,20]].

    Returns
    -------
    A new, sorted DataFrame (index reset) with the feature columns added.
    The input frame is not modified.
    '''
    if bins is None:
        bins = [[0,1], [1,2], [2,3], [3,4],  [4,5], [5,10], [10,20]]

    print('\nProcessing features...')

    # Dict of lists for storing vals (for speed)
    vals = {}
    binned_features = ['total_points', 'minutes', 'goals_scored', 'assists', 'clean_sheets', 'bonus','was_home']
    for col in binned_features+['current_season','player_exists']:
        for bin_range in bins:
            vals['f|'+col+'|'+str(bin_range[0])+'_to_'+str(bin_range[1])] = []
    for col in ['f|current|position|', 'f|current|is_home']: # ending in '|' means one hot encode
        vals[col] = []

    # Sort to allow "windowing" to calculate stats
    df = df.sort_values(by = ['name_cleaned','season','GW'],
                        ascending = [True,True,True]).reset_index(drop=True)
    print('Processing dataframe binned features')

    # Any one list works for the sanity check: every list grows by one per row
    probe_key = next(iter(vals))
    n_rows = len(df)

    # Loop through all rows of the *sorted* frame, so the windows line up with
    # the rows the features are attached to below
    for n in range(n_rows):
        n_before = len(vals[probe_key])
        vals = process_row(df, n, bins, binned_features, vals)
        if ((n%5000)==0):
            print('...',n,'/',n_rows,'rows complete')

        # Sanity check: exactly one value should have been appended
        if n_before != len(vals[probe_key])-1:
            print('problem at',n)

    for col in vals:
        df[col] = vals[col]
    print('\nFeatures processed!')
    return df



In [135]:
def one_hot_encode(df):
    """One-hot encode every column whose name ends in '|'.

    Each such column ``col`` is replaced by one 0/1 integer column per
    distinct non-missing value ``v``, named ``col + str(v)``. The new columns
    are appended at the end of the frame in sorted order of ``str(v)``, so the
    resulting column order is the same on every run. Rows whose value is
    missing get 0 in all of the generated columns.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A DataFrame with the marked columns replaced by indicator columns. If
        no column name ends in '|', the input object itself is returned.
    """
    print('\nOne-hot encoding all columns ending in "|"...')
    for col in list(df):
        # Only string column names can carry the trailing '|' marker
        if not (isinstance(col, str) and col.endswith('|')):
            continue

        source = df[col]
        # Sort for a reproducible column order (set iteration order is not)
        categories = sorted(source.dropna().unique(), key=str)

        # drop() returns a new frame, so the caller's frame is left untouched
        df = df.drop(columns=[col])
        for category in categories:
            df[col+str(category)] = (source==category).astype(int)
    print('Finished!')
    return df


def get_avg_goals_conceded(df):
    '''
    Build a per-season table of goals conceded by each opponent team.

    The 'goals_scored' values of all rows are summed per
    ('season', 'opponent_name') pair and stored in 'avg_goals_conceded'.
    NOTE(review): despite the column name this is a sum over the season's
    rows, not a per-game average - confirm that this is intended.

    One extra row with season 'default' and opponent_name 'default' is
    appended; its value is the 85th percentile of all the per-team values
    (original author's note: roughly a 17th-placed side).

    Returns a DataFrame with columns 'season', 'opponent_name' and
    'avg_goals_conceded' and a fresh 0..n-1 index.
    '''
    print('\nExtracting avg goals conceded for all teams...')

    # Total goals scored against each team, per season
    conceded = df.groupby(['season','opponent_name'])['goals_scored'].sum()
    avg_gc = conceded.reset_index(name='avg_goals_conceded')

    # Fallback row for teams/seasons that are not in the table
    fallback_value = np.percentile(avg_gc['avg_goals_conceded'], 85)
    fallback_row = pd.DataFrame({'season':['default'],
                                 'opponent_name':['default'],
                                 'avg_goals_conceded':[fallback_value]})

    avg_gc = pd.concat([avg_gc, fallback_row], axis=0, ignore_index=True)
    print('Extracted!')
    return avg_gc




In [139]:
# Run the full pipeline. Each step rebinds the module-level df_all.
# 1) Load and clean the per-gameweek data for all seasons
df_all = load_data()
# 2) Per-season goals-conceded table, built from the loaded data
avg_gc = get_avg_goals_conceded(df_all)
# 3) Add the windowed history features and the current-row features
df_all = process_features(df_all)
# 4) Expand every column whose name ends in '|' into indicator columns
df_all = one_hot_encode(df_all)


Loading data...
Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped

Data loaded!

Extracting avg goals conceded for all teams...
Extracted!

Processing features...
Processing dataframe binned features
... 0 / 165873 rows complete
... 5000 / 165873 rows complete
... 10000 / 165873 rows complete
... 15000 / 165873 rows complete
... 20000 / 165873 rows complete
... 25000 / 165873 rows complete
... 30000 / 165873 rows complete
... 35000 / 165873 rows complete
... 40000 / 165873 rows complete
... 45000 / 165873 rows complete
... 50000 / 165873 rows complete
... 55000 / 165873 rows complete
... 60000 / 165873 rows complete
... 65000 / 165873 rows complete
... 70000 / 

In [140]:
# Inspect the column names of the final frame
list(df_all)

['season',
 'GW',
 'name',
 'position',
 'opponent_name',
 'opponent_team',
 'kickoff_time',
 'was_home',
 'selected',
 'selected_weight',
 'minutes',
 'total_points',
 'saves',
 'bonus',
 'clean_sheets',
 'goals_conceded',
 'goals_scored',
 'assists',
 'red_cards',
 'yellow_cards',
 'name_cleaned',
 'f|total_points|0_to_1',
 'f|total_points|1_to_2',
 'f|total_points|2_to_3',
 'f|total_points|3_to_4',
 'f|total_points|4_to_5',
 'f|total_points|5_to_10',
 'f|total_points|10_to_20',
 'f|minutes|0_to_1',
 'f|minutes|1_to_2',
 'f|minutes|2_to_3',
 'f|minutes|3_to_4',
 'f|minutes|4_to_5',
 'f|minutes|5_to_10',
 'f|minutes|10_to_20',
 'f|goals_scored|0_to_1',
 'f|goals_scored|1_to_2',
 'f|goals_scored|2_to_3',
 'f|goals_scored|3_to_4',
 'f|goals_scored|4_to_5',
 'f|goals_scored|5_to_10',
 'f|goals_scored|10_to_20',
 'f|assists|0_to_1',
 'f|assists|1_to_2',
 'f|assists|2_to_3',
 'f|assists|3_to_4',
 'f|assists|4_to_5',
 'f|assists|5_to_10',
 'f|assists|10_to_20',
 'f|clean_sheets|0_to_1',
 'f