### Import some stuff

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load some fpl data

In [2]:
# MIGHT NEED CHANGING FOR OLDER YEARS!!!!!!!
def process_name(name):
    if len(name.split('_')) > 1:
        try: 
            int(name.split('_')[-1]) # See if last bit is a number
            out = ' '.join(name.split('_')[0:-1]).lower()
        except:
            out = ' '.join(name.split('_')).lower()
    else:
        out = name.lower()
    return out

In [3]:
def get_team_names_dict(year, fpl_dir, encoding):
    # Get team names (in seperate file for last years)
    if year in ['2022-23', '2021-2022']:
        df_teams_tmp = pd.read_csv(fpl_dir+'data/'+year+'/teams.csv', encoding=encoding)
        df_teams_tmp['team'] = df_teams_tmp['id'] #df_teams_tmp['code'] # Resets to 1-20 from this year onwards
        df_teams_tmp['team_name'] = df_teams_tmp['name']
    else:
        df_teams = pd.read_csv(fpl_dir+'data/master_team_list.csv')
        df_teams_tmp = df_teams[df_teams['season']==year]

    # Turn to dict
    df_teams_dict = {}
    for team, name in zip(df_teams_tmp['team'], df_teams_tmp['team_name']):
        df_teams_dict[team] = name
        
    return df_teams_dict


In [4]:
def get_player_positions_dict(year, fpl_dir, encoding):
    # Get player positions (in seperate file for last years)
    if year in ['2019-20', '2018-19','2017-18','2016-17']:
        
        mapping = {'1':'GKP',
                   '2':'DEF',
                   '3':'MID',
                   '4':'FWD'}

        # Read file
        df_players_tmp = pd.read_csv(fpl_dir+'data/'+year+'/players_raw.csv', encoding='utf_8')
        
        # Turn to mapping dict (with name in correct form)
        df_pos_dict = {}
        if year in ['2019-20','2018-19']:
            for first_name, second_name, p_id, element_type in zip(df_players_tmp['first_name'], 
                                                                   df_players_tmp['second_name'],
                                                                   df_players_tmp['id'],
                                                                   df_players_tmp['element_type']):
                df_pos_dict[first_name+'_'+second_name+'_'+str(p_id)] = mapping[str(element_type)]
        
        # Slightly different name format for these earlier two years (no id)
        else:
            for first_name, second_name, p_id, element_type in zip(df_players_tmp['first_name'], 
                                                                   df_players_tmp['second_name'],
                                                                   df_players_tmp['id'],
                                                                   df_players_tmp['element_type']):
                df_pos_dict[first_name+'_'+second_name] = mapping[str(element_type)]
    
    # Return null for other years
    else:
        df_pos_dict = None


    return df_pos_dict




In [5]:
def load_data(fpl_dir = '/Users/dominicbates/Documents/Github/Fantasy-Premier-League/'):
    
    # Set cols
    cols = ['season','GW','name','position','opponent_name','opponent_team', 'kickoff_time', 'was_home', 'selected','selected_weight','minutes','total_points','saves','bonus','clean_sheets','goals_conceded','goals_scored','assists','red_cards','yellow_cards']
    
#     df_teams = pd.read_csv(fpl_dir+'data/master_team_list.csv')
    df_all = pd.DataFrame()

    # Loop through all years
    all_years = ['2022-23', '2021-22', '2020-21','2019-20', '2018-19','2017-18','2016-17']
    print('Loading data for years:',all_years)
    for year in all_years:
        print('... Processing year:',year)

        # Set encoding to avoid errors
        if year in ['2018-19', '2017-18', '2016-17']:
            encoding = 'latin-1'
        else:
            encoding = 'utf_8'

        # Load season data
        df_tmp = pd.read_csv(fpl_dir+'data/'+year+'/gws/merged_gw.csv', encoding=encoding)
        df_tmp['season'] = year

        
        
        # Get team names
        id_to_team = get_team_names_dict(year, fpl_dir, encoding)
        df_tmp['opponent_name'] = [id_to_team[team] for team in df_tmp['opponent_team']]
#         print(list(df_tmp))
        
        # Get positions
        name_to_pos = get_player_positions_dict(year, fpl_dir, encoding)
        if name_to_pos is not None:
            df_tmp['position'] = [name_to_pos[name] if name in list(name_to_pos) else None for name in df_tmp['name']]
        df_tmp.loc[(df_tmp['position'] == 'GK'),'position'] = 'GKP'
        # Set selected weight
        df_tmp['selected_weight'] = df_tmp['selected'] / df_tmp['selected'].mean()

        # Get final columns set and add to dataframe
        df_tmp = df_tmp[cols]
        df_all = pd.concat([df_all, df_tmp])


    # Clean up final dataframe
    df_all = df_all.sort_values(['season','GW','opponent_name','kickoff_time','name'])
    df_all['name_cleaned'] = [process_name(name) for name in df_all['name']]

    print('\nDropping Nulls')
    print('... Size:',len(df_all))
    m_nonulls = pd.notnull(df_all).all(axis=1)
    df_all = df_all[m_nonulls].reset_index(drop=True)
    print('... New Size:',len(df_all))    
    print('...',(m_nonulls==0).sum(),'rows dropped')
    
    return df_all


### Create training data

In [6]:
df_all = load_data()

Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped


In [7]:
set(df_all['name_cleaned'])

{'markus henriksen',
 'joão mário naval costa eduardo',
 'bright enobakhare',
 'boubacar traoré',
 'gylfi sigurdsson',
 'chris long',
 'fredrik ulvestad',
 'filip marschall',
 'ali gabr',
 'lewis bate',
 'antonee robinson',
 'lewis grabban',
 'malcolm ebiowei',
 'emilio nsue lopez',
 'marcus bettinelli',
 'thomas robson',
 'maximillian aarons',
 'dexter lembikisa',
 'illan meslier',
 'sean mcallister',
 'josé reina',
 'massadio haidara',
 'james weir',
 'oleksandr zinchenko',
 'leon balogun',
 'dynel simeu',
 'amadou onana',
 'eduardo dos reis carvalho',
 'zak brunt',
 'callum hudson-odoi',
 'laurent koscielny',
 'dara costelloe',
 'david ospina',
 'christian eriksen',
 'nathan tella',
 'jason puncheon',
 'connor roberts',
 'muhamed bešić',
 'francisco jorge tomás oliveira',
 'bernardo veiga de carvalho e silva',
 'sebastian larsson',
 'wayne routledge',
 'mathias jensen',
 'josé heriberto izquierdo mena',
 'vito mannone',
 'owen dodgson',
 'julien ngoy',
 'kayky da silva chagas',
 'ja

In [8]:
# bins = [[0,1], [1,2], [2,3], [3,4],  [4,5], [5,10], [10,20]]

# features = {
#     'position':{'name':'f|position|',
#                 'bins':bins[0]},
#     'total_points':{'name':'f|points|',
#               'bins':bins},
#     'minutes':{'name':'f|mins|',
#               'bins':bins},
#     'goals':{'name':'f|goals|',
#               'bins':bins},
#     'assists':{'name':'f|goals|',
#               'bins':bins},
#     'clean_sheets':{'name':'f|clean_sheets|',
#               'bins':bins},
#     'bonus':{'name':'f|bonus|',
#               'bins':bins},
#     'was_home':{'name':'f|home|',
#               'bins':bins},
#     'time_horizon':{'name':'f|time_horizon|',
#                     'bins':bins[0:5]},
#     'season_start':{'name':'f|season_start|',
#                      'bins':bins},
#     'fixture_m1':{'name':'f|fixture_m1|',
#                     'bins':bins[0:5]},
#     'fixture_m2':{'name':'f|fixture_m2|',
#                     'bins':bins[0:5]},
#     'fixture_m3':{'name':'f|fixture_m3|',
#                     'bins':bins[0:5]},
#     'fixture_m4':{'name':'f|fixture_m4|',
#                     'bins':bins[0:5]},
#     'fixture_m5':{'name':'f|fixture_m5|',
#                     'bins':bins[0:5]}
#     # were they around then?
# }

In [9]:
        
def process_row(df_all, n, bins, binned_features, vals):
    for bin_range in bins:
        # Get bin limits
        bin_start = n-bin_range[0]
        bin_end = n-bin_range[1]
        name_cleaned = df_all['name_cleaned'].iloc[n]
        season = df_all['season'].iloc[n]

        # Work out if bin possible
        if (bin_end<0) or (df_all['name_cleaned'].iloc[bin_end] != name_cleaned): # If bin outside of dataframe or different player
            # No player in data for this bin, so set vals to 0
            for col in binned_features+['current_season']:
                vals['f|'+col+'|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(0.0)
            vals['f|player_exists|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(0.0)

        # If possible, get bin average for each feature
        else:
            # Bin available
            vals['f|player_exists|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(1.0)
            for col in binned_features:
                mean = df_all[col].iloc[bin_end:bin_start].mean()
                vals['f|'+col+'|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(mean)

            # Create feature to say if bin is (entirely) within this season
            if season != df_all['season'].iloc[bin_end]:
                vals['f|current_season|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(0.0)
            else:
                vals['f|current_season|'+str(bin_range[0])+'_to_'+str(bin_range[1])].append(1.0)
    return vals





In [38]:
# Define range of bins ([0,1] means just last game, i.e. not current week [0,2] would be last two games)
bins = [[0,1], [1,2], [2,3], [3,4],  [4,5], [5,10], [10,20]]

# Dict foer storing vals (for speed)
vals = {}
binned_features = ['total_points', 'minutes', 'goals_scored', 'assists', 'clean_sheets', 'bonus','was_home']
for col in binned_features+['current_season','player_exists']:
    for bin_range in bins:
        vals['f|'+col+'|'+str(bin_range[0])+'_to_'+str(bin_range[1])] = []
        

        
# Sort to allow "windowing" to calculate stats
df_all = df_all.sort_values(by = ['name_cleaned','season','GW'],
                            ascending = [True,True,True]).reset_index(drop=True)
print('Processing dataframe binned features')
# loop through all rows
for n in range(len(df_all)):
    test = len(vals['f|total_points|0_to_1'])
    vals = process_row(df_all, n, bins, binned_features, vals)
    if ((n%1000)==0):
        print('...',n,'/',len(df_all),'rows complete')
        
    if test != len(vals['f|total_points|0_to_1'])-1:
        print('problem at',n)
    
    
    

Processing dataframe binned features
... 0 / 165873 rows complete
... 1000 / 165873 rows complete
... 2000 / 165873 rows complete
... 3000 / 165873 rows complete
... 4000 / 165873 rows complete
... 5000 / 165873 rows complete
... 6000 / 165873 rows complete
... 7000 / 165873 rows complete
... 8000 / 165873 rows complete
... 9000 / 165873 rows complete
... 10000 / 165873 rows complete
... 11000 / 165873 rows complete
... 12000 / 165873 rows complete
... 13000 / 165873 rows complete
... 14000 / 165873 rows complete
... 15000 / 165873 rows complete
... 16000 / 165873 rows complete
... 17000 / 165873 rows complete
... 18000 / 165873 rows complete
... 19000 / 165873 rows complete
... 20000 / 165873 rows complete
... 21000 / 165873 rows complete
... 22000 / 165873 rows complete
... 23000 / 165873 rows complete
... 24000 / 165873 rows complete
... 25000 / 165873 rows complete
... 26000 / 165873 rows complete
... 27000 / 165873 rows complete
... 28000 / 165873 rows complete
... 29000 / 165873 

In [40]:
for col in vals:
    df_all[col] = vals[col]

In [62]:
df_all[df_all['name_cleaned']=='erling haaland'].iloc[0][0:60]

season                                  2022-23
GW                                          1.0
name                             Erling Haaland
position                                    FWD
opponent_name                          West Ham
opponent_team                                19
kickoff_time               2022-08-07T15:30:00Z
was_home                                  False
selected                                3398599
selected_weight                       14.434919
minutes                                      77
total_points                                 13
saves                                         0
bonus                                         3
clean_sheets                                  1
goals_conceded                                0
goals_scored                                  2
assists                                       0
red_cards                                     0
yellow_cards                                  0
name_cleaned                     erling 

In [15]:
list(features)

def create_weekly_features(df_all, bins):
    df_all.Groupby('name_cleaned')
    
['total_points', 'minutes', 'goals', 'assists', 'clean_sheets', 'bonus']

['position',
 'total_points',
 'minutes',
 'goals',
 'assists',
 'clean_sheets',
 'was_home',
 'bonus',
 'time_horizon',
 'season_start',
 'fixture_m1',
 'fixture_m2',
 'fixture_m3',
 'fixture_m4',
 'fixture_m5']

In [63]:
df_all

Unnamed: 0,season,GW,name,position,opponent_name,opponent_team,kickoff_time,was_home,selected,selected_weight,...,f|current_season|4_to_5,f|current_season|5_to_10,f|current_season|10_to_20,f|player_exists|0_to_1,f|player_exists|1_to_2,f|player_exists|2_to_3,f|player_exists|3_to_4,f|player_exists|4_to_5,f|player_exists|5_to_10,f|player_exists|10_to_20
0,2019-20,4.0,Aaron_Connolly_534,FWD,Man City,11,2019-08-31T14:00:00Z,False,0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-20,5.0,Aaron_Connolly_534,FWD,Burnley,5,2019-09-14T14:00:00Z,True,14029,0.077205,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-20,6.0,Aaron_Connolly_534,FWD,Newcastle,13,2019-09-21T16:30:00Z,False,22804,0.125496,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2019-20,7.0,Aaron_Connolly_534,FWD,Chelsea,6,2019-09-28T14:00:00Z,False,32699,0.179950,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,2019-20,8.0,Aaron_Connolly_534,FWD,Spurs,17,2019-10-05T11:30:00Z,True,35026,0.192756,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165868,2020-21,35.0,Ørjan Nyland,GKP,Everton,7,2021-05-13T17:00:00Z,True,276480,1.507178,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
165869,2020-21,35.0,Ørjan Nyland,GKP,Man Utd,13,2021-05-09T13:05:00Z,True,276480,1.507178,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
165870,2020-21,36.0,Ørjan Nyland,GKP,Crystal Palace,6,2021-05-16T11:00:00Z,False,275199,1.500195,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
165871,2020-21,37.0,Ørjan Nyland,GKP,Spurs,17,2021-05-19T17:00:00Z,False,273494,1.490900,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
df_all.groupby(by = ['name_cleaned', 'season', 'GW'], sort = True)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x177937310>

In [25]:
list(df_all)

['season',
 'GW',
 'name',
 'position',
 'opponent_name',
 'opponent_team',
 'kickoff_time',
 'was_home',
 'selected',
 'selected_weight',
 'minutes',
 'total_points',
 'saves',
 'bonus',
 'clean_sheets',
 'goals_conceded',
 'goals_scored',
 'assists',
 'red_cards',
 'yellow_cards',
 'name_cleaned']