### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load FPL data

In [2]:
def process_name(name):
    """Turn a raw player name into a space-separated display name.

    Underscores are treated as word separators. A trailing token is removed
    only when it is purely numeric (i.e. looks like a player id), so
    ``'First_Last_123'`` and ``'First_Last'`` both become ``'First Last'``.
    Names that contain no underscore are returned unchanged.

    Parameters
    ----------
    name : str
        Raw name, e.g. ``'First_Last_123'``, ``'First_Last'`` or ``'First Last'``.

    Returns
    -------
    str
        The cleaned name.
    """
    parts = name.split('_')
    # Previously the last token was always dropped, which cut the surname off
    # names that carry no id suffix ('First_Last' -> 'First').
    if len(parts) > 1 and parts[-1].isdigit():
        parts = parts[:-1]
    return ' '.join(parts)

In [3]:
def get_team_names_dict(year, fpl_dir, encoding):
    """Build a ``{team id: team name}`` lookup for one season.

    Parameters
    ----------
    year : str
        Season label such as ``'2021-22'``.
    fpl_dir : str
        Root of the FPL data checkout. Paths are built by plain string
        concatenation, so it must end with a path separator.
    encoding : str
        Encoding used when reading the per-season ``teams.csv``. The master
        team list is read with the pandas default.

    Returns
    -------
    dict
        Maps each team id to its name for ``year``.
    """
    # These seasons have their own teams.csv, keyed by 'id' and 'name'.
    # '2021-2022' was the original (misspelled) key; the rest of the notebook
    # uses '2021-22', so that season never reached this branch. Both
    # spellings are accepted to stay backward compatible.
    if year in ('2022-23', '2021-22', '2021-2022'):
        season_teams = pd.read_csv(fpl_dir+'data/'+year+'/teams.csv', encoding=encoding)
        team_ids = season_teams['id']
        team_names = season_teams['name']
    else:
        # Other seasons are looked up in the shared master list, filtered by season.
        master = pd.read_csv(fpl_dir+'data/master_team_list.csv')
        season_rows = master[master['season']==year]
        team_ids = season_rows['team']
        team_names = season_rows['team_name']

    return dict(zip(team_ids, team_names))


In [60]:
def get_player_positions_dict(year, fpl_dir, encoding):
    """Return a ``{player name: position}`` lookup for the seasons that need one.

    Only '2016-17' to '2019-20' are handled; for those seasons positions are
    read from ``players_raw.csv``. Any other season yields ``None``.

    Parameters
    ----------
    year : str
        Season label such as ``'2018-19'``.
    fpl_dir : str
        Root of the FPL data checkout; must end with a path separator because
        paths are built by string concatenation.
    encoding : str
        Accepted but not used: the file is always read as ``'utf_8'``.
        NOTE(review): confirm this is intentional rather than an oversight.

    Returns
    -------
    dict or None
        Keys are ``first_second_id`` for '2018-19' and '2019-20', and
        ``first_second`` for the two earlier seasons. Values are one of
        'GKP', 'DEF', 'MID', 'FWD'.
    """
    if year not in ['2019-20', '2018-19', '2017-18', '2016-17']:
        return None

    code_to_position = {'1': 'GKP', '2': 'DEF', '3': 'MID', '4': 'FWD'}
    raw_players = pd.read_csv(fpl_dir+'data/'+year+'/players_raw.csv', encoding='utf_8')

    # The two most recent of these seasons append the player id to the key;
    # the two earlier ones use first and second name only.
    append_id = year in ['2019-20', '2018-19']

    positions_by_name = {}
    player_rows = zip(raw_players['first_name'],
                      raw_players['second_name'],
                      raw_players['id'],
                      raw_players['element_type'])
    for first, second, player_id, type_code in player_rows:
        position = code_to_position[str(type_code)]
        key = first+'_'+second
        if append_id:
            key = key+'_'+str(player_id)
        positions_by_name[key] = position

    return positions_by_name




In [80]:
def load_data(fpl_dir = '/Users/dominicbates/Documents/Github/Fantasy-Premier-League/'):
    """Load and combine the per-gameweek player data for every season.

    For each season this reads ``data/<season>/gws/merged_gw.csv``, adds the
    opponent's team name, fills in player positions where a lookup is
    available, computes ``selected_weight`` (``selected`` divided by that
    season's mean ``selected``) and keeps a fixed set of columns. The seasons
    are then concatenated, sorted, given a ``name_cleaned`` column, and every
    row containing a null in any column is dropped.

    Parameters
    ----------
    fpl_dir : str
        Root of the FPL data checkout. It must end with a path separator
        because paths are built by string concatenation. The default is a
        machine-specific absolute path; pass your own location explicitly.

    Returns
    -------
    pandas.DataFrame
        One row per player per fixture, with a fresh 0..n-1 index.

    Notes
    -----
    Progress and the number of dropped rows are printed to stdout.
    """
    # Columns kept from every season
    cols = ['season','GW','name','position','opponent_name','opponent_team', 'kickoff_time', 'was_home', 'selected','selected_weight','minutes','total_points','saves','bonus','clean_sheets','goals_conceded','goals_scored','assists','red_cards','yellow_cards']

    all_years = ['2022-23', '2021-22', '2020-21','2019-20', '2018-19','2017-18','2016-17']
    print('Loading data for years:',all_years)

    # Collect the per-season frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    season_frames = []
    for year in all_years:
        print('... Processing year:',year)

        # Set encoding to avoid errors
        if year in ['2018-19', '2017-18', '2016-17']:
            encoding = 'latin-1'
        else:
            encoding = 'utf_8'

        # Load season data
        df_tmp = pd.read_csv(fpl_dir+'data/'+year+'/gws/merged_gw.csv', encoding=encoding)
        df_tmp['season'] = year

        # Get team names (an unknown team id raises KeyError on purpose)
        id_to_team = get_team_names_dict(year, fpl_dir, encoding)
        df_tmp['opponent_name'] = [id_to_team[team] for team in df_tmp['opponent_team']]

        # Get positions; names missing from the lookup become None and are
        # removed later by the null filter. dict.get avoids rebuilding a list
        # of all keys for every row.
        name_to_pos = get_player_positions_dict(year, fpl_dir, encoding)
        if name_to_pos is not None:
            df_tmp['position'] = [name_to_pos.get(name) for name in df_tmp['name']]
        # Normalise the goalkeeper label so all seasons use 'GKP'
        df_tmp.loc[(df_tmp['position'] == 'GK'),'position'] = 'GKP'

        # Set selected weight (relative to the season mean)
        df_tmp['selected_weight'] = df_tmp['selected'] / df_tmp['selected'].mean()

        # Keep the final column set for this season
        season_frames.append(df_tmp[cols])

    df_all = pd.concat(season_frames)

    # Clean up final dataframe
    df_all = df_all.sort_values(['season','GW','opponent_name','kickoff_time','name'])
    df_all['name_cleaned'] = [process_name(name) for name in df_all['name']]

    print('\nDropping Nulls')
    print('... Size:',len(df_all))
    m_nonulls = pd.notnull(df_all).all(axis=1)
    df_all = df_all[m_nonulls].reset_index(drop=True)
    print('... New Size:',len(df_all))
    print('...',(m_nonulls==0).sum(),'rows dropped')

    return df_all


### Create training data

In [81]:
# Load every season into a single cleaned frame. load_data() is called with
# its default fpl_dir, which is a machine-specific absolute path.
df_all = load_data()

Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped


In [79]:
# Inspect selected_weight: each row's 'selected' count divided by the mean
# 'selected' of its season (computed inside load_data).
df_all['selected_weight']

0         1.505809
1         1.707807
2         0.000000
3         0.014097
4         0.777986
            ...   
165868    0.004748
165869    0.082258
165870    0.234069
165871    0.215003
165872    7.554142
Name: selected_weight, Length: 165873, dtype: float64

In [29]:
# [lower, upper] ranges shared by the feature definitions below.
# NOTE(review): presumably these are look-back windows in gameweeks — confirm.
bins = [[0,1], [1,2], [2,3], [3,4],  [4,5], [5,10], [10,20]]

# Feature specification: each entry gives the prefix used for the generated
# feature names and the bins it is computed over. Prefixes must be unique,
# otherwise two features would produce clashing names.
# NOTE(review): the 'goals' key does not match the 'goals_scored' column that
# load_data keeps — verify against the code that consumes this dict.
features = {
    'total_points':{'name':'f|points|',
              'bins':bins},
    'minutes':{'name':'f|mins|',
              'bins':bins},
    'goals':{'name':'f|goals|',
              'bins':bins},
    # Was 'f|goals|' (copy-paste), which collided with the goals feature
    'assists':{'name':'f|assists|',
              'bins':bins},
    'clean_sheets':{'name':'f|clean_sheets|',
              'bins':bins},
    'was_home':{'name':'f|home|',
              'bins':bins},
    'bonus':{'name':'f|bonus|',
              'bins':bins},
    # The remaining features only use the first five (unit-width) bins
    'time_horizon':{'name':'f|time_horizon|',
                    'bins':bins[0:5]},
    'fixture_m1':{'name':'f|fixture_m1|',
                    'bins':bins[0:5]},
    'fixture_m2':{'name':'f|fixture_m2|',
                    'bins':bins[0:5]},
    'fixture_m3':{'name':'f|fixture_m3|',
                    'bins':bins[0:5]},
    'fixture_m4':{'name':'f|fixture_m4|',
                    'bins':bins[0:5]},
    'fixture_m5':{'name':'f|fixture_m5|',
                    'bins':bins[0:5]}
    # TODO: what about the fixture itself?
}

In [79]:
# Count rows per season, listed from most to fewest rows.
# NOTE(review): the recorded output below lists only five seasons totalling
# 117,018 rows, whereas the load cell reports 165,873 rows across seven
# seasons — the output looks stale; re-run from a fresh kernel to confirm.
from collections import Counter
Counter(df_all['season']).most_common()

[('2022-23', 25667),
 ('2021-22', 25447),
 ('2020-21', 24365),
 ('2019-20', 22560),
 ('2018-19', 18979)]