# FPL Exploratory Data Analysis

In [11]:
import numpy as np
import pandas as pd

## 1. Inspect cleaned_merged_seasons

In [12]:
# Load the cleaned merged-seasons table; the first CSV column becomes the row index.
# NOTE(review): the indented echo of this statement in the cell output looks like the tail of a
# pandas warning (possibly about mixed dtypes) — confirm, and if so pass explicit dtypes.
cleaned_merged_seasons = pd.read_csv(
    filepath_or_buffer='data/cleaned_merged_seasons.csv',
    index_col=0,
)

  cleaned_merged_seasons = pd.read_csv('data/cleaned_merged_seasons.csv', index_col=0)


In [22]:
# List the column names of the cleaned merged-seasons dataframe.
# Both 'round' and 'GW' show up in the output; the next cell checks whether they duplicate each other.
cleaned_merged_seasons.columns

Index(['season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW'],
      dtype='object')

In [15]:
# Check whether the 'round' column equals the 'GW' column in every row of the
# cleaned merged-seasons dataframe. `equals` yields one bool for the whole column
# (see the `True` output) rather than a row-by-row mask.
cleaned_merged_seasons['round'].equals(cleaned_merged_seasons['GW'])

True

The 'round' column is identical to the 'GW' column, so one of them is redundant and can be deleted.

Conclusion:
**The cleaned_merged_seasons data is not updated every game week, so it is better to use the data from the season-specific folders**

## 2. Inspect data in season/gws folder

### 2.1 Inspect gws/gw1 data

In [17]:
# Load the game week 1 file from the 2021-22 season's gws folder.
gws_gw1 = pd.read_csv(filepath_or_buffer='data/2021-22/gws/gw1.csv')

In [18]:
# List the columns of the single game-week file; note it already carries an 'xP' column.
gws_gw1.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'team_a_score',
       'team_h_score', 'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards'],
      dtype='object')

### 2.2 Inspect gws/xP1 data

In [103]:
# Load the standalone xP file for game week 1 of the 2021-22 season.
gws_xP1 = pd.read_csv(filepath_or_buffer='data/2021-22/gws/xP1.csv')

In [104]:
# Preview the first 10 rows; the file holds just an 'id' and an 'xP' value per row.
gws_xP1.head(10)

Unnamed: 0,id,xP
0,1,3.6
1,2,2.0
2,3,3.3
3,4,1.1
4,5,2.7
5,6,0.9
6,7,2.4
7,8,2.7
8,9,3.2
9,10,2.7


#### Check if the data in gws_xP1 is the same as in the 'xP' column of gws_gw1

In [105]:
# Order the xP1 rows by 'id' and renumber the index from 0, so the frame lines up
# row-for-row with the table extracted from gw1 in the next cell.
gws_xP1_sorted = (
    gws_xP1
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [106]:
# Build the matching (id, xP) table from the gw1 data: keep 'element' and 'xP',
# sort by 'element', rename it to 'id' to match the xP1 file's column name,
# and renumber the index from 0.
extracted_xP1_sorted = (
    gws_gw1[['element', 'xP']]
    .sort_values(by='element')
    .rename(columns={'element': 'id'})
    .reset_index(drop=True)
)

In [109]:
# Compare gws_xP1 with the xP values extracted from gw1.
# `equals` is the actual check: it returns a single bool for the two frames.
# The earlier bare `compare(...)` call was removed: it was not the last expression
# of the cell, so its diff was never displayed, and `compare` raises on frames that
# are not identically labelled, which would hide a plain `False` answer.
gws_xP1_sorted.equals(extracted_xP1_sorted)

True

Conclusion:
**The data from the xPN files is already present as the 'xP' column in the game-week data (verified above for game week 1), so there is no need to use these files**

### 2.3 Inspect gws/merged_gw data

In [25]:
# Load the merged game-week file from the 2021-22 season's gws folder.
merged_gw = pd.read_csv(filepath_or_buffer='data/2021-22/gws/merged_gw.csv')

In [26]:
# List the columns of the merged game-week file, for comparison with the single-week gw1 columns.
merged_gw.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'team_a_score',
       'team_h_score', 'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
       'GW'],
      dtype='object')

In [41]:
# Check whether 'round' and 'GW' hold the same data in merged_gw (a single bool for the whole column).
merged_gw['round'].equals(merged_gw['GW'])

True

In [33]:
# Show the columns that are in merged_gw but not in gws_gw1.
# `Index.difference` is one-sided: columns that exist only in gws_gw1 would not be reported here.
merged_gw.columns.difference(gws_gw1.columns)

Index(['GW'], dtype='object')

Conclusion:
**merged_gw.csv contains the data from every game week, so it is preferable to use it instead of loading each game week's file separately**
It contains an additional column, 'GW', which is equal to the 'round' column (delete the redundant column)

## 3. Inspect players data

### 3.1 Inspect players/Player_Name

In [20]:
# Load the gw.csv file of a single player (Jakub Moder) from the 2021-22 players folder.
moder_gw = pd.read_csv(filepath_or_buffer='data/2021-22/players/Jakub_Moder_75/gw.csv')

In [39]:
# List the columns of the single-player game-week file, for comparison with merged_gw.
moder_gw.columns

Index(['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'element',
       'fixture', 'goals_conceded', 'goals_scored', 'ict_index', 'influence',
       'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards'],
      dtype='object')

In [38]:
# Show the columns that are in merged_gw but not in moder_gw.
# `Index.difference` is one-sided: columns that exist only in moder_gw would not be reported here.
merged_gw.columns.difference(moder_gw.columns)

Index(['GW', 'name', 'position', 'team', 'xP'], dtype='object')

Conclusion:
**It looks like there is also no point in using the data from the 'players' folder**
Each player has his own 'gw.csv' file with data from each game week, but it is preferable to get that data from the 'merged_gw.csv' file, where all of that information is in a single file.
There is also a 'history.csv' file for every player, which is an overview of the previous season, but it is also not needed since I use more detailed data from previous seasons

## 4. Inspect understat data

### 4.1 Inspect understat/Player_Name

In [113]:
# Load the understat file of a single player (Jakub Moder) from the 2021-22 understat folder.
moder_understat = pd.read_csv(filepath_or_buffer='data/2021-22/understat/Jakub_Moder_9284.csv')

In [115]:
# Preview the first 10 rows; each row carries a match date, the home/away teams and goals,
# plus shot, xG and xA style columns (see the header in the output).
moder_understat.head(10)

Unnamed: 0,goals,shots,xG,time,position,h_team,a_team,h_goals,a_goals,date,id,season,roster_id,xA,assists,key_passes,npg,npxG,xGChain,xGBuildup
0,0,0,0.0,10,Sub,Brighton,Tottenham,0,2,2022-03-16,16528,2021,520734,0.0,0,0,0,0.0,0.0,0.0
1,0,0,0.0,62,MC,Newcastle United,Brighton,2,1,2022-03-05,16651,2021,517569,0.138703,0,1,0,0.0,0.138703,0.0
2,0,1,0.046313,65,MC,Brighton,Aston Villa,0,2,2022-02-26,16638,2021,516112,0.022925,0,1,0,0.046313,0.069237,0.0
3,0,0,0.0,57,MR,Brighton,Burnley,0,3,2022-02-19,16628,2021,514181,0.0,0,0,0,0.0,0.0,0.0
4,0,3,0.380553,90,AMC,Manchester United,Brighton,2,0,2022-02-15,16549,2021,513778,0.314073,0,1,0,0.380553,1.063944,0.369318
5,0,3,0.308855,90,MC,Watford,Brighton,0,2,2022-02-12,16625,2021,512693,0.293531,0,1,0,0.308855,0.48225,0.0
6,0,1,0.017743,90,AMC,Leicester,Brighton,1,1,2022-01-23,16602,2021,510224,0.0,0,0,0,0.017743,0.239173,0.22143
7,0,1,0.094756,90,FW,Brighton,Chelsea,1,1,2022-01-18,16607,2021,508923,0.0,0,0,0,0.094756,0.55049,0.455734
8,0,2,0.53066,90,AMC,Brighton,Crystal Palace,1,1,2022-01-14,16587,2021,507825,0.0,0,0,0,0.53066,0.53066,0.0
9,0,0,0.0,6,Sub,Everton,Brighton,2,3,2022-01-02,16580,2021,505591,0.0,0,0,0,0.0,0.0,0.0
