In [3]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Read the three raw CSV exports into DataFrames, in the order the names are unpacked
raw_paths = (
    '../data/raw/elements.csv',
    '../data/raw/element_types.csv',
    '../data/raw/teams.csv',
)
elements_df, element_types_df, teams_df = map(pd.read_csv, raw_paths)

# Preview the first rows of each table (display renders each argument in turn)
display(elements_df.head(), element_types_df.head(), teams_df.head())

Unnamed: 0,chance_of_playing_next_round,chance_of_playing_this_round,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,dreamteam_count,element_type,ep_next,...,now_cost_rank,now_cost_rank_type,form_rank,form_rank_type,points_per_game_rank,points_per_game_rank_type,selected_rank,selected_rank_type,starts_per_90,clean_sheets_per_90
0,0.0,75.0,438098,-1,1,-1,1,0,3,0.0,...,170,107,601,271,601,271,567,246,0.0,0.0
1,75.0,,205651,-1,1,-2,2,0,4,0.8,...,37,17,625,68,625,68,175,30,0.0,0.0
2,,,226597,0,0,0,0,0,2,7.0,...,81,5,31,8,35,10,28,10,1.0,1.0
3,,,219847,0,0,1,-1,1,4,8.0,...,12,4,17,3,18,3,18,5,1.0,1.0
4,0.0,0.0,463748,0,0,0,0,0,1,0.0,...,608,63,380,45,380,45,507,63,0.0,0.0


Unnamed: 0,id,plural_name,plural_name_short,singular_name,singular_name_short,squad_select,squad_min_select,squad_max_select,squad_min_play,squad_max_play,ui_shirt_specific,sub_positions_locked,element_count
0,1,Goalkeepers,GKP,Goalkeeper,GKP,2,,,1,1,True,[12],67
1,2,Defenders,DEF,Defender,DEF,5,,,3,5,False,[],207
2,3,Midfielders,MID,Midfielder,MID,5,,,2,5,False,[],287
3,4,Forwards,FWD,Forward,FWD,3,,,1,3,False,[],72


Unnamed: 0,code,draw,form,id,loss,name,played,points,position,short_name,...,team_division,unavailable,win,strength_overall_home,strength_overall_away,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away,pulse_id
0,3,0,,1,0,Arsenal,0,0,0,ARS,...,,False,0,1350,1380,1370,1370,1330,1390,1
1,7,0,,2,0,Aston Villa,0,0,0,AVL,...,,False,0,1120,1245,1110,1140,1130,1350,2
2,91,0,,3,0,Bournemouth,0,0,0,BOU,...,,False,0,1100,1100,1075,1100,1130,1105,127
3,94,0,,4,0,Brentford,0,0,0,BRE,...,,False,0,1100,1100,1105,1095,1100,1110,130
4,36,0,,5,0,Brighton,0,0,0,BHA,...,,False,0,1100,1100,1100,1105,1100,1100,131


In [11]:
# Report how many values are missing in each column
null_counts = elements_df.isnull().sum()
print("Missing values in elements_df:")
print(null_counts)

# Zero-fill the listed stat columns; passing a dict to fillna leaves every
# other column (and its NaNs) untouched
zero_fill_columns = ('goals_scored', 'assists', 'clean_sheets')
elements_df = elements_df.fillna(dict.fromkeys(zero_fill_columns, 0))

# Print the head of the frame to check the result
print("\nFirst few rows of the updated DataFrame:")
print(elements_df.head())


Missing values in elements_df:
chance_of_playing_next_round    486
chance_of_playing_this_round    511
code                              0
cost_change_event                 0
cost_change_event_fall            0
                               ... 
points_per_game_rank_type         0
selected_rank                     0
selected_rank_type                0
starts_per_90                     0
clean_sheets_per_90               0
Length: 88, dtype: int64

First few rows of the updated DataFrame:
   chance_of_playing_next_round  chance_of_playing_this_round    code  \
0                           0.0                          75.0  438098   
1                          75.0                           NaN  205651   
2                           NaN                           NaN  226597   
3                           NaN                           NaN  219847   
4                           0.0                           0.0  463748   

   cost_change_event  cost_change_event_fall  cost_change_start  \


In [12]:
# Cast the cost to float and scale it down by a factor of 10, and expose the
# numeric element_type code as a categorical `position` column.
# NOTE: this cell is not idempotent - re-running it divides now_cost by 10 again.
elements_df = elements_df.assign(
    now_cost=lambda frame: frame['now_cost'].astype(float).div(10),
    position=lambda frame: frame['element_type'].astype('category'),
)

# Print the head of the frame to check the result
print("\nFirst few rows of the updated DataFrame:")
print(elements_df.head())


First few rows of the updated DataFrame:
   chance_of_playing_next_round  chance_of_playing_this_round    code  \
0                           0.0                          75.0  438098   
1                          75.0                           NaN  205651   
2                           NaN                           NaN  226597   
3                           NaN                           NaN  219847   
4                           0.0                           0.0  463748   

   cost_change_event  cost_change_event_fall  cost_change_start  \
0                 -1                       1                 -1   
1                 -1                       1                 -2   
2                  0                       0                  0   
3                  0                       0                  1   
4                  0                       0                  0   

   cost_change_start_fall  dreamteam_count  element_type  ep_next  ...  \
0                       1                0

In [13]:
# Create additional features.
#
# total_points / minutes is a per-minute rate, not a per-game one, so it is
# stored (scaled to 90 minutes, like the dataset's other *_per_90 columns)
# under its own name rather than written into `points_per_game`.
# NOTE(review): the raw feed appears to ship its own `points_per_game` (there
# are points_per_game_rank columns) - confirm; it is left untouched here.
#
# Zero denominators are replaced by NaN first so the ratios come out as
# missing values instead of +/-inf.
minutes_played = elements_df['minutes'].replace(0, np.nan)
cost = elements_df['now_cost'].replace(0, np.nan)

elements_df['points_per_90'] = elements_df['total_points'] / minutes_played * 90
elements_df['value_for_money'] = elements_df['total_points'] / cost


In [14]:
# Min-max scale cost and total points into two new columns.
# MinMaxScaler is imported in the notebook's import cell at the top.
# The scaler is fitted on every row currently in elements_df, so the min/max
# reflect the table as it stands at this point in the notebook.
scale_inputs = ['now_cost', 'total_points']
scale_outputs = ['scaled_cost', 'scaled_points']

scaler = MinMaxScaler()
elements_df[scale_outputs] = scaler.fit_transform(elements_df[scale_inputs])


In [15]:
# Merge elements with positions and teams.
#
# Each lookup's key column is renamed to the matching player-table column so
# the merges can use `on=`. Merging with left_on/right_on would copy the
# lookup's `id` key into the result (suffixed _x/_y if the player table has
# its own `id`), and renaming `singular_name` to `position` afterwards would
# duplicate any `position` column that already exists on elements_df.
position_lookup = element_types_df[['id', 'singular_name']].rename(
    columns={'id': 'element_type', 'singular_name': 'position'}
)
team_lookup = teams_df[['id', 'name']].rename(
    columns={'id': 'team', 'name': 'team_name'}
)

elements_df = (
    elements_df
    # Drop stale copies first so the result has exactly one `position` and one
    # `team_name`, and so re-running this cell does not produce suffixed columns.
    .drop(columns=['position', 'team_name'], errors='ignore')
    # validate raises MergeError if a lookup key is duplicated, which would
    # otherwise silently multiply player rows.
    .merge(position_lookup, on='element_type', how='left', validate='many_to_one')
    .merge(team_lookup, on='team', how='left', validate='many_to_one')
)


In [16]:
# Locate the quartiles of total_points and the spread between them
points = elements_df['total_points']
Q1, Q3 = points.quantile(0.25), points.quantile(0.75)
IQR = Q3 - Q1

# Acceptable range: 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose total_points lie inside the range (both ends inclusive)
elements_df = elements_df[points.between(lower_bound, upper_bound)]


In [17]:
# Save the cleaned and processed data
processed_path = Path('../data/processed/elements_processed.csv')

# to_csv does not create missing folders, so make sure the output directory
# exists (no-op when it is already there)
processed_path.parent.mkdir(parents=True, exist_ok=True)

elements_df.to_csv(processed_path, index=False)
print("Processed data saved successfully!")


Processed data saved successfully!
