# 03. Data Processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Importing dataset

df = pd.read_csv('../data/processed/dataset_cleaned.csv')

In [3]:
# Converting result_map to numbers
result_map = {"W": 1, "D": 0, "L": -1}
df["Result"] = df["Result"].map(result_map)

In [4]:
# Feature Engineering GD, xGD, Home

df["GD"] = df["GF"] - df["GA"]
df["xGD"] = df["xG"] - df["xGA"]
df["Home"] = (df["Venue"] == "Home").astype(int)

In [5]:
# Scaling features

from sklearn.preprocessing import StandardScaler

# list of continuous features
scale_cols = ["GF", "GA", "xG", "xGA", "Poss", "Sh", "SoT", 
              "Dist", "FK", "PK", "PKatt", "GD", "xGD", "Round"]

scaler = StandardScaler()

# fit and transform
scaled_values = scaler.fit_transform(df[scale_cols])

# create new columns with suffix "_scaled"
scaled_df = pd.DataFrame(
    scaled_values, 
    columns=[col + "_scaled" for col in scale_cols],
    index=df.index
)

# merge back into df
df = pd.concat([df, scaled_df], axis=1)

In [6]:
# Duplicate matches need to be fixed
    
if 'Venue' in df.columns:
    
    df_home = df[df['Venue'] == 'Home'].copy()
    
    df_home = df_home.rename(columns={
        'Team': 'Home_Team',
        'Opponent': 'Away_Team',
        'GF': 'Home_Goals',
        'GA': 'Away_Goals',
        'xG': 'Home_xG',
        'xGA': 'Away_xG',
        'Poss': 'Home_Poss',
        'Sh': 'Home_Shots',
        'SoT': 'Home_SoT'
    })
    
    away_stats = []
    for idx, row in df_home.iterrows():
        away_match = df[(df['Team'] == row['Away_Team']) & 
                      (df['Opponent'] == row['Home_Team']) & 
                      (df['Round'] == row['Round'])]
        
        if len(away_match) > 0:
            away_match = away_match.iloc[0]
            away_stats.append({
                'Away_Poss': away_match['Poss'],
                'Away_Shots': away_match['Sh'],
                'Away_SoT': away_match['SoT'],
                'Away_xG': away_match['xG'],
                'Away_xGA': away_match['xGA']
            })
        else:
            # If no match found, estimate from home stats
            away_stats.append({
                'Away_Poss': 100 - row['Home_Poss'],
                'Away_Shots': row['Home_Shots'] * 0.8,  # Estimate
                'Away_SoT': row['Home_SoT'] * 0.8,
                'Away_xG': row['Away_xG'],
                'Away_xGA': row['Home_xG']
            })
    
    # Add away stats to dataframe
    away_df = pd.DataFrame(away_stats)
    df_matches = pd.concat([df_home.reset_index(drop=True), away_df], axis=1)
    
    # Convert Result column (W/D/L from home perspective to H/D/A)
    result_map = {'W': 'H', 'D': 'D', 'L': 'A'}
    if df_matches['Result'].dtype == 'object':
        df_matches['Result'] = df_matches['Result'].map(result_map)
    elif df_matches['Result'].dtype in ['int64', 'float64']:
        # If Result is numeric (1=Win, 0=Draw, -1=Loss)
        df_matches['Result'] = df_matches['Result'].map({1: 'H', 0: 'D', -1: 'A'})
    
    

In [11]:
df_matches.to_csv('../data/processed/dataset_processed.csv', index=False)

In [10]:
df_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 39 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Round         380 non-null    int64  
 1   Day           380 non-null    object 
 2   Venue         380 non-null    object 
 3   Result        380 non-null    object 
 4   Home_Goals    380 non-null    int64  
 5   Away_Goals    380 non-null    int64  
 6   Away_Team     380 non-null    object 
 7   Home_xG       380 non-null    float64
 8   Away_xG       380 non-null    float64
 9   Home_Poss     380 non-null    float64
 10  Home_Shots    380 non-null    float64
 11  Home_SoT      380 non-null    float64
 12  Dist          380 non-null    float64
 13  FK            380 non-null    float64
 14  PK            380 non-null    int64  
 15  PKatt         380 non-null    int64  
 16  Home_Team     380 non-null    object 
 17  GD            380 non-null    int64  
 18  xGD           380 non-null    