# Feature Engineering & Preprocessing Pipeline

**DAMA Hackathon 2026**  
Builds the shared preprocessing pipeline. Outputs `../outputs/processed_data.pkl`.

Two regression targets are constructed:
- **`market_value_million_eur`** — original target (serves as a negative-control experiment)
- **FIFA Performance Value Index (FPVI, column `fpvi`)** — domain-informed composite (primary regression target)


In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Relative path to the raw player dataset (CSV).
DATA_PATH = Path('../data/fifa_player_performance_market_value.csv')
# Load the whole file; `df` is the working frame that later cells add columns to.
df = pd.read_csv(DATA_PATH)
print(f'Loaded {len(df)} rows, {len(df.columns)} columns')
# Trailing expression: display the first three rows as the cell output.
df.head(3)

Loaded 2800 rows, 16 columns


Unnamed: 0,player_id,player_name,age,nationality,club,position,overall_rating,potential_rating,matches_played,goals,assists,minutes_played,market_value_million_eur,contract_years_left,injury_prone,transfer_risk_level
0,1,Player_1,23,Germany,Liverpool,ST,65,87,8,6,14,2976,122.51,3,No,Low
1,2,Player_2,36,England,FC Barcelona,ST,90,76,19,3,18,2609,88.47,5,No,High
2,3,Player_3,31,France,Juventus,RB,75,91,34,12,15,1158,20.24,3,No,Medium


## 1. Feature Engineering

In [2]:
# --- Safe per-90 stats (floor denominator at 1 full game to prevent outliers) ---
# `min_games` is the number of 90-minute equivalents played, never below 1.0,
# so players with very few minutes cannot produce extreme per-90 rates.
min_games = np.maximum(df['minutes_played'] / 90, 1.0)
df['goals_per_90']      = (df['goals']   / min_games).clip(0, 4)
df['assists_per_90']    = (df['assists'] / min_games).clip(0, 4)
df['contributions_p90'] = df['goals_per_90'] + df['assists_per_90']

# --- Development potential ---
# rating_gap is negative when the current rating already exceeds the potential.
df['rating_gap']         = df['potential_rating'] - df['overall_rating']
df['rating_x_potential'] = df['overall_rating'] * df['potential_rating']

# --- Age–rating ratio ---
df['age_rating_ratio'] = df['overall_rating'] / (df['age'] + 1)

# --- Contract scarcity flag ---
# 1 when at most one contract year is left, else 0.
df['expiring_soon'] = (df['contract_years_left'] <= 1).astype(int)

# --- Position group ---
# Positions missing from this map become NaN in `position_group`.
pos_map = {
    'GK': 'Goalkeeper',
    'CB': 'Defender', 'LB': 'Defender', 'RB': 'Defender',
    'CDM': 'Midfielder', 'CM': 'Midfielder',
    'LW': 'Attacker', 'RW': 'Attacker', 'ST': 'Attacker'
}
df['position_group'] = df['position'].map(pos_map)

# ----------------------------------------------------------------
# FIFA Performance Value Index (FPVI) — primary regression target
# Formula: age-adjusted rating + potential upside + production stats
# ----------------------------------------------------------------
# Age factor: 1.0 up to age 26, Gaussian-style decay afterwards, floored at 0.1.
age_fac     = np.exp(-0.08 * np.maximum(0, df['age'] - 26) ** 2).clip(0.1, 1.0)
rating_norm = (df['overall_rating'] - 60) / 34   # 0→1 for rating range 60–94
# Only positive gaps (potential above current rating) earn an upside premium.
pot_gap     = np.maximum(0, df['rating_gap'])

fpvi_raw = (
    rating_norm * 100 * age_fac          # quality × age-prime, up to 100 M€
    + pot_gap * 1.2 * age_fac            # development upside premium
    + df['goals_per_90'] * 8             # goal-scoring threat
    + df['assists_per_90'] * 5           # creativity
)

# Multiplicative-style noise: standard deviation is 10% of each player's raw score.
# np.random.normal raises ValueError for a negative scale, and fpvi_raw goes
# negative whenever overall_rating < 60 outweighs the other (non-negative) terms.
# Taking the magnitude keeps the draw valid and is a no-op when every raw score
# is already non-negative.
np.random.seed(42)
noise_scale = np.abs(fpvi_raw.values) * 0.10
noise = np.random.normal(0, noise_scale)

# Final index is bounded to [0.5, 200]; log1p versions are stored alongside the
# raw targets.
df['fpvi']          = (fpvi_raw + noise).clip(0.5, 200)
df['log_fpvi']      = np.log1p(df['fpvi'])
df['log_market_value'] = np.log1p(df['market_value_million_eur'])

print('Feature engineering done.')
df[['goals_per_90', 'assists_per_90', 'rating_gap', 'fpvi']].describe().T

Feature engineering done.


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
goals_per_90,2800.0,1.210629,1.201005,0.0,0.365538,0.774478,1.550303,4.0
assists_per_90,2800.0,0.879865,1.078357,0.0,0.232095,0.485546,0.949724,4.0
rating_gap,2800.0,4.697143,13.970688,-29.0,-5.0,4.0,15.0,37.0
fpvi,2800.0,48.870671,34.151806,1.702228,17.735045,42.892976,72.735202,181.393449


## 2. Encoding

In [3]:
# transfer_risk_level → integer rank using the explicit order Low=0, Medium=1, High=2
risk_order = ['Low', 'Medium', 'High']
risk_enc = OrdinalEncoder(categories=[risk_order])
risk_codes = risk_enc.fit_transform(df[['transfer_risk_level']])
df['transfer_risk_encoded'] = risk_codes.astype(int)

# injury_prone → 1 for 'Yes', 0 for every other value
df['injury_prone_bin'] = df['injury_prone'].eq('Yes').astype(int)

# Expand the nominal columns into 0/1 indicator columns, keeping every level
# (drop_first=False); the source columns are replaced by their dummies.
one_hot_cols = ['position', 'nationality', 'club', 'position_group']
df_enc = pd.get_dummies(df, columns=one_hot_cols, drop_first=False, dtype=int)

print(f'Shape after encoding: {df_enc.shape}')

Shape after encoding: (2800, 53)


## 3. Feature Sets

In [4]:
# Identifier / raw-text columns that are never model inputs.
drop_always = ['player_id', 'player_name', 'injury_prone', 'transfer_risk_level']
# Every column used as a prediction target (regression and classification).
targets     = ['market_value_million_eur', 'log_market_value',
                'fpvi', 'log_fpvi', 'transfer_risk_encoded']

# Keep df_enc's column order; anything not explicitly excluded is a feature.
excluded = set(drop_always) | set(targets)
feature_cols = [col for col in df_enc.columns if col not in excluded]

print(f'Feature count: {len(feature_cols)}')
print(feature_cols)

Feature count: 44
['age', 'overall_rating', 'potential_rating', 'matches_played', 'goals', 'assists', 'minutes_played', 'contract_years_left', 'goals_per_90', 'assists_per_90', 'contributions_p90', 'rating_gap', 'rating_x_potential', 'age_rating_ratio', 'expiring_soon', 'injury_prone_bin', 'position_CB', 'position_CDM', 'position_CM', 'position_GK', 'position_LB', 'position_LW', 'position_RB', 'position_RW', 'position_ST', 'nationality_Argentina', 'nationality_Brazil', 'nationality_England', 'nationality_France', 'nationality_Germany', 'nationality_Netherlands', 'nationality_Portugal', 'nationality_Spain', 'club_Bayern Munich', 'club_FC Barcelona', 'club_Juventus', 'club_Liverpool', 'club_Manchester City', 'club_PSG', 'club_Real Madrid', 'position_group_Attacker', 'position_group_Defender', 'position_group_Goalkeeper', 'position_group_Midfielder']


## 4. Train / Validation / Test Split (70 / 15 / 15)

In [5]:
RANDOM_STATE = 42

# Feature matrix and the five target vectors, all row-aligned with df_enc.
X       = df_enc[feature_cols]
y_mv_l  = df_enc['log_market_value']
y_mv_r  = df_enc['market_value_million_eur']
y_fp_l  = df_enc['log_fpvi']
y_fp_r  = df_enc['fpvi']
y_cls   = df_enc['transfer_risk_encoded']

# train_test_split returns [a_first, a_second, b_first, b_second, ...] in the
# order the arrays were passed, so even slots hold the first partition of each
# array and odd slots hold the second.

# Pass 1: 70% train / 30% holdout, stratified on the risk class.
first_pass = train_test_split(
    X, y_mv_l, y_mv_r, y_fp_l, y_fp_r, y_cls,
    test_size=0.30, random_state=RANDOM_STATE, stratify=y_cls
)
X_tr, y_mv_l_tr, y_mv_r_tr, y_fp_l_tr, y_fp_r_tr, y_cls_tr = first_pass[0::2]
X_tmp, y_mv_l_tmp, y_mv_r_tmp, y_fp_l_tmp, y_fp_r_tmp, y_cls_tmp = first_pass[1::2]

# Pass 2: halve the holdout into validation and test (15% / 15% overall),
# again stratified on the risk class.
second_pass = train_test_split(
    X_tmp, y_mv_l_tmp, y_mv_r_tmp, y_fp_l_tmp, y_fp_r_tmp, y_cls_tmp,
    test_size=0.50, random_state=RANDOM_STATE, stratify=y_cls_tmp
)
X_val, y_mv_l_val, y_mv_r_val, y_fp_l_val, y_fp_r_val, y_cls_val = second_pass[0::2]
X_te, y_mv_l_te, y_mv_r_te, y_fp_l_te, y_fp_r_te, y_cls_te = second_pass[1::2]

print(f'Train: {len(X_tr)} | Val: {len(X_val)} | Test: {len(X_te)}')

Train: 1960 | Val: 420 | Test: 420


## 5. Scaling

In [6]:
# Fit the scaler on the training split only, then reuse those statistics for
# the validation and test splits.
scaler = StandardScaler().fit(X_tr)


def _to_scaled_frame(frame):
    """Return `frame` standardised by `scaler`, keeping its row index and feature names."""
    return pd.DataFrame(scaler.transform(frame), columns=feature_cols, index=frame.index)


Xsc_tr, Xsc_val, Xsc_te = (_to_scaled_frame(part) for part in (X_tr, X_val, X_te))
print('Scaling done.')

Scaling done.


## 6. Save Processed Bundle

In [7]:
# Single source of truth for the output location (used for writing and logging).
OUTPUT_PATH = Path('../outputs/processed_data.pkl')

# Everything downstream notebooks need: the three splits (raw and scaled),
# every target vector per split, and the fitted preprocessing objects.
bundle = {
    # Unscaled (tree models)
    'X_train': X_tr,   'X_val': X_val,   'X_test': X_te,
    # Scaled (linear models)
    'X_train_sc': Xsc_tr, 'X_val_sc': Xsc_val, 'X_test_sc': Xsc_te,
    # Market value targets
    'y_mv_log_tr': y_mv_l_tr, 'y_mv_log_val': y_mv_l_val, 'y_mv_log_te': y_mv_l_te,
    'y_mv_raw_tr': y_mv_r_tr, 'y_mv_raw_val': y_mv_r_val, 'y_mv_raw_te': y_mv_r_te,
    # FPVI targets
    'y_fpvi_log_tr': y_fp_l_tr, 'y_fpvi_log_val': y_fp_l_val, 'y_fpvi_log_te': y_fp_l_te,
    'y_fpvi_raw_tr': y_fp_r_tr, 'y_fpvi_raw_val': y_fp_r_val, 'y_fpvi_raw_te': y_fp_r_te,
    # Classification
    'y_cls_tr': y_cls_tr, 'y_cls_val': y_cls_val, 'y_cls_te': y_cls_te,
    # Metadata
    'feature_cols': feature_cols,
    'scaler': scaler,
    'risk_encoder': risk_enc,
    'random_state': RANDOM_STATE,
}

# Create ../outputs if it is missing so a fresh checkout survives
# Restart & Run All instead of failing with FileNotFoundError on open().
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

with OUTPUT_PATH.open('wb') as f:
    pickle.dump(bundle, f)

# as_posix() keeps the forward-slash form of the message on every platform.
print(f'Saved → {OUTPUT_PATH.as_posix()}')

Saved → ../outputs/processed_data.pkl
