# Notebook 3: Feature Engineering
# 
## Transfermarkt Player Value Prediction
### Creating position-specific features for each player group

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


In [2]:
print("=" * 60)
print("FEATURE ENGINEERING")
print("=" * 60)

FEATURE ENGINEERING


In [3]:
# paths
data_path    = "C:\\Users\\Dimitris\\Desktop\\DAMA\\homeworks\\hw dama61\\hw4\\env_nn\\data\\"
outputs_path = "C:\\Users\\Dimitris\\Desktop\\DAMA\\homeworks\\hw dama61\\hw4\\env_nn\\outputs\\"

In [5]:
# Load each position group
gk_df = pd.read_csv(os.path.join(outputs_path, 'merged_data_gk.csv'))
def_df = pd.read_csv(os.path.join(outputs_path, 'merged_data_def.csv'))
mid_df = pd.read_csv(os.path.join(outputs_path, 'merged_data_mid.csv'))
att_df = pd.read_csv(os.path.join(outputs_path, 'merged_data_att.csv'))

print(f"Goalkeepers: {len(gk_df):,} players")
print(f"Defenders: {len(def_df):,} players")
print(f"Midfielders: {len(mid_df):,} players")
print(f"Attackers: {len(att_df):,} players")

Goalkeepers: 3,897 players
Defenders: 10,883 players
Midfielders: 9,888 players
Attackers: 9,390 players


In [57]:
print("\n" + "="*60)
print("üßπ ADDING CLEAN SHEET FEATURES")
print("="*60)

# Load additional data files
games_df = pd.read_csv(os.path.join(data_path, 'games.csv'))
club_games_df = pd.read_csv(os.path.join(data_path, 'club_games.csv'))
appearances_df = pd.read_csv(os.path.join(data_path, 'appearances.csv'))
players_df = pd.read_csv(os.path.join(data_path, 'players.csv'))
print(f"Loaded Players: {len(players_df):,}")
print(f"Loaded games: {len(games_df):,}")
print(f"Loaded club games: {len(club_games_df):,}")
print(f"Loaded appearances: {len(appearances_df):,}")


üßπ ADDING CLEAN SHEET FEATURES
Loaded Players: 34,291
Loaded games: 77,995
Loaded club games: 155,990
Loaded appearances: 1,722,865


In [31]:
print("\n" + "="*60)
print("üîç CHECKING REQUIRED COLUMNS FOR CLEAN SHEETS")
print("="*60)


üîç CHECKING REQUIRED COLUMNS FOR CLEAN SHEETS


In [34]:
print("\nüìä Appearances columns:")
for i, col in enumerate(appearances_df.columns, 1):
    print(f"  {i}. {col}")

print("\nüìä Club games columns:")
for i, col in enumerate(club_games_df.columns, 1):
    print(f"  {i}. {col}")

print("\nüìä Games columns:")
for i, col in enumerate(games_df.columns, 1):
    print(f"  {i}. {col}")


üìä Appearances columns:
  1. appearance_id
  2. game_id
  3. player_id
  4. player_club_id
  5. player_current_club_id
  6. date
  7. player_name
  8. competition_id
  9. yellow_cards
  10. red_cards
  11. goals
  12. assists
  13. minutes_played

üìä Club games columns:
  1. game_id
  2. club_id
  3. own_goals
  4. own_position
  5. own_manager_name
  6. opponent_id
  7. opponent_goals
  8. opponent_position
  9. opponent_manager_name
  10. hosting
  11. is_win

üìä Games columns:
  1. game_id
  2. competition_id
  3. season
  4. round
  5. date
  6. home_club_id
  7. away_club_id
  8. home_club_goals
  9. away_club_goals
  10. home_club_position
  11. away_club_position
  12. home_club_manager_name
  13. away_club_manager_name
  14. stadium
  15. attendance
  16. referee
  17. url
  18. home_club_formation
  19. away_club_formation
  20. home_club_name
  21. away_club_name
  22. aggregate
  23. competition_type


In [59]:
# Store in data dictionary for easy access
data = {
    'appearances_df': appearances_df,
    'games_df': games_df,
    'club_games_df': club_games_df,
    'players_df': players_df
}

In [36]:
print("\n" + "="*60)
print("üßπ CALCULATING CLEAN SHEETS")
print("="*60)


üßπ CALCULATING CLEAN SHEETS


In [None]:
# Let's identify which club ID column to use
club_id_col = None
if 'player_club_id' in appearances_df.columns:
    club_id_col = 'player_club_id'
    print("‚úÖ Using 'player_club_id' from appearances")
elif 'player_current_club_id' in appearances_df.columns:
    club_id_col = 'player_current_club_id' 
    print("‚úÖ Using 'player_current_club_id' from appearances")
# For clean sheets, we need to know when the player's team conceded 0 goals
# Goals conceded = opponent_goals + own_goals
# Clean sheet = (opponent_goals + own_goals) == 0

if club_id_col and 'opponent_goals' in club_games_df.columns:
    print(f"\nMerging appearances with club_games on game_id and {club_id_col}...")
    
    # Merge appearances with club_games to get opponent goals and own goals
    appearances_with_goals = appearances_df.merge(
        club_games_df[['game_id', 'club_id', 'opponent_goals', 'own_goals']], 
        left_on=['game_id', club_id_col],
        right_on=['game_id', 'club_id'],
        how='left'
    )
    print(f"After merge: {len(appearances_with_goals):,} appearances")
    
    # Check if required columns exist
    if 'opponent_goals' in appearances_with_goals.columns and 'own_goals' in appearances_with_goals.columns:
        # Handle missing values
        missing_opponent = appearances_with_goals['opponent_goals'].isna().sum()
        missing_own = appearances_with_goals['own_goals'].isna().sum()
        
        if missing_opponent > 0:
            print(f"‚ö†Ô∏è Missing opponent_goals for {missing_opponent:,} appearances")
            appearances_with_goals['opponent_goals'] = appearances_with_goals['opponent_goals'].fillna(0)

‚úÖ Using 'player_club_id' from appearances

Merging appearances with club_games on game_id and player_club_id...


In [48]:
# For clean sheets, we need to know when the player's team conceded 0 goals
# Goals conceded = opponent_goals + own_goals
# Clean sheet = (opponent_goals + own_goals) == 0

if club_id_col and 'opponent_goals' in club_games_df.columns:
    print(f"\nMerging appearances with club_games on game_id and {club_id_col}...")
    
    # Merge appearances with club_games to get opponent goals and own goals
    appearances_with_goals = appearances_df.merge(
        club_games_df[['game_id', 'club_id', 'opponent_goals', 'own_goals']], 
        left_on=['game_id', club_id_col],
        right_on=['game_id', 'club_id'],
        how='left'
    )
    print(f"After merge: {len(appearances_with_goals):,} appearances")


Merging appearances with club_games on game_id and player_club_id...


After merge: 1,722,865 appearances


In [50]:
# Calculate total goals conceded (opponent goals + own goals)
appearances_with_goals['total_goals_conceded'] = (
    appearances_with_goals['opponent_goals'] + appearances_with_goals['own_goals']
)
        
# Define clean sheet (total_goals_conceded == 0)
appearances_with_goals['is_clean_sheet'] = (appearances_with_goals['total_goals_conceded'] == 0).astype(int)

In [51]:
# Also track games with own goals (for analysis)
appearances_with_goals['had_own_goal'] = (appearances_with_goals['own_goals'] > 0).astype(int)
        
print(f"\nüìä Clean sheet calculation:")
print(f"   - Total appearances: {len(appearances_with_goals):,}")
print(f"   - Clean sheets: {appearances_with_goals['is_clean_sheet'].sum():,} ({appearances_with_goals['is_clean_sheet'].mean()*100:.1f}%)")
print(f"   - Games with own goals: {appearances_with_goals['had_own_goal'].sum():,}")


üìä Clean sheet calculation:
   - Total appearances: 1,722,865
   - Clean sheets: 114,840 (6.7%)
   - Games with own goals: 1,263,135


In [52]:
# Count clean sheets per player
print("\nCalculating clean sheet statistics per player...")
clean_sheet_counts = appearances_with_goals.groupby('player_id').agg(
    clean_sheets=('is_clean_sheet', 'sum'),
    games_with_own_goals=('had_own_goal', 'sum'),
    total_appearances_with_data=('game_id', 'count'),
    avg_opponent_goals=('opponent_goals', 'mean'),
    avg_own_goals=('own_goals', 'mean'),
    total_goals_conceded=('total_goals_conceded', 'sum'),
    avg_goals_conceded=('total_goals_conceded', 'mean')
).reset_index()


Calculating clean sheet statistics per player...


In [53]:
print(f"‚úÖ Calculated clean sheets for {len(clean_sheet_counts):,} players")
        
print("\nClean sheet statistics:")
display(clean_sheet_counts['clean_sheets'].describe())

‚úÖ Calculated clean sheets for 26,489 players

Clean sheet statistics:


count    26489.000000
mean         4.335384
std          5.890968
min          0.000000
25%          0.000000
50%          2.000000
75%          6.000000
max         47.000000
Name: clean_sheets, dtype: float64

In [54]:
 # Show top players by clean sheets
print("\nüèÜ Top 10 players by clean sheets:")
top_clean_sheets = clean_sheet_counts.nlargest(10, 'clean_sheets')


üèÜ Top 10 players by clean sheets:


In [56]:
# Save for later use
clean_sheet_path = os.path.join(outputs_path, 'clean_sheet_counts.csv')
clean_sheet_counts.to_csv(clean_sheet_path, index=False)
print(f"\n‚úÖ Saved clean sheet counts to: {clean_sheet_path}")


‚úÖ Saved clean sheet counts to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\clean_sheet_counts.csv


### 1. Common Features for All Positions
These features apply to all player types

In [6]:
print("\n" + "="*60)
print("1Ô∏è‚É£ CREATING COMMON FEATURES")
print("="*60)


1Ô∏è‚É£ CREATING COMMON FEATURES


In [65]:
# %% [code]
print("\n" + "="*60)
print("1Ô∏è‚É£ CREATING COMMON FEATURES")
print("="*60)

# First, make sure clean_sheet_pct is calculated in clean_sheet_counts
if clean_sheet_counts is not None:
    # Calculate clean sheet percentage if not already present
    if 'clean_sheet_pct' not in clean_sheet_counts.columns:
        clean_sheet_counts['clean_sheet_pct'] = (
            clean_sheet_counts['clean_sheets'] / clean_sheet_counts['total_appearances_with_data']
        ).fillna(0)
        print("‚úÖ Added clean_sheet_pct to clean_sheet_counts")

def add_common_features(df, position_name, clean_sheet_data=None):
    """Add features that are common to all positions"""
    
    # Make a copy to avoid warnings
    df = df.copy()
    
    # Avoid division by zero
    df['minutes_played'] = df['minutes_played'].replace(0, np.nan)
    
    # Per 90 minute stats
    df['goals_per_90'] = (df['goals'] / df['minutes_played'] * 90).fillna(0)
    df['assists_per_90'] = (df['assists'] / df['minutes_played'] * 90).fillna(0)
    df['goal_contributions_per_90'] = df['goals_per_90'] + df['assists_per_90']
    
    # Card stats
    df['cards_per_90'] = ((df['yellow_cards'] + df['red_cards']) / df['minutes_played'] * 90).fillna(0)
    
    # Experience metrics
    df['total_minutes'] = df['minutes_played'].fillna(0)
    df['total_appearances'] = df['appearances_count']
    
    # Age squared (to capture non-linear effects)
    df['age_squared'] = df['age'] ** 2
    
    # Log of market value (for modeling)
    df['log_market_value'] = np.log1p(df['market_value_m'])
    
    # Flag if player has no appearances
    df['has_appearances'] = (df['appearances_count'] > 0).astype(int)
    
    # Flag if player has market value
    df['has_value'] = (df['market_value_m'] > 0).astype(int)
    
    # Add clean sheet data if provided
    if clean_sheet_data is not None:
        # Check which columns exist in clean_sheet_data
        available_cols = ['player_id']
        for col in ['clean_sheets', 'avg_goals_conceded', 'total_goals_conceded', 
                    'games_with_own_goals', 'clean_sheet_pct', 'avg_opponent_goals',
                    'avg_own_goals']:
            if col in clean_sheet_data.columns:
                available_cols.append(col)
        
        print(f"\n{position_name}: Merging with clean sheet columns: {available_cols}")
        
        # Merge with available columns
        df = df.merge(clean_sheet_data[available_cols], on='player_id', how='left')
        
        # Fill missing values
        if 'clean_sheets' in df.columns:
            df['clean_sheets'] = df['clean_sheets'].fillna(0)
        else:
            df['clean_sheets'] = 0
            
        if 'avg_goals_conceded' in df.columns:
            df['avg_goals_conceded'] = df['avg_goals_conceded'].fillna(99)
        else:
            df['avg_goals_conceded'] = 99
            
        if 'total_goals_conceded' in df.columns:
            df['total_goals_conceded'] = df['total_goals_conceded'].fillna(0)
        else:
            df['total_goals_conceded'] = 0
            
        if 'games_with_own_goals' in df.columns:
            df['games_with_own_goals'] = df['games_with_own_goals'].fillna(0)
        else:
            df['games_with_own_goals'] = 0
            
        if 'clean_sheet_pct' in df.columns:
            df['clean_sheet_pct'] = df['clean_sheet_pct'].fillna(0)
        else:
            # Calculate from clean_sheets if available
            if 'clean_sheets' in df.columns and 'total_appearances' in df.columns:
                df['clean_sheet_pct'] = (df['clean_sheets'] / df['total_appearances']).fillna(0)
            else:
                df['clean_sheet_pct'] = 0
        
        # Clean sheets per 90
        df['clean_sheets_per_90'] = (df['clean_sheets'] / (df['minutes_played'] / 90)).fillna(0)
        
        # Goals conceded per 90
        if 'total_goals_conceded' in df.columns:
            df['goals_conceded_per_90'] = (df['total_goals_conceded'] / (df['minutes_played'] / 90)).fillna(99)
        else:
            df['goals_conceded_per_90'] = 99
    
    print(f"\n{position_name}: Added common features")
    print(f"  - goals_per_90: {df['goals_per_90'].mean():.3f}")
    print(f"  - assists_per_90: {df['assists_per_90'].mean():.3f}")
    print(f"  - cards_per_90: {df['cards_per_90'].mean():.3f}")
    if clean_sheet_data is not None:
        print(f"  - clean_sheets: {df['clean_sheets'].mean():.1f} avg per player")
        print(f"  - clean_sheet_pct: {df['clean_sheet_pct'].mean():.3f}")
        print(f"  - clean_sheets_per_90: {df['clean_sheets_per_90'].mean():.3f}")
    
    return df

# Apply to all position groups with clean sheet data
print("\nAdding common features to all position groups...")
gk_df = add_common_features(gk_df, 'Goalkeepers', clean_sheet_counts)
def_df = add_common_features(def_df, 'Defenders', clean_sheet_counts)
mid_df = add_common_features(mid_df, 'Midfielders', clean_sheet_counts)
att_df = add_common_features(att_df, 'Attackers', clean_sheet_counts)


1Ô∏è‚É£ CREATING COMMON FEATURES
‚úÖ Added clean_sheet_pct to clean_sheet_counts

Adding common features to all position groups...

Goalkeepers: Merging with clean sheet columns: ['player_id', 'clean_sheets', 'avg_goals_conceded', 'total_goals_conceded', 'games_with_own_goals', 'clean_sheet_pct', 'avg_opponent_goals', 'avg_own_goals']

Goalkeepers: Added common features
  - goals_per_90: 0.000
  - assists_per_90: 0.001
  - cards_per_90: 0.036
  - clean_sheets: 2.1 avg per player
  - clean_sheet_pct: 0.031
  - clean_sheets_per_90: 0.041

Defenders: Merging with clean sheet columns: ['player_id', 'clean_sheets', 'avg_goals_conceded', 'total_goals_conceded', 'games_with_own_goals', 'clean_sheet_pct', 'avg_opponent_goals', 'avg_own_goals']

Defenders: Added common features
  - goals_per_90: 0.028
  - assists_per_90: 0.045
  - cards_per_90: 0.196
  - clean_sheets: 3.5 avg per player
  - clean_sheet_pct: 0.048
  - clean_sheets_per_90: 0.108

Midfielders: Merging with clean sheet columns: ['

### 2. Goalkeeper-Specific Features

In [66]:
print("\n" + "="*60)
print("2Ô∏è‚É£ CREATING GOALKEEPER-SPECIFIC FEATURES")
print("="*60)


2Ô∏è‚É£ CREATING GOALKEEPER-SPECIFIC FEATURES


In [67]:
def add_gk_features(df):
    """Add goalkeeper-specific features including clean sheets"""
    
    df = df.copy()
    
    # Discipline metric
    df['gk_discipline_per_90'] = df['cards_per_90']
    
    # Offensive contribution (rare for keepers)
    df['gk_offensive_contribution'] = df['goal_contributions_per_90']
    
    # Clean sheet metrics
    if 'clean_sheets' in df.columns:
        # Clean sheet percentage (already calculated)
        df['gk_clean_sheet_pct'] = df['clean_sheet_pct']
        # Save percentage proxy (using clean sheets and goals conceded)
        df['save_percentage_proxy'] = 1 - (df['goals_conceded_per_90'] / (df['goals_conceded_per_90'] + 3))
        df['save_percentage_proxy'] = df['save_percentage_proxy'].clip(0, 1)
        
        # Clean sheet consistency (normalized by appearances)
        df['clean_sheet_consistency'] = df['clean_sheets'] / (df['total_appearances'] + 0.01)
        
        # Goals prevented (compared to average)
        avg_goals_conceded = df['goals_conceded_per_90'].median()
        df['goals_prevented_per_90'] = avg_goals_conceded - df['goals_conceded_per_90']
    
    print("\nGoalkeeper features added:")
    if 'clean_sheets' in df.columns:
        print(f"  - gk_clean_sheet_pct: {df['gk_clean_sheet_pct'].mean():.3f}")
        print(f"  - save_percentage_proxy: {df['save_percentage_proxy'].mean():.3f}")
        print(f"  - goals_prevented_per_90: {df['goals_prevented_per_90'].mean():.3f}")
    
    return df

print("Adding goalkeeper-specific features...")
gk_df = add_gk_features(gk_df)

Adding goalkeeper-specific features...

Goalkeeper features added:
  - gk_clean_sheet_pct: 0.031
  - save_percentage_proxy: 0.283
  - goals_prevented_per_90: -42.267


### 3. Defender-Specific Features

In [16]:
print("\n" + "="*60)
print("3Ô∏è‚É£ CREATING DEFENDER-SPECIFIC FEATURES")
print("="*60)


3Ô∏è‚É£ CREATING DEFENDER-SPECIFIC FEATURES


In [68]:
def add_defender_features(df):
    """Add defender-specific features including clean sheets"""
    
    df = df.copy()
    
    # Defensive contribution
    df['defensive_aggression'] = df['cards_per_90']
    
    # Offensive contribution
    df['defender_goals_per_90'] = df['goals_per_90']
    df['defender_assists_per_90'] = df['assists_per_90']
    df['defender_contributions_per_90'] = df['goal_contributions_per_90']
    
    # Clean sheet metrics for defenders
    if 'clean_sheets' in df.columns:
        # Clean sheet percentage
        df['defender_clean_sheet_pct'] = df['clean_sheet_pct']
        
        # Defensive contribution to clean sheets (lower cards = better contribution)
        max_cards = df['cards_per_90'].max()
        if max_cards > 0:
            df['defensive_contribution'] = df['defender_clean_sheet_pct'] * (1 - df['cards_per_90'] / max_cards)
        else:
            df['defensive_contribution'] = df['defender_clean_sheet_pct']
        
        # Defensive solidity (inverse of goals conceded)
        df['defensive_solidity'] = 1 / (df['goals_conceded_per_90'] + 0.01)
        # Clean sheets per 90
        df['defender_clean_sheets_per_90'] = df['clean_sheets_per_90']
        
        # Own goal impact (negative)
        df['own_goal_impact'] = -df['games_with_own_goals'] / (df['total_appearances'] + 0.01)
    
    print("\nDefender features added:")
    if 'clean_sheets' in df.columns:
        print(f"  - defender_clean_sheet_pct: {df['defender_clean_sheet_pct'].mean():.3f}")
        print(f"  - defensive_contribution: {df['defensive_contribution'].mean():.3f}")
        print(f"  - defensive_solidity: {df['defensive_solidity'].mean():.3f}")
        print(f"  - own_goal_impact: {df['own_goal_impact'].mean():.3f}")
        return df

print("Adding defender-specific features...")
def_df = add_defender_features(def_df)

Adding defender-specific features...

Defender features added:
  - defender_clean_sheet_pct: 0.048
  - defensive_contribution: 0.048
  - defensive_solidity: 0.423
  - own_goal_impact: -0.542


### 4. Midfielder-Specific Features

In [18]:
print("\n" + "="*60)
print("4Ô∏è‚É£ CREATING MIDFIELDER-SPECIFIC FEATURES")
print("="*60)


4Ô∏è‚É£ CREATING MIDFIELDER-SPECIFIC FEATURES


In [19]:
def add_midfielder_features(df):
    """Add midfielder-specific features"""
    
    # Creative contribution
    df['creative_output_per_90'] = df['assists_per_90']
    
    # Goal scoring from midfield
    df['midfielder_goals_per_90'] = df['goals_per_90']
    
    # Total contribution
    df['midfielder_contributions_per_90'] = df['goal_contributions_per_90']
    
    # Defensive work rate (cards as proxy for tackling)
    df['midfielder_defensive_work'] = df['cards_per_90']
        # Balance of attack vs defense
    df['attack_defense_ratio'] = (df['goals_per_90'] + df['assists_per_90']) / (df['cards_per_90'] + 0.01)
    
    print("\nMidfielder features added:")
    print("  - creative_output_per_90 (assists)")
    print("  - midfielder_goals_per_90")
    print("  - midfielder_contributions_per_90")
    print("  - midfielder_defensive_work (cards)")
    print("  - attack_defense_ratio")
    
    return df

mid_df = add_midfielder_features(mid_df)


Midfielder features added:
  - creative_output_per_90 (assists)
  - midfielder_goals_per_90
  - midfielder_contributions_per_90
  - midfielder_defensive_work (cards)
  - attack_defense_ratio


### 5. Attacker-Specific Features

In [20]:
print("\n" + "="*60)
print("5Ô∏è‚É£ CREATING ATTACKER-SPECIFIC FEATURES")
print("="*60)


5Ô∏è‚É£ CREATING ATTACKER-SPECIFIC FEATURES


In [21]:
def add_attacker_features(df):
    """Add attacker-specific features"""
    
    # Primary scoring metrics
    df['striker_goals_per_90'] = df['goals_per_90']
    df['striker_assists_per_90'] = df['assists_per_90']
    
    # Goal involvement
    df['goal_involvement_per_90'] = df['goal_contributions_per_90']
    
    # Efficiency (goals per appearance)
    df['goals_per_appearance'] = (df['goals'] / df['appearances_count']).replace([np.inf, -np.inf], 0).fillna(0)
        # Minutes per goal (for players who score)
    df['minutes_per_goal'] = (df['minutes_played'] / df['goals']).replace([np.inf, -np.inf], 9999).fillna(9999)
    
    # Contribution beyond scoring (assists relative to goals)
    df['assist_to_goal_ratio'] = (df['assists'] / (df['goals'] + 0.01)).fillna(0)
    
    print("\nAttacker features added:")
    print("  - striker_goals_per_90, striker_assists_per_90")
    print("  - goal_involvement_per_90")
    print("  - goals_per_appearance")
    print("  - minutes_per_goal")
    print("  - assist_to_goal_ratio")
    return df

att_df = add_attacker_features(att_df)


Attacker features added:
  - striker_goals_per_90, striker_assists_per_90
  - goal_involvement_per_90
  - goals_per_appearance
  - minutes_per_goal
  - assist_to_goal_ratio


### 6. Create Feature Sets for Modeling

In [22]:
print("\n" + "="*60)
print("6Ô∏è‚É£ CREATING FEATURE SETS FOR MODELING")
print("="*60)


6Ô∏è‚É£ CREATING FEATURE SETS FOR MODELING


In [69]:
def get_feature_columns(df, position):
    """Get the list of feature columns for each position"""
    
    # Base features common to all
    base_features = ['age', 'age_squared', 'total_minutes', 'total_appearances', 
                     'has_appearances', 'goals_per_90', 'assists_per_90', 
                     'goal_contributions_per_90', 'cards_per_90']
    
    # Clean sheet features (if they exist)
    clean_sheet_features = ['clean_sheets', 'clean_sheets_per_90', 'goals_conceded_per_90',
                            'clean_sheet_pct']
    
        # Position-specific features
    position_features = {
        'GK': ['gk_discipline_per_90', 'gk_offensive_contribution',
               'gk_clean_sheet_pct', 'save_percentage_proxy', 
               'clean_sheet_consistency', 'goals_prevented_per_90'],
        
        'DEF': ['defensive_aggression', 'defender_goals_per_90', 'defender_assists_per_90',
                'defender_contributions_per_90', 'defender_clean_sheet_pct',
                'defensive_contribution', 'defensive_solidity', 
                'defender_clean_sheets_per_90', 'own_goal_impact'],
        
        'MID': ['creative_output_per_90', 'midfielder_goals_per_90', 
                'midfielder_contributions_per_90', 'midfielder_defensive_work',
                'attack_defense_ratio'],
                'ATT': ['striker_goals_per_90', 'striker_assists_per_90', 'goal_involvement_per_90',
                'goals_per_appearance', 'minutes_per_goal', 'assist_to_goal_ratio']
    }
    
    # Combine all features for this position
    all_features = base_features + clean_sheet_features + position_features[position]
    
    # Filter to only features that exist in the dataframe
    available_features = [f for f in all_features if f in df.columns]
    
    return available_features

# Get feature lists for each position
gk_features = get_feature_columns(gk_df, 'GK')
def_features = get_feature_columns(def_df, 'DEF')
mid_features = get_feature_columns(mid_df, 'MID')
att_features = get_feature_columns(att_df, 'ATT')

print("\nüìä Feature sets created:")
print(f"  Goalkeepers: {len(gk_features)} features")
print(f"  Defenders: {len(def_features)} features")
print(f"  Midfielders: {len(mid_features)} features")
print(f"  Attackers: {len(att_features)} features")

print("\nüìã Goalkeeper features (with clean sheets):")
for i, feat in enumerate(gk_features[:15], 1):
    print(f"   {i}. {feat}")


üìä Feature sets created:
  Goalkeepers: 19 features
  Defenders: 22 features
  Midfielders: 18 features
  Attackers: 19 features

üìã Goalkeeper features (with clean sheets):
   1. age
   2. age_squared
   3. total_minutes
   4. total_appearances
   5. has_appearances
   6. goals_per_90
   7. assists_per_90
   8. goal_contributions_per_90
   9. cards_per_90
   10. clean_sheets
   11. clean_sheets_per_90
   12. goals_conceded_per_90
   13. clean_sheet_pct
   14. gk_discipline_per_90
   15. gk_offensive_contribution


### 7. Handle Missing Values and Outliers

In [70]:
print("\n" + "="*60)
print("7Ô∏è‚É£ HANDLING MISSING VALUES AND OUTLIERS")
print("="*60)


7Ô∏è‚É£ HANDLING MISSING VALUES AND OUTLIERS


In [72]:
def clean_features(df, position_name):
    """Clean feature columns: handle infinite values and outliers"""
    
    df = df.copy()
    
    # Get numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    # Replace infinite values with NaN
    df = df.replace([np.inf, -np.inf], np.nan)
    
    # Fill NaN with 0 for performance metrics
    perf_cols = [col for col in numeric_cols if any(x in col for x in 
                 ['per_90', 'goals', 'assists', 'cards', 'contributions', 'clean_sheet', 'pct'])]
    for col in perf_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    
    # Cap extreme outliers at 99th percentile
    for col in numeric_cols:
        if col in df.columns and col not in ['player_id', 'market_value_m', 'log_market_value', 'clean_sheets']:
            # Skip if all values are 0 or constant
            if df[col].nunique() > 1:
                upper_limit = df[col].quantile(0.99)
                lower_limit = df[col].quantile(0.01)
                df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    
    print(f"\n{position_name}: Cleaned features")
    print(f"  - Numeric columns: {len(numeric_cols)}")
    print(f"  - Performance columns: {len(perf_cols)}")
    
    return df

In [73]:
# Apply cleaning to all dataframes
print("Cleaning features for all position groups...")
gk_df = clean_features(gk_df, 'Goalkeepers')
def_df = clean_features(def_df, 'Defenders')
mid_df = clean_features(mid_df, 'Midfielders')
att_df = clean_features(att_df, 'Attackers')

Cleaning features for all position groups...

Goalkeepers: Cleaned features
  - Numeric columns: 35
  - Performance columns: 21

Defenders: Cleaned features
  - Numeric columns: 39
  - Performance columns: 23

Midfielders: Cleaned features
  - Numeric columns: 34
  - Performance columns: 20

Attackers: Cleaned features
  - Numeric columns: 35
  - Performance columns: 21


### 8. Save Feature-Engineered Files

In [74]:
print("\n" + "="*60)
print("8Ô∏è‚É£ SAVING FEATURE-ENGINEERED FILES")
print("="*60)


8Ô∏è‚É£ SAVING FEATURE-ENGINEERED FILES


In [75]:
# Save each position group with features
gk_path = os.path.join(outputs_path, 'featured_data_gk.csv')
def_path = os.path.join(outputs_path, 'featured_data_def.csv')
mid_path = os.path.join(outputs_path, 'featured_data_mid.csv')
att_path = os.path.join(outputs_path, 'featured_data_att.csv')

gk_df.to_csv(gk_path, index=False)
def_df.to_csv(def_path, index=False)
mid_df.to_csv(mid_path, index=False)
att_df.to_csv(att_path, index=False)

print(f"‚úÖ Saved goalkeeper features to: {gk_path}")
print(f"‚úÖ Saved defender features to: {def_path}")
print(f"‚úÖ Saved midfielder features to: {mid_path}")
print(f"‚úÖ Saved attacker features to: {att_path}")

# Also save feature lists for reference
feature_sets = {
    'GK': gk_features,
    'DEF': def_features,
    'MID': mid_features,
    'ATT': att_features
}

‚úÖ Saved goalkeeper features to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\featured_data_gk.csv
‚úÖ Saved defender features to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\featured_data_def.csv
‚úÖ Saved midfielder features to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\featured_data_mid.csv
‚úÖ Saved attacker features to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\featured_data_att.csv


In [77]:
import json
feature_sets_path = os.path.join(outputs_path, 'feature_sets.json')
with open(feature_sets_path, 'w') as f:
    json.dump(feature_sets, f, indent=2)

print(f"\n‚úÖ Saved feature sets to: {feature_sets_path}")


‚úÖ Saved feature sets to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\feature_sets.json
