# Notebook 2: Data Cleaning and Merging
 
## Transfermarkt Player Value Prediction
### Cleaning individual datasets and merging into master dataframe with position groups

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')

In [21]:
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("=" * 60)
print("DATA CLEANING AND MERGING")
print("=" * 60)

DATA CLEANING AND MERGING


In [22]:
# paths
data_path    = "C:\\Users\\Dimitris\\Desktop\\DAMA\\homeworks\\hw dama61\\hw4\\env_nn\\data\\"
outputs_path = "C:\\Users\\Dimitris\\Desktop\\DAMA\\homeworks\\hw dama61\\hw4\\env_nn\\outputs\\"

In [23]:
#Load datasets from input path

print("\nüìÇ Loading datasets from input path...")

files = {
    'appearances': 'appearances.csv',
    'players': 'players.csv',
    'player_valuations': 'player_valuations.csv',
    'transfers': 'transfers.csv',
    'games': 'games.csv',
    'clubs': 'clubs.csv',
    'competitions': 'competitions.csv'
    }

data = {}
for name, filename in files.items():
    file_path = os.path.join(data_path, filename)
    try:
        data[name] = pd.read_csv(file_path)
        print(f"  ‚úÖ Loaded {name}: {len(data[name]):,} rows from {file_path}")
    except FileNotFoundError:
        print(f"  ‚ùå File not found: {file_path}")
    except Exception as e:
        print(f"  ‚ùå Error loading {file_path}: {e}")


üìÇ Loading datasets from input path...
  ‚úÖ Loaded appearances: 1,722,865 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\appearances.csv
  ‚úÖ Loaded players: 34,291 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\players.csv
  ‚úÖ Loaded player_valuations: 448,965 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\player_valuations.csv
  ‚úÖ Loaded transfers: 85,293 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\transfers.csv
  ‚úÖ Loaded games: 77,995 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\games.csv
  ‚úÖ Loaded clubs: 451 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\clubs.csv
  ‚úÖ Loaded competitions: 44 rows from C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\data\competitions.csv


### 1. Clean Players Dataset

In [34]:
print("\n" + "="*60)
print("1Ô∏è‚É£ CLEANING PLAYERS DATASET")
print("="*60)

players = data['players'].copy()
print(f"Original players shape: {players.shape}")
print(f"Original columns: {players.columns.tolist()}")


1Ô∏è‚É£ CLEANING PLAYERS DATASET
Original players shape: (34291, 23)
Original columns: ['player_id', 'first_name', 'last_name', 'name', 'last_season', 'current_club_id', 'player_code', 'country_of_birth', 'city_of_birth', 'country_of_citizenship', 'date_of_birth', 'sub_position', 'position', 'foot', 'height_in_cm', 'contract_expiration_date', 'agent_name', 'image_url', 'url', 'current_club_domestic_competition_id', 'current_club_name', 'market_value_in_eur', 'highest_market_value_in_eur']


In [35]:
# Check for missing values
print("\nMissing values before cleaning:")
missing_before = players.isnull().sum()
missing_before = missing_before[missing_before > 0].sort_values(ascending=False)
if len(missing_before) > 0:
    display(missing_before)
else:
    print("No missing values!")


Missing values before cleaning:


agent_name                     16518
contract_expiration_date       12497
country_of_birth                2916
market_value_in_eur             2916
highest_market_value_in_eur     2916
foot                            2663
city_of_birth                   2589
height_in_cm                    2394
first_name                      2138
country_of_citizenship           355
sub_position                     189
date_of_birth                     49
dtype: int64

In [36]:
# Handle missing date_of_birth - drop if missing (essential for age calculation)
players = players.dropna(subset=['date_of_birth'])
print(f"\nAfter dropping missing DOB: {len(players):,} players")


After dropping missing DOB: 34,242 players


In [37]:
# Convert date_of_birth to datetime
players['date_of_birth'] = pd.to_datetime(players['date_of_birth'], errors='coerce')
players = players.dropna(subset=['date_of_birth'])  # Drop if conversion failed
print(f"After date conversion: {len(players):,} players")

After date conversion: 34,242 players


In [38]:
# Calculate age (as of 2025 for consistency)
year = 2025
players['age'] = year - players['date_of_birth'].dt.year
print(f"\nAge range: {players['age'].min()} to {players['age'].max()} years")


Age range: 15 to 57 years


In [39]:
# Handle missing position - drop if missing (essential for grouping)
players = players.dropna(subset=['position'])
print(f"After dropping missing position: {len(players):,} players")

After dropping missing position: 34,242 players


In [40]:
# Handle missing foot - fill with 'Unknown'
if 'foot' in players.columns:
    players['foot'] = players['foot'].fillna('Unknown')
    print("\nFoot distribution:")
    print(players['foot'].value_counts())


Foot distribution:
foot
right      22223
left        7936
Unknown     2648
both        1435
Name: count, dtype: int64


In [41]:
# After loading the players dataset (as you did in Notebook 2)
# Display all unique position values
unique_positions = data['players']['position'].unique()
print("Unique positions in the dataset:")
for pos in sorted(unique_positions):
    print(f"  - '{pos}'")

# Also show the count for each position to understand the distribution
print("\nPosition distribution:")
print(data['players']['position'].value_counts())

Unique positions in the dataset:
  - 'Attack'
  - 'Defender'
  - 'Goalkeeper'
  - 'Midfield'
  - 'Missing'

Position distribution:
position
Defender      10893
Midfield       9903
Attack         9400
Goalkeeper     3906
Missing         189
Name: count, dtype: int64


In [42]:
print("\n" + "="*60)
print("üéØ MAPPING POSITIONS TO GROUPS")
print("="*60)

# Simple direct mapping based on actual data
position_mapping = {
    'Goalkeeper': 'GK',
    'Defender': 'DEF',
    'Midfield': 'MID',
    'Attack': 'ATT'
    # 'Missing' will be handled separately
}


üéØ MAPPING POSITIONS TO GROUPS


In [43]:
# Map positions to groups
players['position_group'] = players['position'].map(position_mapping)

# Check the results
print("\nPosition group distribution:")
print(players['position_group'].value_counts(dropna=False))


Position group distribution:
position_group
DEF    10883
MID     9888
ATT     9390
GK      3897
NaN      184
Name: count, dtype: int64


In [50]:
# Keep only essential columns
keep_cols = ['player_id', 'name', 'position', 'position_group', 'age']
if 'foot' in players.columns:
    keep_cols.append('foot')
if 'height' in players.columns:
    keep_cols.append('height')

In [51]:
players_clean = players[keep_cols].copy()
print(f"\nCleaned players shape: {players_clean.shape}")


Cleaned players shape: (34242, 6)


In [52]:
# Save cleaned players to output path
players_path = os.path.join(outputs_path, 'players_cleaned.csv')
players_clean.to_csv(players_path, index=False)
print(f"\n‚úÖ Saved cleaned players to: {players_path}")


‚úÖ Saved cleaned players to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\players_cleaned.csv


### 2. Clean Appearances Dataset

In [59]:
print("\n" + "="*60)
print("2Ô∏è‚É£ CLEANING APPEARANCES DATASET")
print("="*60)

appearances = data['appearances'].copy()
print(f"Original appearances shape: {appearances.shape}")
print(f"Columns: {appearances.columns.tolist()}")

# Aggregate appearances per player (sum all seasons - no date filtering)
print("\nAggregating performance stats per player...")


2Ô∏è‚É£ CLEANING APPEARANCES DATASET


Original appearances shape: (1722865, 13)
Columns: ['appearance_id', 'game_id', 'player_id', 'player_club_id', 'player_current_club_id', 'date', 'player_name', 'competition_id', 'yellow_cards', 'red_cards', 'goals', 'assists', 'minutes_played']

Aggregating performance stats per player...


In [60]:
# Define which columns to sum
columns_to_sum = ['goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards']
existing_sum_cols = [col for col in columns_to_sum if col in appearances.columns]
print(f"Found these columns to aggregate: {existing_sum_cols}")

Found these columns to aggregate: ['goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards']


In [61]:
# Method 1: Simple aggregation with a list (most reliable)
agg_results = appearances.groupby('player_id')[existing_sum_cols].sum().reset_index()

In [62]:
# Add count of appearances separately
appearance_counts = appearances.groupby('player_id').size().reset_index(name='appearances_count')

# Merge the two results
appearances_agg = agg_results.merge(appearance_counts, on='player_id', how='outer')

In [63]:
print(f"Players with appearances: {len(appearances_agg):,}")
print(f"Aggregated columns: {appearances_agg.columns.tolist()}")

if len(existing_sum_cols) > 0:
    print("\nPerformance stats summary:")
    display(appearances_agg[existing_sum_cols + ['appearances_count']].describe())

Players with appearances: 26,489
Aggregated columns: ['player_id', 'goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards', 'appearances_count']

Performance stats summary:


Unnamed: 0,goals,assists,minutes_played,yellow_cards,red_cards,appearances_count
count,26489.0,26489.0,26489.0,26489.0,26489.0,26489.0
mean,6.240515,4.912907,4487.508513,9.551587,0.247688,65.040772
std,16.729338,11.132572,6325.709273,14.664474,0.617068,84.400981
min,0.0,0.0,1.0,0.0,0.0,1.0
25%,0.0,0.0,330.0,1.0,0.0,8.0
50%,1.0,1.0,1802.0,4.0,0.0,30.0
75%,5.0,5.0,6111.0,12.0,0.0,91.0
max,480.0,224.0,49350.0,147.0,7.0,610.0


In [64]:
# Save aggregated appearances
appearances_path = os.path.join(outputs_path, 'appearances_aggregated.csv')
appearances_agg.to_csv(appearances_path, index=False)
print(f"\n‚úÖ Saved aggregated appearances to: {appearances_path}")


‚úÖ Saved aggregated appearances to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\appearances_aggregated.csv


### 3. Clean Player Valuations Dataset

In [65]:
print("\n" + "="*60)
print("3Ô∏è‚É£ CLEANING PLAYER VALUATIONS")
print("="*60)

valuations = data['player_valuations'].copy()
print(f"Original valuations shape: {valuations.shape}")


3Ô∏è‚É£ CLEANING PLAYER VALUATIONS
Original valuations shape: (448965, 6)


In [66]:
# Convert date
if 'date' in valuations.columns:
    valuations['date'] = pd.to_datetime(valuations['date'], errors='coerce')
    valuations = valuations.dropna(subset=['date'])
    print(f"Date range: {valuations['date'].min()} to {valuations['date'].max()}")

Date range: 2000-01-20 00:00:00 to 2026-02-20 00:00:00


In [67]:
# Get latest valuation for each player (most recent)
latest_valuations = valuations.sort_values('date').groupby('player_id').last().reset_index()
print(f"Players with latest valuation: {len(latest_valuations):,}")

Players with latest valuation: 31,375


In [80]:
# Find the correct market value column
value_col = None
for col in ['market_value_in_eur', 'market_value', 'value']:
    if col in latest_valuations.columns:
        value_col = col
        break

if value_col:
    latest_valuations = latest_valuations.rename(columns={value_col: 'market_value'})
    
    # Convert to millions for easier handling
    latest_valuations['market_value_m'] = latest_valuations['market_value'] / 1_000_000
    
    print("\nMarket value distribution (‚Ç¨ millions):")
    display(latest_valuations['market_value_m'].describe())
    
    # Don't filter out low values - keep all players
    print(f"\nTotal players with valuations: {len(latest_valuations):,}")
else:
    print("‚ùå Could not find market value column!")
    print(f"Available columns: {latest_valuations.columns.tolist()}")
    # Create empty columns as placeholder
    latest_valuations['market_value'] = 0
    latest_valuations['market_value_m'] = 0

# Keep essential columns
keep_val_cols = ['player_id', 'date', 'market_value', 'market_value_m']
valuations_clean = latest_valuations[keep_val_cols].copy()
valuations_path = os.path.join(outputs_path, 'valuations_clean.csv')
valuations_clean.to_csv(valuations_path, index=False)
print(f"\n‚úÖ Saved cleaned valuations to: {valuations_path}")





Market value distribution (‚Ç¨ millions):


count    31375.000000
mean         1.660953
std          6.460389
min          0.010000
25%          0.100000
50%          0.250000
75%          0.700000
max        200.000000
Name: market_value_m, dtype: float64


Total players with valuations: 31,375

‚úÖ Saved cleaned valuations to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\valuations_clean.csv


### 4. Merge Core Datasets (USING LEFT JOINS)

In [81]:
print("\n" + "="*60)
print("4Ô∏è‚É£ MERGING CORE DATASETS (LEFT JOINS)")
print("="*60)

# Load cleaned files from output path
players_clean = pd.read_csv(os.path.join(outputs_path, 'players_cleaned.csv'))
appearances_agg = pd.read_csv(os.path.join(outputs_path, 'appearances_aggregated.csv'))
valuations_clean = pd.read_csv(os.path.join(outputs_path, 'valuations_clean.csv'))


4Ô∏è‚É£ MERGING CORE DATASETS (LEFT JOINS)


In [82]:
print(f"Players: {len(players_clean):,}")
print(f"Appearances: {len(appearances_agg):,}")
print(f"Valuations: {len(valuations_clean):,}")

Players: 34,242
Appearances: 26,489
Valuations: 31,375


In [83]:
# Create df using LEFT JOIN to keep ALL players
df = players_clean.merge(appearances_agg, on='player_id', how='left')
print(f"\nAfter merge with appearances (left join): {len(df):,} players")

df = df.merge(valuations_clean, on='player_id', how='left')
print(f"After merge with valuations (left join): {len(df):,} players")


After merge with appearances (left join): 34,242 players
After merge with valuations (left join): 34,242 players


In [84]:
# Fill missing values with 0 for performance stats
fill_cols = ['appearances_count', 'goals', 'assists', 'minutes_played', 
             'yellow_cards', 'red_cards', 'market_value', 'market_value_m']
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

print(f"\nFinal merged dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")


Final merged dataset shape: (34242, 15)
Columns: ['player_id', 'name', 'position', 'position_group', 'age', 'foot', 'goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards', 'appearances_count', 'date', 'market_value', 'market_value_m']


In [85]:
# Check position distribution in final df
print("\nPosition distribution in final merged dataset:")
print(df['position_group'].value_counts())


Position distribution in final merged dataset:
position_group
DEF    10883
MID     9888
ATT     9390
GK      3897
Name: count, dtype: int64


In [86]:
# Save merged complete dataset
merged_complete_path = os.path.join(outputs_path, 'merged_data_complete.csv')
df.to_csv(merged_complete_path, index=False)
print(f"\n‚úÖ Saved complete merged dataset to: {merged_complete_path}")



‚úÖ Saved complete merged dataset to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\merged_data_complete.csv


### 5. Split by Position Group

In [87]:
print("\n" + "="*60)
print("5Ô∏è‚É£ SPLITTING BY POSITION GROUP")
print("="*60)

# NOW df is defined and we can split it
gk_df = df[df['position_group'] == 'GK'].copy()
def_df = df[df['position_group'] == 'DEF'].copy()
mid_df = df[df['position_group'] == 'MID'].copy()
att_df = df[df['position_group'] == 'ATT'].copy()


5Ô∏è‚É£ SPLITTING BY POSITION GROUP


In [88]:
print(f"Goalkeepers: {len(gk_df):,} players")
print(f"Defenders: {len(def_df):,} players")
print(f"Midfielders: {len(mid_df):,} players")
print(f"Attackers: {len(att_df):,} players")

Goalkeepers: 3,897 players
Defenders: 10,883 players
Midfielders: 9,888 players
Attackers: 9,390 players


In [89]:
# Quick stats by position group (only for players with market value > 0)
print("\nAverage market value by position (‚Ç¨ millions):")
for name, group_df in [('GK', gk_df), ('DEF', def_df), ('MID', mid_df), ('ATT', att_df)]:
    players_with_value = group_df[group_df['market_value_m'] > 0]
    if len(players_with_value) > 0:
        avg_value = players_with_value['market_value_m'].mean()
        print(f"  {name}: ‚Ç¨{avg_value:.2f}M (based on {len(players_with_value)} players with values)")
    else:
        print(f"  {name}: No players with market values")


Average market value by position (‚Ç¨ millions):
  GK: ‚Ç¨0.77M (based on 3464 players with values)
  DEF: ‚Ç¨1.54M (based on 10036 players with values)
  MID: ‚Ç¨1.83M (based on 9098 players with values)
  ATT: ‚Ç¨2.00M (based on 8644 players with values)


In [90]:
# Save each position group to output path
gk_path = os.path.join(outputs_path, 'merged_data_gk.csv')
def_path = os.path.join(outputs_path, 'merged_data_def.csv')
mid_path = os.path.join(outputs_path, 'merged_data_mid.csv')
att_path = os.path.join(outputs_path, 'merged_data_att.csv')

gk_df.to_csv(gk_path, index=False)
def_df.to_csv(def_path, index=False)
mid_df.to_csv(mid_path, index=False)
att_df.to_csv(att_path, index=False)

print("\n‚úÖ Saved position-specific files:")
print(f"  - {gk_path}")
print(f"  - {def_path}")
print(f"  - {mid_path}")
print(f"  - {att_path}")


‚úÖ Saved position-specific files:
  - C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\merged_data_gk.csv
  - C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\merged_data_def.csv
  - C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\merged_data_mid.csv
  - C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\merged_data_att.csv


### 6. Summary Statistics

In [91]:
print("\n" + "="*60)
print("6Ô∏è‚É£ SUMMARY STATISTICS")
print("="*60)

summary_stats = []
for name, group_df in [('GK', gk_df), ('DEF', def_df), ('MID', mid_df), ('ATT', att_df)]:
    players_with_value = group_df[group_df['market_value_m'] > 0]
    stats = {
        'Position': name,
        'Total Players': len(group_df),
        'Players with Value': len(players_with_value),
        'Avg Age': group_df['age'].mean(),
        'Avg Market Value (M‚Ç¨)': players_with_value['market_value_m'].mean() if len(players_with_value) > 0 else 0,
        'Median Market Value (M‚Ç¨)': players_with_value['market_value_m'].median() if len(players_with_value) > 0 else 0,
        'Avg Goals': group_df['goals'].mean(),
        'Avg Assists': group_df['assists'].mean(),
        'Avg Appearances': group_df['appearances_count'].mean()
    }
    summary_stats.append(stats)

summary_df = pd.DataFrame(summary_stats)
display(summary_df.round(2))


6Ô∏è‚É£ SUMMARY STATISTICS


Unnamed: 0,Position,Total Players,Players with Value,Avg Age,Avg Market Value (M‚Ç¨),Median Market Value (M‚Ç¨),Avg Goals,Avg Assists,Avg Appearances
0,GK,3897,3464,30.14,0.77,0.15,0.0,0.08,31.22
1,DEF,10883,10036,30.17,1.54,0.25,2.03,2.69,53.18
2,MID,9888,9098,29.78,1.83,0.25,4.54,4.76,54.23
3,ATT,9390,8644,29.59,2.0,0.3,10.43,5.68,51.52


In [92]:
# Save summary
summary_path = os.path.join(outputs_path, 'position_group_summary.csv')
summary_df.to_csv(summary_path, index=False)
print(f"\n‚úÖ Summary saved to: {summary_path}")


‚úÖ Summary saved to: C:\Users\Dimitris\Desktop\DAMA\homeworks\hw dama61\hw4\env_nn\outputs\position_group_summary.csv
