# Match Stats Data Cleaning

Simple notebook to clean and join match statistics data.

## Steps:
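1. Load fixtures, match stats (main file plus the remaining-matches file), and the team mapping from `data/prod/raw` (the team mapping is loaded but not used by the cells below)
2. Combine the two match-stats files into one long table keyed by `full_match_report_url` (renamed from `match_id`)
3. Pivot the stats so each row holds one team's stats for one match
4. Build "favor" (own) and "against" (opponent) stat columns for each team
5. Join with fixtures on `full_match_report_url`, keep only rows whose team names agree, and keep only the essential columns
6. Save the master dataset to `data/prod/processed`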

In [1]:
import pandas as pd
import json
from pathlib import Path
from fuzzywuzzy import fuzz, process

# Resolve the project root as the directory three levels above the current
# working directory, then point at the folder holding the raw production data.
working_dir = Path().resolve()
project_root = working_dir.parent.parent.parent
data_prod_path = project_root.joinpath('data', 'prod', 'raw')

print(f"Loading data from: {data_prod_path}")

  from pandas.core import (


Loading data from: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\raw




## Step 1: Load Data

In [2]:
def _read_json(json_path):
    """Return the parsed contents of the UTF-8 encoded JSON file at `json_path`."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Both match-stats files live in the same sub-folder of the raw data directory.
match_stats_dir = data_prod_path / 'match_stats' / 'complete_names'

# Fixtures
fixtures_data = _read_json(data_prod_path / 'all_competitions_fixtures_dataframe.json')
fixtures_df = pd.DataFrame(fixtures_data)
print(f"Fixtures loaded: {fixtures_df.shape}")

# Match stats: main file
match_stats_data = _read_json(match_stats_dir / 'all_competitions_match_stats.json')
match_stats_df = pd.DataFrame(match_stats_data)
print(f"Match stats loaded: {match_stats_df.shape}")

# Match stats: remaining matches (kept in a separate frame for now)
match_stats_data = _read_json(match_stats_dir / 'all_competitions_remaining_matches.json')
match_stats_df_2 = pd.DataFrame(match_stats_data)
print(f"Match stats loaded: {match_stats_df_2.shape}")

# Team mapping: a dict keyed by team id whose values are dicts that may carry
# a 'team_name' entry; flatten it into one record per team.
team_mapping_data = _read_json(data_prod_path / 'all_teams.json')
team_records = [
    {'team_id': team_id, 'team_name': team_info.get('team_name', '')}
    for team_id, team_info in team_mapping_data.items()
]
team_mapping_df = pd.DataFrame(team_records)
print(f"Team mapping loaded: {team_mapping_df.shape}")

# The main match-stats frame identifies a match by `match_id`; expose it under
# the `full_match_report_url` name used as the join key in later cells.
match_stats_df = match_stats_df.rename(
    columns={'match_id': 'full_match_report_url'}
)

Fixtures loaded: (5751, 30)
Match stats loaded: (98004, 4)
Match stats loaded: (160, 4)
Team mapping loaded: (27, 2)


## Step 2: Combine Match Stats Files

In [3]:
# Give the "remaining matches" frame the same key column name as the main
# match-stats frame, which was already renamed when it was loaded.
match_stats_df_2 = match_stats_df_2.rename(
    columns={'match_id': 'full_match_report_url'}
)

# Stack both long-format frames into one.
# - ignore_index=True builds a fresh 0..n-1 index instead of repeating the
#   labels of both inputs.
# - drop_duplicates() removes exact duplicate rows. This keeps the cell safe
#   to re-run: without it, a second execution appends `match_stats_df_2`
#   again and the later pd.pivot call fails on the duplicated entries.
match_stats_df = pd.concat(
    [match_stats_df, match_stats_df_2],
    axis=0,
    ignore_index=True,
).drop_duplicates(ignore_index=True)

## Step 3: Match Stats Manipulation

In [4]:
# Reshape the long stats table (one row per match / team / stat) into a wide
# table with one row per (match, team) and one column per stat name, ordered
# by match and then by team name.
stats_match_df = (
    pd.pivot(
        match_stats_df,
        index=['full_match_report_url', 'team_name'],
        columns='stat_name',
        values='stat_value',
    )
    .reset_index()
    .sort_values(by=['full_match_report_url', 'team_name'])
)

# Number the teams inside each match following the sort order above:
# 1 = first team name alphabetically, 2 = second. The match URL is the only
# grouping key needed, and the frame is already sorted, so no re-sort is done.
stats_match_df['row_num'] = (
    stats_match_df.groupby('full_match_report_url').cumcount() + 1
)

# Split into the first and second team of every match; the helper column is
# dropped from the two resulting frames.
stats_1 = (
    stats_match_df[stats_match_df['row_num'] == 1]
    .drop(columns=['row_num'])
    .reset_index(drop=True)
)
stats_2 = (
    stats_match_df[stats_match_df['row_num'] == 2]
    .drop(columns=['row_num'])
    .reset_index(drop=True)
)

## Step 4: Create Stats in Favor and Against

In [5]:
def _suffix_stat_columns(stats_df, suffix):
    """Return a copy of `stats_df` with `suffix` appended to every stat column.

    The identifier columns ('full_match_report_url', 'team_name') are left
    untouched. 'row_num' is also excluded so the result is the same whether
    or not that helper column is still present.
    """
    id_cols = ('full_match_report_url', 'team_name', 'row_num')
    renames = {
        col: f'{col}{suffix}'
        for col in stats_df.columns
        if col not in id_cols
    }
    # One rename with a full mapping instead of one in-place rename per column.
    return stats_df.rename(columns=renames)


def _build_perspective(own_stats, opponent_stats):
    """Combine a team's own stats with its opponent's stats, one row per match.

    Columns from `own_stats` get the '_favor' suffix and columns from
    `opponent_stats` get the '_against' suffix. The opponent's 'team_name' is
    dropped so the result keeps only the name of the team whose perspective
    this is. Rows are matched on 'full_match_report_url' with an inner join,
    so a match missing from either input is not returned.
    """
    favor = _suffix_stat_columns(own_stats, '_favor')
    against = _suffix_stat_columns(opponent_stats, '_against').drop(
        columns=['team_name']
    )
    return favor.merge(against, on=['full_match_report_url'], how='inner')


# Perspective of the first team of each match (second team is the opponent).
perspective_1 = _build_perspective(stats_1, stats_2)

# Perspective of the second team of each match (first team is the opponent).
perspective_2 = _build_perspective(stats_2, stats_1)

## Step 5: Create Master Dataset with Stats

In [6]:
def _attach_fixtures(perspective_df):
    """Join a perspective frame with the fixtures and keep same-team rows.

    The inner merge on 'full_match_report_url' pairs each perspective row with
    every fixture row of that match. Both inputs carry a 'team_name' column,
    so pandas names them 'team_name_x' (perspective side) and 'team_name_y'
    (fixtures side); only rows where the two names are equal are returned.
    """
    merged = perspective_df.merge(
        fixtures_df,
        on=['full_match_report_url'],
        how='inner',
    )
    same_team = merged['team_name_x'] == merged['team_name_y']
    return merged[same_team]


master_1 = _attach_fixtures(perspective_1)
master_2 = _attach_fixtures(perspective_2)

In [7]:
# Stat names that get a '<stat>_favor' and a '<stat>_against' column in the
# master frames.
stat_names = [
    'Aerials Won',
    'Clearances',
    'Corners',
    'Crosses',
    'Fouls',
    'Goal Kicks',
    'Interceptions',
    'Long Balls',
    'Offsides',
    'Passing Accuracy',
    'Possession',
    'Saves',
    'Shots on Target',
    'Tackles',
    'Throw Ins',
    'Touches',
]

# Columns kept in the master dataset, in output order: match/team identifiers,
# all favor stats, all against stats, then the remaining fixture fields.
cols_select = [
    'date',
    'comp',
    'round',
    'season',
    'team_id',
    'full_match_report_url',
    'team_name_x',
    'opponent',
    *[f'{stat}_favor' for stat in stat_names],
    *[f'{stat}_against' for stat in stat_names],
    'venue',
    'result',
    'formation',
    'referee',
    'start_time',
    'dayofweek',
    'goals_for',
    'goals_against',
    'xg_for',
    'xg_against',
    'opp_formation',
]

# Select the columns and restore the plain 'team_name' column name. The rename
# is chained (not in-place) so it works on a fresh frame; an in-place rename
# on a column-sliced frame can trigger pandas' SettingWithCopyWarning.
team_name_rename = {'team_name_x': 'team_name'}
master_1 = master_1[cols_select].rename(columns=team_name_rename)
master_2 = master_2[cols_select].rename(columns=team_name_rename)

In [13]:
# Stack the two team perspectives into a single frame (row labels of both
# inputs are kept as they are).
perspective_frames = [master_1, master_2]
master_final_v1 = pd.concat(perspective_frames, axis=0)

# Parse the 'date' column into pandas datetimes.
master_final_v1 = master_final_v1.assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
# Order the rows by date, then by match, then by team id.
sort_keys = ['date', 'full_match_report_url', 'team_id']
master_final_v1 = master_final_v1.sort_values(by=sort_keys)

## Step 6: Save Master Dataset

In [21]:
# Destination folder for the processed dataset; created if it does not exist.
output_path = project_root.joinpath('data', 'prod', 'processed')
output_path.mkdir(parents=True, exist_ok=True)

print(f"Saving master dataset to: {output_path}")
print(f"Dataset shape: {master_final_v1.shape}")

# Every export shares the same base name and differs only in its extension.
filename_base = "match_stats_master_dataset"
json_file = output_path.joinpath(f"{filename_base}.json")
csv_file = output_path.joinpath(f"{filename_base}.csv")
parquet_file = output_path.joinpath(f"{filename_base}.parquet")
pickle_file = output_path.joinpath(f"{filename_base}.pkl")
excel_file = output_path.joinpath(f"{filename_base}.xlsx")

# JSON: list of records, ISO-formatted dates, indented by 2 spaces
master_final_v1.to_json(json_file, orient='records', date_format='iso', indent=2)
print(f"✓ Saved JSON: {json_file}")

# CSV: row index not written
master_final_v1.to_csv(csv_file, index=False)
print(f"✓ Saved CSV: {csv_file}")

# Parquet: row index not written
master_final_v1.to_parquet(parquet_file, index=False)
print(f"✓ Saved Parquet: {parquet_file}")

# Pickle: serializes the DataFrame object as-is
master_final_v1.to_pickle(pickle_file)
print(f"✓ Saved Pickle: {pickle_file}")

# Excel: single sheet named 'match_stats', row index not written
master_final_v1.to_excel(excel_file, index=False, sheet_name='match_stats')
print(f"✓ Saved Excel: {excel_file}")

# Final summary of what was written
print("\nAll files saved successfully!")
print(f"Total records: {len(master_final_v1):,}")
print(f"Date range: {master_final_v1['date'].min()} to {master_final_v1['date'].max()}")

Saving master dataset to: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed
Dataset shape: (5711, 51)
✓ Saved JSON: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\match_stats_master_dataset.json
✓ Saved CSV: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\match_stats_master_dataset.csv
✓ Saved Parquet: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\match_stats_master_dataset.parquet
✓ Saved Pickle: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\match_stats_master_dataset.pkl
✓ Saved Excel: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\match_stats_master_dataset.xlsx

All files saved successfully!
Total records: 5,711
Date range: 