# Master Dataset Creation v1

This notebook creates a comprehensive master dataset by combining:
- Match statistics data
- Wage data
- Fixtures data
- Team information

## Objectives:
- Load all available datasets
- Explore data relationships and join keys
- Create a unified master dataset
- Save the master dataset for modeling

In [1]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Resolve the project layout relative to the current working directory:
# the project root is three levels above the directory the kernel runs in.
project_root = Path().resolve().parent.parent.parent
data_prod_raw = project_root.joinpath('data', 'prod', 'raw')
data_prod_processed = project_root.joinpath('data', 'prod', 'processed')

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"Project root: {project_root}\n"
    f"Raw data path: {data_prod_raw}\n"
    f"Processed data path: {data_prod_processed}"
)

  from pandas.core import (


Project root: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football
Raw data path: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\raw
Processed data path: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed


## Data Loading

Load all available datasets from the production environment.

In [2]:
# Helper function to load JSON data
def load_json_data(file_path):
    """Read a UTF-8 JSON file and return the decoded object.

    Best-effort loader: any exception raised while opening or parsing the
    file is reported on stdout and ``None`` is returned instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or ``None`` when loading failed.
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        print(f"Error loading {file_path}: {err}")
    return parsed

def load_processed_data(filename):
    """Load a processed dataset from ``data_prod_processed``.

    The formats are probed in order of preference (parquet, pickle, CSV,
    JSON). The first file that exists and loads without error is returned;
    a file that exists but fails to load is reported and the next format
    is tried.

    Args:
        filename: Dataset name without extension. ``with_suffix`` swaps any
            existing extension on ``filename`` for the one being probed.

    Returns:
        The object returned by the matching pandas reader, or ``None`` when
        no format could be loaded.
    """
    # Insertion order of this table is the probing order.
    # NOTE: unpickling executes arbitrary code; only keep trusted .pkl files
    # in the processed-data directory.
    readers = {
        '.parquet': pd.read_parquet,
        '.pkl': pd.read_pickle,
        '.csv': pd.read_csv,
        '.json': pd.read_json,
    }
    stem_path = data_prod_processed / filename

    for suffix, reader in readers.items():
        candidate = stem_path.with_suffix(suffix)
        if not candidate.exists():
            continue
        try:
            return reader(candidate)
        except Exception as err:
            # Fall through to the next format instead of aborting.
            print(f"Error loading {candidate}: {err}")

    print(f"Could not load {filename} in any format")
    return None

# Banner that precedes the per-dataset loading messages.
print("Loading datasets...", "=" * 50, sep="\n")

Loading datasets...


In [3]:
# 1. Load Match Statistics Data
print("1. Loading match statistics data...")
match_stats_df = load_processed_data('match_stats_master_dataset')
if match_stats_df is None:
    print("   ✗ Failed to load match statistics")
else:
    # Preview only the first five column names to keep the output short.
    print(
        f"   ✓ Match stats loaded: {match_stats_df.shape}\n"
        f"   Columns: {list(match_stats_df.columns)[:5]}..."
    )

1. Loading match statistics data...
   ✓ Match stats loaded: (5711, 51)
   Columns: ['date', 'comp', 'round', 'season', 'team_id']...


In [4]:
# 2. Load Wage Data
print("2. Loading wage data...")
wages_df = load_processed_data('premier_league_wages_processed')
if wages_df is None:
    print("   ✗ Failed to load wage data")
else:
    print(f"   ✓ Wages loaded: {wages_df.shape}")
    # Preview only the first five column names to keep the output short.
    print(f"   Columns: {list(wages_df.columns)[:5]}...")
    # The "date range" shown here is the set of distinct season labels.
    print(f"   Date range: {wages_df['season'].unique()}")

2. Loading wage data...
   ✓ Wages loaded: (3341, 15)
   Columns: ['team_name', 'season', 'player_name', 'age', 'annual_wages']...
   Date range: ['2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024' '2024-2025']


In [5]:
# 3. Load Team Information
# NOTE(review): the progress message is numbered "4." although this is the third
# load step in the notebook; the runtime string is intentionally left untouched.
print("4. Loading team information...")
teams_file = data_prod_raw / 'all_teams.json'

# Start from "not loaded" so every failure path leaves teams_df defined as None.
teams_df = None
if not teams_file.exists():
    print("   ✗ No team data found")
else:
    teams_data = load_json_data(teams_file)
    if teams_data:
        teams_df = pd.DataFrame(teams_data)
        print(f"   ✓ Teams loaded: {teams_df.shape}")
        if len(teams_df.columns) > 0:
            print(f"   Columns: {list(teams_df.columns)[:5]}...")
    else:
        # The file exists but was unreadable or empty.
        print("   ✗ Failed to load team data")

# Transpose so the frame can later be indexed by the 'team_name' / 'team_id'
# columns. Guarded: calling `.T` on None (either failure path above) used to
# raise AttributeError right after the failure message had been printed.
if teams_df is not None:
    teams_df = teams_df.T

4. Loading team information...
   ✓ Teams loaded: (4, 27)
   Columns: ['18bb7c10', '8602292d', '4ba7cbea', 'd07537b9', '943e8050']...


## Aggregations on Wages

Create per-team, per-season aggregations of the player wage data.

In [6]:
# Cast ages to integers before aggregating.
wages_df['age'] = wages_df['age'].astype(int)

# One row per (team_id, season, team_name) with squad age and wage summaries.
# squad_size is obtained by counting the 'age' entries of each group.
wages_resume = (
    wages_df
    .groupby(['team_id', 'season', 'team_name'])
    .agg(
        age_mean=pd.NamedAgg(column='age', aggfunc='mean'),
        squad_size=pd.NamedAgg(column='age', aggfunc='count'),
        age_max=pd.NamedAgg(column='age', aggfunc='max'),
        age_min=pd.NamedAgg(column='age', aggfunc='min'),
        avg_wage_dollars=pd.NamedAgg(column='annual_wages_dollars', aggfunc='mean'),
        total_wage_bill_dollars=pd.NamedAgg(column='annual_wages_dollars', aggfunc='sum'),
        max_wage_dollars=pd.NamedAgg(column='annual_wages_dollars', aggfunc='max'),
        min_wage_dollars=pd.NamedAgg(column='annual_wages_dollars', aggfunc='min'),
    )
    .reset_index()
)

## Joining Matchstats to wages df

In [7]:
# Step 1: look up each opponent's team_id by matching the opponent name against
# teams_df. Both sides carry team_name/team_id, so pandas suffixes the clashing
# columns with _x (match side) and _y (teams side); the redundant team_name_y
# is dropped and the rest are renamed back to meaningful names.
# Step 2: attach the team's own wage/age summary for the same season.
match_wages_1 = (
    pd.merge(
        match_stats_df,
        teams_df[['team_name', 'team_id']],
        how='left',
        left_on='opponent',
        right_on='team_name',
    )
    .drop(columns='team_name_y')
    .rename(
        columns={
            'team_id_x': 'team_id',
            'team_id_y': 'opponent_id',
            'team_name_x': 'team_name',
        }
    )
    .merge(
        wages_resume,
        how='left',
        on=['team_id', 'season', 'team_name'],
    )
)

# Opponent view of the wage summary: the join keys are renamed to the opponent
# columns and every metric gets an "opp_" prefix ('season' keeps its name).
wages_resume_opp = wages_resume.rename(
    columns={
        'team_id': 'opponent_id',
        'team_name': 'opponent',
        **{
            metric: f'opp_{metric}'
            for metric in (
                'age_mean',
                'squad_size',
                'age_max',
                'age_min',
                'avg_wage_dollars',
                'total_wage_bill_dollars',
                'max_wage_dollars',
                'min_wage_dollars',
            )
        },
    }
)

# Step 3: attach the opponent's wage/age summary for the same season.
match_stats_complete = pd.merge(
    match_wages_1,
    wages_resume_opp,
    how='left',
    on=['opponent_id', 'season', 'opponent'],
)

## Data Cleaning Part I

In [8]:
# Keep the original 'result' column and derive league points from it
# (W=3, D=1, L=0); any other value maps to NaN.
match_stats_complete['points'] = match_stats_complete['result'].map({
    'W': 3,
    'D': 1,
    'L': 0
})

# Parse the dates BEFORE sorting: sorting an unparsed string column orders the
# rows lexically, which is only chronological for ISO-formatted dates and would
# otherwise corrupt the "previous match" lookup below.
match_stats_complete['date'] = pd.to_datetime(match_stats_complete['date'])

# Sort by team and date so each team's matches are in chronological order.
match_stats_complete = match_stats_complete.sort_values(['team_id', 'date'])

# Rest days = whole days since the same team's previous match (NaN for a team's
# first row). The previous date is computed on the fly, so no helper column has
# to be created and dropped afterwards.
# NOTE(review): grouping is by team_id only, so the first match of a season is
# measured against the last match of the previous season — confirm intended.
match_stats_complete['rest_days'] = (
    match_stats_complete['date']
    - match_stats_complete.groupby('team_id')['date'].shift(1)
).dt.days

## Data Cleaning Part II

In [9]:
# Columns whose raw values carry a '%' sign and must become floats.
cols_fix = [
    'Passing Accuracy_favor',
    'Possession_favor',
    'Saves_favor',
    'Shots on Target_favor',
    'Passing Accuracy_against',
    'Possession_against',
    'Saves_against',
    'Shots on Target_against'
]

# Strip the percent sign and cast to float, all columns at once.
# Columns that are already numeric (for example when this cell is executed a
# second time) are only cast: the `.str` accessor raises on non-string data,
# which previously made the cell fail on re-run.
match_stats_complete[cols_fix] = match_stats_complete[cols_fix].apply(
    lambda col: (
        col.astype(float)
        if pd.api.types.is_numeric_dtype(col)
        else col.str.replace('%', '', regex=False).astype(float)
    )
)

## Feature Generation Part I

In [10]:
# Stats that get rolling "form" features: every base stat from both the team's
# ("favor") and the opposing ("against") side, plus points and rest days.
cols_rolling = [
    f'{stat}_{side}'
    for side in ('favor', 'against')
    for stat in (
        'Aerials Won',
        'Clearances',
        'Corners',
        'Crosses',
        'Fouls',
        'Goal Kicks',
        'Interceptions',
        'Long Balls',
        'Offsides',
        'Passing Accuracy',
        'Possession',
        'Saves',
        'Shots on Target',
        'Tackles',
        'Throw Ins',
        'Touches',
    )
] + ['points', 'rest_days']

match_stats_complete[cols_rolling] = match_stats_complete[cols_rolling].astype(float)

# Sorting and grouping do not depend on the stat being processed, so they are
# done once here instead of on every loop iteration (the frame also grows by
# two columns per iteration, which made each repeated sort more expensive).
df_sorted = match_stats_complete.sort_values(['team_id', 'date'])
by_team = df_sorted.groupby('team_id')

for stat in cols_rolling:
    # shift(1) removes the current match from its own window, so each feature
    # only uses the team's PREVIOUS games (up to 5, at least 1).
    # The result keeps df_sorted's index, so assignment aligns rows by label.
    match_stats_complete[f'{stat}_form_avg'] = by_team[stat].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).mean()
    )
    match_stats_complete[f'{stat}_form_sum'] = by_team[stat].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).sum()
    )

## Feature Generation Part II: Creating aggregations from the opponent's perspective

In [11]:
# Columns copied into the opponent-side frame: join keys, rest days, every raw
# stat from both sides ("favor" first, then "against") and the points earned.
cols_to_keep = [
    'date',
    'team_id',
    'season',
    'rest_days',
    *[
        f'{stat}_{side}'
        for side in ('favor', 'against')
        for stat in (
            'Aerials Won',
            'Clearances',
            'Corners',
            'Crosses',
            'Fouls',
            'Goal Kicks',
            'Interceptions',
            'Long Balls',
            'Offsides',
            'Passing Accuracy',
            'Possession',
            'Saves',
            'Shots on Target',
            'Tackles',
            'Throw Ins',
            'Touches',
        )
    ],
    'points',
]

# Re-label the selection from the opponent's point of view: team_id becomes
# opponent_id and every non-key column gets an "_opponent" suffix, while
# 'date' and 'season' keep their names. rename() returns a new frame, so
# match_stats_complete itself is not modified.
additional_against = match_stats_complete[cols_to_keep].rename(
    columns={
        'team_id': 'opponent_id',
        **{
            col: f'{col}_opponent'
            for col in cols_to_keep
            if col not in ('date', 'team_id', 'season')
        },
    }
)

# Opponent-side columns that get rolling "form" features.
# NOTE: this rebinds `cols_rolling`; from here on the name refers to the
# "_opponent" column list.
cols_rolling = (
    ['rest_days_opponent']
    + [
        f'{stat}_{side}_opponent'
        for side in ('favor', 'against')
        for stat in (
            'Aerials Won',
            'Clearances',
            'Corners',
            'Crosses',
            'Fouls',
            'Goal Kicks',
            'Interceptions',
            'Long Balls',
            'Offsides',
            'Passing Accuracy',
            'Possession',
            'Saves',
            'Shots on Target',
            'Tackles',
            'Throw Ins',
            'Touches',
        )
    ]
    + ['points_opponent']
)

# Sorting and grouping do not depend on the stat being processed, so they are
# done once here instead of on every loop iteration (the frame also grows by
# two columns per iteration, which made each repeated sort more expensive).
df_sorted = additional_against.sort_values(['opponent_id', 'date'])
by_opponent = df_sorted.groupby('opponent_id')

for stat in cols_rolling:
    # shift(1) removes the current match from its own window, so each feature
    # only uses that team's PREVIOUS games (up to 5, at least 1).
    # The result keeps df_sorted's index, so assignment aligns rows by label.
    additional_against[f'{stat}_form_avg'] = by_opponent[stat].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).mean()
    )
    additional_against[f'{stat}_form_sum'] = by_opponent[stat].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).sum()
    )

# Attach the opponent-side stats and form features to every match row by
# matching on the match date, the season and the opponent's id. A left join
# keeps match rows that have no opponent-side counterpart.
match_stats_complete = pd.merge(
    match_stats_complete,
    additional_against,
    how='left',
    on=['date', 'season', 'opponent_id'],
)

## Column selection to avoid Data Leakage

In [12]:
# Identifier columns of a match row.
id_cols = [
    'date', 'comp', 'round', 'season', 'team_id',
    'full_match_report_url', 'team_name', 'opponent', 'opponent_id',
]

# Match context columns.
match_cols = ['referee', 'start_time', 'dayofweek']

# Outcome columns.
target_cols = ['result', 'goals_for', 'goals_against']

# Raw same-match stats of team A: every base stat from the "favor" side, then
# from the "against" side, then the points earned.
columnas_team_A_validacion = [
    f'{stat}_{side}'
    for side in ('favor', 'against')
    for stat in (
        'Aerials Won',
        'Clearances',
        'Corners',
        'Crosses',
        'Fouls',
        'Goal Kicks',
        'Interceptions',
        'Long Balls',
        'Offsides',
        'Passing Accuracy',
        'Possession',
        'Saves',
        'Shots on Target',
        'Tackles',
        'Throw Ins',
        'Touches',
    )
] + ['points']

# Team A features: venue, the avg/sum form pair of every raw column above,
# then rest days and its form pair.
stats_team_A = (
    ['venue']
    + [
        f'{col}_form_{agg}'
        for col in columnas_team_A_validacion
        for agg in ('avg', 'sum')
    ]
    + ['rest_days', 'rest_days_form_avg', 'rest_days_form_sum']
)

# Squad age / wage summary of team A.
players_team_A = [
    'age_mean',
    'squad_size',
    'age_max',
    'age_min',
    'avg_wage_dollars',
    'total_wage_bill_dollars',
    'max_wage_dollars',
    'min_wage_dollars',
]

# Team B (opponent) mirrors team A: raw columns carry an "_opponent" suffix.
columnas_team_B_validacion = [
    f'{col}_opponent' for col in columnas_team_A_validacion
]

# Team B features: the avg/sum form pair of every raw opponent column, then
# the opponent's rest days and its form pair (there is no venue entry here).
stats_team_B = (
    [
        f'{col}_form_{agg}'
        for col in columnas_team_B_validacion
        for agg in ('avg', 'sum')
    ]
    + [
        'rest_days_opponent',
        'rest_days_opponent_form_avg',
        'rest_days_opponent_form_sum',
    ]
)

# Squad age / wage summary of team B uses an "opp_" prefix.
players_team_B = [f'opp_{col}' for col in players_team_A]

# Final column order of the master dataset.
total_cols = (
    id_cols
    + match_cols
    + target_cols
    + columnas_team_A_validacion
    + stats_team_A
    + players_team_A
    + columnas_team_B_validacion
    + stats_team_B
    + players_team_B
)

# Keep only the selected columns, in the order defined by total_cols
# (a missing column raises KeyError).
match_stats_complete = match_stats_complete.loc[:, total_cols]

## Save Master Dataset

Save the complete master dataset in multiple formats for different use cases.

In [13]:
# Make sure the output folder for master datasets exists
# (only the last path component is created).
masters_dir = data_prod_processed / 'masters'
masters_dir.mkdir(exist_ok=True)

# Summary of what is about to be written.
print(f"Masters directory: {masters_dir}")
print(f"Dataset shape: {match_stats_complete.shape}")
print("Dataset info:")
print(f"  - Rows: {match_stats_complete.shape[0]:,}")
print(f"  - Columns: {match_stats_complete.shape[1]:,}")
print(f"  - Memory usage: {match_stats_complete.memory_usage(deep=True).sum() / (1024 * 1024):.1f} MB")

# File name (without extension) shared by every exported format.
base_filename = "match_stats_master_complete_v1"

print(f"\nSaving {base_filename} in multiple formats...")
print("=" * 50)

Masters directory: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\masters
Dataset shape: (5711, 236)
Dataset info:
  - Rows: 5,711
  - Columns: 236
  - Memory usage: 15.3 MB

Saving match_stats_master_complete_v1 in multiple formats...


In [14]:
# Target file for every export format.
parquet_path = masters_dir / f"{base_filename}.parquet"
pkl_path = masters_dir / f"{base_filename}.pkl"
csv_path = masters_dir / f"{base_filename}.csv"
json_path = masters_dir / f"{base_filename}.json"
excel_path = masters_dir / f"{base_filename}.xlsx"

# (label, target path, writer) for each format, in export order:
#   1. Parquet - recommended for data science (fast, efficient, preserves dtypes)
#   2. Pickle  - fastest load/save, preserves all pandas dtypes
#                (pickle files execute code when loaded: only share with trusted users)
#   3. CSV     - universal compatibility
#   4. JSON    - web-friendly, human-readable
#   5. Excel   - business-friendly, good for sharing
export_plan = [
    ("Parquet", parquet_path,
     lambda path: match_stats_complete.to_parquet(path, index=False)),
    ("Pickle", pkl_path,
     lambda path: match_stats_complete.to_pickle(path)),
    ("CSV", csv_path,
     lambda path: match_stats_complete.to_csv(path, index=False)),
    ("JSON", json_path,
     lambda path: match_stats_complete.to_json(path, orient='records', date_format='iso')),
    ("Excel", excel_path,
     lambda path: match_stats_complete.to_excel(path, index=False, engine='openpyxl')),
]

# Each format is attempted independently so one failure (for example a missing
# optional engine) does not prevent the remaining exports.
failed_formats = []
for label, out_path, write in export_plan:
    try:
        write(out_path)
        file_size = out_path.stat().st_size / 1024**2
        print(f"✓ {label} saved: {out_path.name} ({file_size:.1f} MB)")
    except Exception as e:
        failed_formats.append(label)
        print(f"✗ {label} failed: {e}")

print("\n" + "=" * 50)
# Report success only when every export succeeded; the banner used to be
# printed unconditionally, even if all five writers had failed.
if failed_formats:
    print(f"⚠️ Master dataset saved with errors - failed formats: {', '.join(failed_formats)}")
else:
    print("✅ Master dataset saved successfully!")
print(f"📁 Location: {masters_dir}")
print(f"📊 Dataset: {len(match_stats_complete):,} rows × {len(match_stats_complete.columns):,} columns")

✓ Parquet saved: match_stats_master_complete_v1.parquet (1.6 MB)
✓ Pickle saved: match_stats_master_complete_v1.pkl (10.2 MB)
✓ CSV saved: match_stats_master_complete_v1.csv (7.3 MB)
✓ JSON saved: match_stats_master_complete_v1.json (43.0 MB)
✓ Excel saved: match_stats_master_complete_v1.xlsx (6.2 MB)

✅ Master dataset saved successfully!
📁 Location: C:\Users\50230\OneDrive\Escritorio\Proyectos y trabajos\Personales\Pronósticos Football\data\prod\processed\masters
📊 Dataset: 5,711 rows × 236 columns
