# 01 — Data Loading

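**Goal:** Load all 10 raw CSV files, confirm they are readable, and print shape / dtypes / null counts / unique-value counts per dataset, followed by quick sanity checks on the three core files (players, appearances, player valuations).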

**Outputs:** Nothing saved to disk — purely diagnostic.

**Next:** `02_eda_raw.ipynb`

In [1]:
import pandas as pd
import os

In [2]:
# Paths — relative to the notebooks/ folder
RAW_DATA_PATH = "../data/raw/"

# Fail fast when the raw-data folder is unavailable. isdir (rather than exists)
# also rejects a regular file sitting at this path, which os.listdir below
# would otherwise trip over with a less helpful error.
assert os.path.isdir(RAW_DATA_PATH), f"Folder not found: {RAW_DATA_PATH}"

# os.listdir returns entries in arbitrary order; sort for a stable listing.
print("Files in raw data folder:")
for f in sorted(os.listdir(RAW_DATA_PATH)):
    print(f"  {f}")

Files in raw data folder:
  appearances.csv
  club_games.csv
  clubs.csv
  competitions.csv
  game_events.csv
  game_lineups.csv
  games.csv
  player_valuations.csv
  players.csv
  transfers.csv


In [9]:
# Raw CSV files to load; each one is stored in `data` under its file name
# without the extension (e.g. "players.csv" -> data["players"]).
FILES = [
    "players.csv",
    "appearances.csv",
    "player_valuations.csv",
    "club_games.csv",
    "game_lineups.csv",
    "game_events.csv",
    "games.csv",
    "clubs.csv",
    "competitions.csv",
    "transfers.csv",
]

# Dataset name -> DataFrame. Rebuilt from scratch on every run of this cell,
# so re-running it never leaves stale entries behind.
data = {}
for filename in FILES:
    # splitext strips only the trailing extension; str.replace(".csv", "")
    # would also remove a ".csv" occurring elsewhere in the file name.
    name = os.path.splitext(filename)[0]
    path = os.path.join(RAW_DATA_PATH, filename)
    try:
        # low_memory=False turns off chunked parsing, which avoids
        # mixed-type inference within a column.
        frame = pd.read_csv(path, low_memory=False)
    except FileNotFoundError:
        # Best-effort diagnostic: report the gap and keep loading the rest.
        # A missing file leaves no entry in `data` for that dataset.
        print(f"MISSING  {filename}")
    else:
        data[name] = frame
        r, c = frame.shape
        print(f"OK       {filename:<35} {r:>10,} rows  {c:>3} cols")

OK       players.csv                             34,291 rows   23 cols
OK       appearances.csv                      1,722,865 rows   13 cols
OK       player_valuations.csv                  448,965 rows    6 cols
OK       club_games.csv                         155,990 rows   11 cols
OK       game_lineups.csv                     2,680,694 rows   10 cols
OK       game_events.csv                      1,100,360 rows   11 cols
OK       games.csv                               77,995 rows   23 cols
OK       clubs.csv                                  451 rows   17 cols
OK       competitions.csv                            44 rows   11 cols
OK       transfers.csv                           85,293 rows   10 cols


In [8]:
# Column-level profile of every loaded dataset: dtype, null count, null %, unique values
rule = "=" * 60  # banner line, the same for every dataset
for name, df in data.items():
    n_rows, n_cols = df.shape
    print(f"\n{rule}")
    print(f"  {name.upper()}  —  {n_rows:,} rows x {n_cols} cols")
    print(rule)

    # Count nulls once and derive the percentage from that same Series.
    null_count = df.isnull().sum()
    null_pct = (null_count / len(df) * 100).round(2)

    # One row per column of the dataset, one column per metric.
    summary = pd.DataFrame({
        "dtype":      df.dtypes,
        "null_count": null_count,
        "null_pct":   null_pct,
        "unique":     df.nunique(),
    })
    print(summary.to_string())


  PLAYERS  —  34,291 rows x 23 cols
                                        dtype  null_count  null_pct  unique
player_id                               int64           0      0.00   34291
first_name                                str        2138      6.23    7351
last_name                                 str           0      0.00   24855
name                                      str           0      0.00   33541
last_season                             int64           0      0.00      14
current_club_id                         int64           0      0.00     449
player_code                               str           0      0.00   33496
country_of_birth                          str        2916      8.50     186
city_of_birth                             str        2589      7.55    8828
country_of_citizenship                    str         355      1.04     186
date_of_birth                             str          49      0.14    9670
sub_position                              str      

In [7]:
# Spot-check the three core tables: players, appearances, valuations
players = data["players"]
apps = data["appearances"]
vals = data["player_valuations"]

# Players: row count plus the distribution over the `position` column.
print("PLAYERS")
print(f"  Total: {len(players):,}")
position_counts = players["position"].value_counts()
print(position_counts.to_string())

# Appearances: row count and the number of distinct player / game ids.
print("\nAPPEARANCES")
print(f"  Total rows:     {len(apps):,}")
n_app_players = apps["player_id"].nunique()
print(f"  Unique players: {n_app_players:,}")
n_app_games = apps["game_id"].nunique()
print(f"  Unique games:   {n_app_games:,}")

# Valuations: row count, distinct player ids, and the min/max market value.
print("\nVALUATIONS")
print(f"  Total rows:     {len(vals):,}")
n_val_players = vals["player_id"].nunique()
print(f"  Unique players: {n_val_players:,}")
mv = vals["market_value_in_eur"]
mv_low, mv_high = mv.min(), mv.max()
print(f"  Value range:    EUR {mv_low:,.0f} — EUR {mv_high:,.0f}")

PLAYERS
  Total: 34,291
position
Defender      10893
Midfield       9903
Attack         9400
Goalkeeper     3906
Missing         189

APPEARANCES
  Total rows:     1,722,865
  Unique players: 26,489
  Unique games:   70,120

VALUATIONS
  Total rows:     448,965
  Unique players: 31,375
  Value range:    EUR 0 — EUR 200,000,000
