In [5]:
import os
import pandas as pd

# === Paths ===
base_dir = "../Data/Silver"      # Folder where each team has its own subfolder
all_dir = "../Data/All"          # Output folder for the combined CSV files
os.makedirs(all_dir, exist_ok=True)

# Accumulators filled by the loop below: one DataFrame per team file found,
# and one {"team_name": ...} record per team folder.
all_players = []
all_matches = []
teams = []

# === Loop through team folders ===
# The listing is sorted because os.listdir returns entries in arbitrary order
# and team_id is later derived from the position of each team in `teams`;
# sorting keeps those ids stable from one run (or machine) to the next.
for team_name in sorted(os.listdir(base_dir)):
    team_folder = os.path.join(base_dir, team_name)
    if not os.path.isdir(team_folder):
        # Stray files sitting next to the team folders are ignored.
        continue

    # Save team info (recorded even when the folder has no CSV files)
    teams.append({"team_name": team_name})

    players_path = os.path.join(team_folder, "players_cleaned.csv")
    matches_path = os.path.join(team_folder, "matches_cleaned.csv")

    if os.path.exists(players_path):
        players_df = pd.read_csv(players_path)
        players_df["team_name"] = team_name
        all_players.append(players_df)

    if os.path.exists(matches_path):
        matches_df = pd.read_csv(matches_path)

        # Keep only rows whose Venue is "home" (case-insensitive) to avoid
        # duplicates. Files without a Venue column are kept unfiltered.
        if "Venue" in matches_df.columns:
            is_home = matches_df["Venue"].str.lower().eq("home")
            # .copy() makes the filtered rows an independent frame, so adding
            # the team_name column below does not write into a slice.
            matches_df = matches_df[is_home].copy()

        # Add team name (for reference)
        matches_df["team_name"] = team_name

        all_matches.append(matches_df)


# === Concatenate ===
# Stack the per-team frames into one players frame and one matches frame,
# renumbering the index from 0.
all_players_df, all_matches_df = (
    pd.concat(frames, ignore_index=True) for frames in (all_players, all_matches)
)

# === Create Teams DataFrame ===
# One row per distinct team name, with a 1-based team_id as the first column.
teams_df = pd.DataFrame(teams).drop_duplicates().reset_index(drop=True)
team_count = len(teams_df)
teams_df.insert(0, "team_id", range(1, team_count + 1))

for label, frame in (
    ("Players", all_players_df),
    ("Matches", all_matches_df),
    ("Teams", teams_df),
):
    print(f"{label} shape: {frame.shape}")

# === Optional: save for the ready phase ===
for file_name, frame in (
    ("all_players.csv", all_players_df),
    ("all_matches.csv", all_matches_df),
    ("teams.csv", teams_df),
):
    frame.to_csv(os.path.join(all_dir, file_name), index=False)


Players shape: (702, 17)
Matches shape: (488, 18)
Teams shape: (20, 2)


In [6]:
import pandas as pd
import os



# === Load data ===
# Re-read the three combined CSV files from all_dir, in this order:
# players, matches, teams.
all_players_df, all_matches_df, teams_df = (
    pd.read_csv(os.path.join(all_dir, file_name))
    for file_name in ("all_players.csv", "all_matches.csv", "teams.csv")
)

# 1️⃣  SAISON TABLE

# A single hard-coded season row.
saison_df = pd.DataFrame({"saison_id": [1], "year": ["2024-2025"]})

# 2️⃣  COMPETITION TABLE
# One row per distinct non-null value of the "Comp" column, numbered from 1
# in order of first appearance.
competition_names = all_matches_df["Comp"].dropna().unique()
competition_df = pd.DataFrame({"competition_name": competition_names})
competition_df.insert(0, "competition_id", range(1, len(competition_df) + 1))

# 3️ TEAM TABLE (link to saison)
# Every team is attached to the single season above; no competition link is
# stored on the team rows.
teams_df = teams_df.assign(saison_id=1)

# 4️⃣  PLAYER + PLAYER_STATISTICS TABLES
# Player identity
# A player is identified by the full combination below, not by name alone:
# the same name can appear in more than one team folder.
identity_cols = ["Player", "Nation", "Age", "Pos", "team_name"]

player_keys = (
    all_players_df[identity_cols]
    .drop_duplicates()
    .merge(teams_df[["team_id", "team_name"]], on="team_name", how="left")
)
player_keys.insert(0, "player_id", range(1, len(player_keys) + 1))

# Public player table: team_name is replaced by its team_id foreign key.
player_df = player_keys.drop(columns=["team_name"])

# Player statistics (aggregate-level, not per match)
stat_cols = [c for c in all_players_df.columns if c not in identity_cols]

# Join on the whole identity key. Joining on "Player" alone would pair every
# stats row with every player_id sharing that name, duplicating rows and
# attaching statistics to the wrong player.
player_statistics_df = all_players_df.merge(
    player_keys[["player_id"] + identity_cols],
    on=identity_cols,
    how="left",
)[["player_id"] + stat_cols]

# Each source row must map to exactly one player_id.
assert len(player_statistics_df) == len(all_players_df), "player join changed the row count"
assert player_statistics_df["player_id"].notna().all(), "stats row without a player_id"



# 5️⃣ MATCH TABLE
# Create match table using only the "Home" matches we kept earlier

match_df = all_matches_df.copy()

# Only the key columns of teams_df are used for the lookups below, so other
# team attributes (e.g. saison_id) are not dragged into the match rows.
team_keys = teams_df[["team_id", "team_name"]]

# Merge to get the team_id for the main team (the one whose folder we scraped)
match_df = match_df.merge(team_keys, on="team_name", how="left")

# Clean and map opponent name to team_id (if opponent exists in teams_df)
match_df["Opponent"] = match_df["Opponent"].astype(str).str.strip()
match_df = match_df.merge(
    team_keys.rename(columns={"team_id": "opponent_id", "team_name": "Opponent"}),
    on="Opponent",
    how="left",
)
# Opponents that are not in teams_df have no id. The nullable Int64 dtype keeps
# the matched ids integral (3, not 3.0) while leaving the others as <NA>.
match_df["opponent_id"] = match_df["opponent_id"].astype("Int64")

# Map each match to its competition through the "Comp" column instead of
# assigning competition 1 to every row: competition_df holds one id per
# distinct Comp value. Rows with a missing Comp get <NA>.
match_df = match_df.merge(
    competition_df[["competition_id", "competition_name"]].rename(
        columns={"competition_name": "Comp"}
    ),
    on="Comp",
    how="left",
)
match_df["competition_id"] = match_df["competition_id"].astype("Int64")

# Create match_id (all merges above are left joins, so rows are still in the
# order of all_matches_df)
match_df.insert(0, "match_id", range(1, len(match_df) + 1))

# Standardize attendance (remove thousands separators and cast to integer)
if "Attendance" in match_df.columns:
    attendance_text = match_df["Attendance"].astype(str).str.replace(",", "", regex=False)
    # errors="coerce" already turns "", "nan" and any other non-numeric text
    # into NaN, so no separate replace step is needed. The former
    # .replace("", None) was also risky: how pandas treats a None replacement
    # value has varied between versions (NOTE(review): confirm for the pinned
    # pandas version if it is ever reinstated).
    match_df["Attendance"] = (
        pd.to_numeric(attendance_text, errors="coerce").round().astype("Int64")
    )

# Assign saison (for now we assume a single season)
match_df["saison_id"] = 1

# Select columns relevant to the 'match' table
match_cols = [
    "match_id", "team_id", "opponent_id", "datetime",
    "Attendance", "Referee", "saison_id", "competition_id"
]
match_df = match_df[match_cols]

# 6️⃣ MATCH RESULT TABLE
# Each match_id has its result stats (GF, GA, etc.)

# Keep relevant result columns
result_cols = ["match_id", "GF", "GA", "Result", "xG", "xGA"]

match_result_df = (
    all_matches_df
    # Resolve the scraped team's name to its team_id ...
    .merge(teams_df[["team_id", "team_name"]], on="team_name", how="left")
    # ... then look up the match_id of the row sharing that (datetime, team_id).
    .merge(
        match_df[["match_id", "datetime", "team_id"]],
        on=["datetime", "team_id"],
        how="left",
    )
    .loc[:, result_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

#  SAVE ALL TABLES

glod_dir = "../Data/Gold"
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (same handling as all_dir in the first cell).
os.makedirs(glod_dir, exist_ok=True)

# Output file name -> table, written in this order.
gold_tables = {
    "saison.csv": saison_df,
    "competition.csv": competition_df,
    "team.csv": teams_df,
    "player.csv": player_df,
    "player_statistics.csv": player_statistics_df,
    "match.csv": match_df,
    "match_result.csv": match_result_df,
}
for file_name, table in gold_tables.items():
    table.to_csv(os.path.join(glod_dir, file_name), index=False)

print("All relational DataFrames successfully created and saved!")


All relational DataFrames successfully created and saved!
