# Notebook 1: Data Collection and Preprocessing

This notebook demonstrates how to use the `data_collection.py` and `data_preprocessing.py` modules to fetch and prepare sports data.

**Note:** To fetch real data from `football-data.org`, you need to set the `FOOTBALL_DATA_API_KEY` environment variable with your API token.

## 1. Setup and Imports

In [None]:
import os
import sys
from datetime import datetime, timedelta
import pandas as pd

# Make the project's custom modules importable. The project root is taken to be
# the parent of the current working directory (i.e. the notebook is expected to
# run from the 'notebooks' directory), and the modules live in '<root>/src'.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

try:
    from data_collection import get_matches_for_date
    from data_preprocessing import preprocess_match_data
except ImportError as import_error:
    # Report the failure together with the search path instead of raising, so
    # a broken path setup can be diagnosed directly from the cell output.
    for diagnostic in (
        f"Error importing modules: {import_error}",
        "Make sure 'src' is in sys.path and __init__.py files are present.",
        f"Current sys.path: {sys.path}",
    ):
        print(diagnostic)

import numpy as np # For np.select if used in example

## 2. Data Collection from football-data.org

In [None]:
# Read the football-data.org token from the environment. The placeholder value
# doubles as the "not configured" marker that is checked right below.
fd_api_key = os.environ.get("FOOTBALL_DATA_API_KEY", "YOUR_API_TOKEN")

if fd_api_key == "YOUR_API_TOKEN":
    # The request below is still attempted with the placeholder; this only
    # warns that the result will probably be empty.
    print(
        "Warning: FOOTBALL_DATA_API_KEY is not set. Using placeholder.",
        "Data fetching from football-data.org will likely return an empty list or fail.",
        "Please set the environment variable for actual data.",
        sep="\n",
    )

# Query yesterday's date (local clock), formatted as YYYY-MM-DD.
yesterday_fd = (datetime.now() - timedelta(days=1)).date().isoformat()
print(f"Fetching matches from football-data.org for: {yesterday_fd}")

raw_matches_data_fd = get_matches_for_date(yesterday_fd, api_key=fd_api_key)

if not raw_matches_data_fd:
    print("No matches fetched from football-data.org. This could be due to no games on that day, API limits, or an invalid/missing API key.")
else:
    print(f"Successfully fetched {len(raw_matches_data_fd)} matches from football-data.org.")
    # To inspect a single record, uncomment:
    # print("Example match data:", raw_matches_data_fd[0])

## 3. Data Collection from API-SPORTS (api-football.com)

This section demonstrates fetching data using `get_matches_from_apisports`.

**Note:** To fetch real data from API-SPORTS, you need to set the `APISPORTS_API_KEY` environment variable with your API key.

In [None]:
# Import the additional helpers used from here on. They are imported in this
# cell (rather than in the setup cell) so they are picked up even if they were
# added to the modules after the initial import ran.
try:
    from data_collection import get_matches_from_apisports
    from data_preprocessing import engineer_form_features  # used in the form-feature section
except ImportError as import_error:
    print(f"Error re-importing modules: {import_error}")

apisports_api_key = os.environ.get("APISPORTS_API_KEY")

if apisports_api_key:
    # Query today's date (local clock), formatted as YYYY-MM-DD.
    today_as = datetime.now().date().isoformat()
    print(f"Fetching matches from API-SPORTS for: {today_as}")
    raw_matches_data_as = get_matches_from_apisports(date_str=today_as, api_key=apisports_api_key)

    if not raw_matches_data_as:
        print("No matches fetched from API-SPORTS. This could be due to no games, API limits, or an invalid/missing API key.")
    else:
        print(f"Successfully fetched {len(raw_matches_data_as)} matches from API-SPORTS.")
        # To inspect a single record, uncomment:
        # print("Example match data from API-SPORTS:", raw_matches_data_as[0])
else:
    # No key (unset or empty): skip the request and leave an empty result so
    # that `raw_matches_data_as` is always defined.
    print(
        "Warning: APISPORTS_API_KEY is not set.",
        "Data fetching from API-SPORTS will be skipped or will use a placeholder if the function handles it.",
        sep="\n",
    )
    raw_matches_data_as = []

## 4. Data Preprocessing

Convert the raw list of matches (e.g., from football-data.org) into a Pandas DataFrame and perform initial cleaning/feature extraction.

In [None]:
# We'll use the data from football-data.org for this example.
raw_matches_data_to_process = raw_matches_data_fd

# Every branch below leaves `matches_df` bound to a DataFrame (possibly empty)
# so that subsequent cells can test `matches_df.empty` safely.
if isinstance(raw_matches_data_to_process, list) and raw_matches_data_to_process:
    matches_df = preprocess_match_data(raw_matches_data_to_process)
    if matches_df.empty:
        print("Preprocessing resulted in an empty DataFrame.")
        matches_df = pd.DataFrame()  # Ensure matches_df is an empty DF for subsequent cells
    else:
        # 'home_team_id', 'away_team_id' and 'utcDate' are crucial for the form
        # feature engineering later. If preprocessing did not produce the id
        # columns, back-fill them from the team columns (presumably the raw
        # nested team dicts -- verify against preprocess_match_data). The
        # back-fill is only attempted when the source column actually exists,
        # so a missing 'homeTeam'/'awayTeam' column no longer raises KeyError.
        for id_col, team_col in (('home_team_id', 'homeTeam'), ('away_team_id', 'awayTeam')):
            if id_col not in matches_df.columns and team_col in matches_df.columns:
                matches_df[id_col] = matches_df[team_col].apply(
                    lambda team: team.get('id') if isinstance(team, dict) else None
                )
        if 'utcDate' in matches_df.columns:
            matches_df['utcDate'] = pd.to_datetime(matches_df['utcDate'])

        # Display after the columns above are finalized so the preview shows
        # the frame that the following cells will actually use.
        print("Preprocessed DataFrame info (from football-data.org data):")
        matches_df.info()
        print("\nDataFrame head:")
        # Only show the display columns that are actually present.
        display_cols = ['home_team_name', 'away_team_name', 'home_team_score', 'away_team_score', 'utcDate', 'status', 'home_team_id', 'away_team_id']
        actual_display_cols = [col for col in display_cols if col in matches_df.columns]
        print(matches_df[actual_display_cols].head())
elif not raw_matches_data_to_process:
    # Covers None as well as empty containers.
    print("Skipping preprocessing as no raw match data was fetched (using football-data.org sample).")
    matches_df = pd.DataFrame()
else:
    # Only reachable for a non-empty value that is not a list.
    print(f"Skipping preprocessing as raw_matches_data_to_process is not a list (type: {type(raw_matches_data_to_process)}).")
    matches_df = pd.DataFrame()

## 5. Feature Engineering: Team Form

This section demonstrates using `engineer_form_features` to add team form statistics (W-D-L for last N games) to your match data.
This requires a DataFrame of "current/upcoming" matches (like `matches_df` above) and a DataFrame of "historical" matches.

In [None]:
# Attempt to load historical data from the sample CSV, falling back to inline
# dummy data when the file is missing, unreadable, or lacks required columns.
# `historical_df` is only replaced once the CSV has been fully validated and
# converted, so any failure leaves it empty and triggers the dummy fallback.
historical_df = pd.DataFrame()
historical_csv_path = os.path.join(project_root, 'data', 'historical_matches_sample.csv')
# Columns required by engineer_form_features
required_hist_cols = ['home_team_id', 'away_team_id', 'home_team_score', 'away_team_score', 'utcDate']

try:
    loaded_historical_df = pd.read_csv(historical_csv_path)
    # Validate before touching 'utcDate' so a missing column is reported as
    # such instead of surfacing as a KeyError.
    if not all(col in loaded_historical_df.columns for col in required_hist_cols):
        print(f"Historical CSV ({historical_csv_path}) is missing required columns. Will use inline dummy data.")
    else:
        loaded_historical_df['utcDate'] = pd.to_datetime(loaded_historical_df['utcDate'])
        historical_df = loaded_historical_df
        print(f"Successfully loaded historical data from {historical_csv_path}. Shape: {historical_df.shape}")
except FileNotFoundError:
    print(f"File not found: {historical_csv_path}. Will create inline dummy historical data.")
except Exception as e:
    # Best-effort load: any other read/parse error also falls back to dummy data.
    print(f"Error loading {historical_csv_path}: {e}. Will create inline dummy historical data.")

if historical_df.empty:
    # Create dummy historical data if CSV loading failed or file not found
    historical_data_dummy = {
        'match_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112],
        'utcDate': pd.to_datetime([
            "2023-01-01", "2023-01-05", "2023-01-10", "2023-01-15", "2023-01-20", "2023-01-25",
            "2023-01-02", "2023-01-06", "2023-01-11", "2023-01-16", "2023-01-21", "2023-01-26"
        ]),
        'home_team_id': [10, 11, 10, 12, 10, 13, 11, 10, 12, 11, 10, 13],
        'away_team_id': [11, 10, 12, 10, 13, 10, 12, 13, 11, 13, 11, 12],
        'home_team_score': [2, 1, 3, 0, 1, 2, 2, 2, 0, 3, 4, 1],
        'away_team_score': [1, 2, 0, 1, 1, 0, 2, 2, 1, 1, 1, 1]
    }
    historical_df = pd.DataFrame(historical_data_dummy)
    print("Using inline dummy historical data.")

# Prepare a sample `sample_processed_matches` frame for the form-feature demo.
# It needs 'home_team_id', 'away_team_id' and 'utcDate', plus the team name
# columns that are used when printing the results.
# We use a slice of our earlier `matches_df`, or create a dummy frame if
# `matches_df` is empty or lacks any of these columns.
sample_cols = ['home_team_id', 'away_team_id', 'utcDate', 'home_team_name', 'away_team_name']

# The guard checks exactly the columns selected below, so a frame without the
# name columns falls back to the dummy data instead of raising KeyError.
if not matches_df.empty and all(col in matches_df.columns for col in sample_cols):
    # Take a few matches, dropping rows where any id, date or name is missing.
    sample_processed_matches = matches_df[sample_cols].dropna().head(3).copy()
else:
    print("Original matches_df is empty or lacks required columns. Creating a dummy processed_matches_df for form feature demo.")
    # Two dummy matches dated today and tomorrow (local clock, midnight).
    sample_processed_matches_data = {
        'home_team_id': [10, 12],
        'away_team_id': [11, 13],
        'utcDate': pd.to_datetime([datetime.now().strftime("%Y-%m-%d"), (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")]),
        'home_team_name': ['Team Alpha', 'Team Charlie'],
        'away_team_name': ['Team Beta', 'Team Delta']
    }
    sample_processed_matches = pd.DataFrame(sample_processed_matches_data)

# Run the form-feature demo only when there is at least one sample match.
if sample_processed_matches.empty:
    print("\nSkipping form feature engineering demo as sample_processed_matches is empty.")
else:
    print("\nSample of processed matches (before form features):")
    print(sample_processed_matches[['home_team_name', 'away_team_name', 'utcDate']])

    # Add form statistics computed from the historical matches (last 5 games).
    # NOTE(review): 'utcDate' may be tz-aware in one frame and tz-naive in the
    # other -- confirm engineer_form_features tolerates that.
    matches_with_form = engineer_form_features(sample_processed_matches, historical_df, num_games=5)

    print("\nMatches DataFrame with Form Features:")
    # Show the identifying columns followed by every column whose name contains 'form'.
    form_cols = [name for name in matches_with_form.columns if 'form' in name]
    preview_cols = ['home_team_name', 'away_team_name', 'utcDate'] + form_cols
    print(matches_with_form[preview_cols].head())

## 6. Further Exploration (Placeholder)

At this stage, you would typically perform more in-depth exploratory data analysis (EDA) on the enriched DataFrame.
- Analyze distributions of scores, outcomes, and new form features.
- Engineer further features like: 
    - Head-to-head statistics between teams.
    - Team strength based on league position or ratings (e.g., Elo).
    - Home/Away advantages.

This often involves fetching more historical data for each team or joining with other datasets.

In [None]:
# Choose the frame to explore: the original matches_df when it has rows,
# otherwise matches_with_form (if an earlier cell created it and it has rows),
# otherwise an empty DataFrame.
if not matches_df.empty:
    df_to_explore = matches_df
elif 'matches_with_form' in locals() and not matches_with_form.empty:
    df_to_explore = matches_with_form
else:
    df_to_explore = pd.DataFrame()

if df_to_explore.empty:
    print("DataFrame is empty, skipping further exploration.")
else:
    print("Example: Value counts of match status (if available and processed):")
    if 'status' not in df_to_explore.columns:
        print("'status' column not found in the DataFrame.")
    else:
        print(df_to_explore['status'].value_counts())

    # Placeholder for creating a 'result' column (Home Win, Draw, Away Win).
    # Note: df_to_explore is the chosen frame itself (not a copy), so the new
    # column is added to that frame.
    if {'home_team_score', 'away_team_score'}.issubset(df_to_explore.columns):
        home_goals = pd.to_numeric(df_to_explore['home_team_score'], errors='coerce')
        away_goals = pd.to_numeric(df_to_explore['away_team_score'], errors='coerce')

        # 0: Home Win, 1: Draw, 2: Away Win. Rows where a score could not be
        # converted to a number match no condition and get NaN.
        result_codes = np.select(
            [home_goals > away_goals, home_goals == away_goals, home_goals < away_goals],
            [0, 1, 2],
            default=np.nan,
        )
        df_to_explore['result_label'] = pd.Series(result_codes, index=df_to_explore.index)
        print("\n'result_label' distribution (0=Home, 1=Draw, 2=Away):")
        print(df_to_explore['result_label'].value_counts(dropna=False))
    else:
        print("\nScore columns not found, cannot determine match result label.")