In [None]:
from data_preparation import load_raw_data

# Load every raw table, then keep a handle on the races table.
data = load_raw_data()
races = data['races']

# One row per distinct (name, year) pair found in the races table.
unique_combinations = races.loc[:, ['name', 'year']].drop_duplicates()

print(unique_combinations)


In [None]:
# Show each distinct (EventName, Year) pair present in the weather table.
weather_data = data['weather_data']
event_year_cols = ['EventName', 'Year']
distinct_values = weather_data[event_year_cols].drop_duplicates()
print(distinct_values)

In [None]:
import pandas as pd
import re
from data_preparation import load_raw_data
from thefuzz import process  # Optional for fuzzy matching

# Reload the raw tables and pick out the two that are being reconciled.
data = load_raw_data()
races, weather_data = data['races'], data['weather_data']  # Adjust the keys if different

# Preprocessing function
def preprocess_string(s):
    """Normalize an event name so that equivalent spellings compare equal.

    For string input the result is lower-cased, "grand prix" is shortened
    to "gp", punctuation is removed, and every run of whitespace is reduced
    to a single space with no leading or trailing space.

    Args:
        s: The value to normalize. Anything that is not a ``str`` (for
            example ``None`` or NaN) is returned unchanged.

    Returns:
        The normalized string, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s

    s = s.lower()
    # Collapse whitespace first so "grand   prix" is still recognized below.
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'grand prix', 'gp', s)
    s = re.sub(r'[^\w\s]', '', s)
    # Dropping punctuation can leave doubled or dangling spaces
    # ("a - b" -> "a  b", "gp !" -> "gp "), so collapse and strip last.
    return re.sub(r'\s+', ' ', s).strip()

# Add normalized name columns to use as join keys.
# Note: this writes the new columns onto the loaded DataFrames in place.
weather_data['EventName_clean'] = weather_data['EventName'].apply(preprocess_string)
races['name_clean'] = races['name'].apply(preprocess_string)

# Left join keeps every weather row; `_merge` records whether a races row
# with the same normalized name and year was found.
# NOTE(review): assumes 'Year' and 'year' hold comparable dtypes -- confirm.
merged = weather_data.merge(
    races,
    left_on=['EventName_clean', 'Year'],
    right_on=['name_clean', 'year'],
    how='left',
    indicator=True
)

event_keys = ['EventName', 'Year']
matched_rows = merged[merged['_merge'] == 'both']
unmatched_rows = merged[merged['_merge'] == 'left_only']

# Summary: count distinct (EventName, Year) pairs rather than rows, so the
# figures are per race even when a table holds several rows for one event.
total_weather_races = len(weather_data[event_keys].drop_duplicates())
matched_races = len(matched_rows[event_keys].drop_duplicates())
unmatched_races = len(unmatched_rows[event_keys].drop_duplicates())

print(f"Total races in weather data: {total_weather_races}")
print(f"Matched races: {matched_races}")
print(f"Unmatched races: {unmatched_races}")

# Handle unmatched races
if unmatched_races > 0:
    print("\nUnmatched Races:")
    print(unmatched_rows[event_keys].drop_duplicates())

    # Fuzzy matching (optional): suggest the closest races name for each
    # unmatched weather event. Missing names are dropped because they
    # cannot be compared as strings.
    unmatched_event_names = unmatched_rows['EventName_clean'].dropna().unique()
    races_event_names = races['name_clean'].dropna().unique()

    for event in unmatched_event_names:
        best = process.extractOne(event, races_event_names)
        if best is None:
            # No candidate available (e.g. the races name list is empty).
            print(f"Weather Event: {event} --> no candidate found in races data")
            continue
        match, score = best[:2]
        print(f"Weather Event: {event} --> Races Match: {match} (Score: {score})")
else:
    print("\nAll races in weather data are present in races data.")

# Final join (if all matched or after resolving mismatches)
final_merged = matched_rows
print("\nFinal Merged DataFrame:")
print(final_merged.head())
