In [4]:
import sqlite3
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
import unicodedata
from pathlib import Path
from rapidfuzz import fuzz, process
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
import time

In [5]:
! pip install rapidfuzz
! pip install geopy




In [26]:
TEAM_CSV_PATH = './data/mapping/teams_map.csv'
LOGOS_BASE_PATH = './data/logos'
OUTPUT_CSV = './data/mapping/team_logo_mapping.csv'

In [33]:
import os
import shutil
from pathlib import Path

HISTORY_FOLDER = './data/history'
TARGET_FOLDER = './data/logos'

os.makedirs(TARGET_FOLDER, exist_ok=True)

for season_folder in Path(HISTORY_FOLDER).iterdir():
    if not season_folder.is_dir():
        continue

    for league_folder in season_folder.iterdir():
        if not league_folder.is_dir():
            continue

        # Extract "Country - League" from folder name
        country_league = league_folder.name.strip()

        for logo_file in league_folder.glob("*.png"):
            team_name = logo_file.stem.strip()
            new_filename = f"{team_name} - {country_league}.png"
            destination = Path(TARGET_FOLDER) / new_filename

            # Avoid overwriting if there's a duplicate
            if not destination.exists():
                shutil.copy2(logo_file, destination)

print(f"✅ Logos flattened into {TARGET_FOLDER}")


✅ Logos flattened into ./data/logos


In [49]:
# Re-import required libraries since the environment was reset
import os
from pathlib import Path
import shutil
import unicodedata
import pandas as pd
from rapidfuzz import fuzz

# Paths
FLATTENED_FOLDER = "./data/logos"
DEDUPED_OUTPUT_FOLDER = "./data/logos_flat"

# Ensure output folder exists
os.makedirs(DEDUPED_OUTPUT_FOLDER, exist_ok=True)

# === Normalization + Deduplication Utilities ===

def normalize_team_name(name):
    """Cleans and standardizes a team name for fuzzy comparison."""
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    name = name.lower()
    for word in ["fc", "sc", "ac", "cf", "afc", ".", ",", "-", "_"]:
        name = name.replace(word, "")
    return name.strip()

def deduplicate_logo_filenames(file_list, threshold=90):
    """
    Groups logo filenames (without extension) by fuzzy similarity.
    Returns a map: original_filename → canonical_name
    """
    canonical_map = {}
    used = set()

    for i, file in enumerate(file_list):
        if file in used:
            continue

        canon = file
        norm_canon = normalize_team_name(canon)
        canonical_map[file] = canon
        used.add(file)

        for other in file_list[i+1:]:
            if other in used:
                continue
            norm_other = normalize_team_name(other)
            score = fuzz.ratio(norm_canon, norm_other)

            if score >= threshold:
                canonical_map[other] = canon
                used.add(other)

    return canonical_map

# Get logo files and names
flattened_logo_files = [f for f in os.listdir(FLATTENED_FOLDER) if f.endswith('.png')]
flattened_logo_names = [Path(f).stem for f in flattened_logo_files]

# Apply deduplication
deduped_map = deduplicate_logo_filenames(flattened_logo_names, threshold=90)

# Write deduplicated logos into DEDUPED_OUTPUT_FOLDER
written = set()
for original_name, canonical_name in deduped_map.items():
    original_file = Path(FLATTENED_FOLDER) / f"{original_name}.png"
    output_file = Path(DEDUPED_OUTPUT_FOLDER) / f"{canonical_name}.png"

    if canonical_name not in written and original_file.exists():
        shutil.copy2(original_file, output_file)
        written.add(canonical_name)

# Return a summary
{
    "Original logos": len(flattened_logo_files),
    "Unique deduplicated logos": len(written),
    "Output folder": DEDUPED_OUTPUT_FOLDER
}


{'Original logos': 502,
 'Unique deduplicated logos': 502,
 'Output folder': './data/logos_flat'}

In [55]:
import os
from pathlib import Path
from rapidfuzz import process, fuzz

teams_map_df = pd.read_csv("./data/mapping/teams_map.csv")


# Load deduplicated logo filenames and parse team + metadata
deduped_folder = "./data/logos"
deduped_entries = []

if os.path.exists(deduped_folder):
    for f in os.listdir(deduped_folder):
        if f.endswith(".png"):
            parts = Path(f).stem.split(" - ")
            if len(parts) >= 3:
                deduped_entries.append({
                    "deduped_team": parts[0].strip(),
                    "country": parts[1].strip(),
                    "league": parts[2].strip(),
                    "logo_filename": f
                })

deduped_df = pd.DataFrame(deduped_entries)

# Match each team_long_name to the deduped logos
def normalize(name):
    if pd.isna(name):
        return ""
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8').lower().replace("fc", "").replace("sc", "").replace("ac", "").replace("-", "").replace(",", "").strip()

results = []
for team in teams_map_df["team_long_name"].dropna().unique():
    best_match, score, _ = process.extractOne(
        normalize(team),
        deduped_df['deduped_team'].apply(normalize),
        scorer=fuzz.ratio
    )
    matched_row = deduped_df[deduped_df['deduped_team'].apply(normalize) == normalize(best_match)].iloc[0]
    results.append({
        "team_long_name": team,
        "matched_team": best_match,
        "match_score": score,
        "country": matched_row["country"],
        "league": matched_row["league"],
        "logo_filename": matched_row["logo_filename"]
    })

# Merge enriched results back into the original teams_map_df
enriched_df = pd.DataFrame(results)
final_teams_df = teams_map_df.merge(enriched_df, on="team_long_name", how="left")
final_teams_df.to_csv("./data/mapping/team_logo_mapping.csv", index=False)

In [56]:
# Load the uploaded team_logo_mapping.csv which should contain country information
geo_df = pd.read_csv("./data/mapping/team_logo_mapping.csv")


from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import unicodedata

# Prepare geolocator
geolocator = Nominatim(user_agent="team-geocoder", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Function to clean team names for geocoding
def clean_team_name(name):
    if pd.isna(name):
        return ""
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    words_to_remove = ['fc', 'sc', 'ac', 'cf', 'afc', 'sporting', 'club', 'rsc', 'krc', 'kaa', 'ksc', 'raec' , 'sv' , 'ksv', 'cercle', 'athletic' ]
    for word in words_to_remove:
        name = name.lower().replace(word, '')
    return name.strip().title()

# Clean the team name and build query again
geo_df["cleaned_query"] = geo_df.apply(lambda row: f"{clean_team_name(row['team_long_name'])}, {row['country']}", axis=1)

# Add empty columns for lat/lon
geo_df["latitude"] = None
geo_df["longitude"] = None

# Run geocoding
for i, row in geo_df.iterrows():
    try:
        location = geocode(row["cleaned_query"])
        if location:
            geo_df.at[i, "latitude"] = location.latitude
            geo_df.at[i, "longitude"] = location.longitude
            print(f"✓ {row['cleaned_query']} → {location.latitude}, {location.longitude}")
        else:
            print(f"⚠️ No result for {row['cleaned_query']}")
    except Exception as e:
        print(f"❌ Error geocoding {row['cleaned_query']}: {e}")

# Save results
output_path = "./data/mapping/team_logo_mapping_geocoded.csv"
geo_df.to_csv(output_path, index=False)


✓ Genk, Belgium → 50.9654864, 5.5001456
⚠️ No result for Beerhot, Belgium
⚠️ No result for Sv Zulte-Waregem, Belgium
⚠️ No result for Lokeren, Portugal
⚠️ No result for Ksv Cercle Brugge, Belgium
✓ R Anderlecht, Belgium → 50.845386412012935, 4.340770380198071
✓ Gent, Belgium → 51.0538286, 3.7250121
✓ Mons, France → 43.6118557, 1.5721111
⚠️ No result for V Dender Eh, Belgium
✓ Standard De Liege, Belgium → 50.61095765, 5.54393415
✓ Kv Mechelen, Belgium → 51.0371829, 4.4863913
✓ Brugge Kv, Belgium → 51.2085526, 3.226772
✓ Ksv Roeselare, Belgium → 50.9520177, 3.1047304
✓ Kv Kortrijk, Belgium → 50.8276429, 3.2659884
⚠️ No result for Tubize, Portugal
⚠️ No result for Royal Excel Mouron, Belgium
✓ Kvc Westerlo, Belgium → 51.094091, 4.92812145796815
✓ Charleroi, Belgium → 50.4116233, 4.444528
⚠️ No result for Sint-Truidense Vv, Belgium
⚠️ No result for Lierse Sk, Norway
✓ Kas Eupen, Belgium → 50.6263653, 6.0453424
✓ Oud-Heverlee Leuven, Belgium → 50.8376275, 4.6629253
⚠️ No result for Waasland