In [15]:
# Tennis Map with Flags and Surface Color Coding

import pandas as pd
from geopy.geocoders import Nominatim
import folium
from folium.plugins import MarkerCluster, MiniMap
import pycountry
import time
import os

# Load data
df = pd.read_csv("/Users/karine/SmashData-1/Milestone 1/match_tennis_68_ajd.csv")

# Manual mapping of last names to country names
name_to_country = {
    "Acasuso": "Argentina",
    "Agassi": "United States",
    "Almagro": "Spain",
    "Andujar": "Spain",
    "Bedene": "Slovenia",
    "Berdych": "Czech Republic",
    "Berlocq": "Argentina",
    "Bolelli": "Italy",
    "Bourgue": "France",
    "Brands": "Germany",
    "Cilic": "Croatia",
    "Coria": "Argentina",
    "Cuevas": "Uruguay",
    "Davydenko": "Russia",
    "Delbonis": "Argentina",
    "Delgado": "Paraguay",
    "Delic": "United States",
    "Devvarman": "India",
    "Dolgopolov": "Ukraine",
    "Donskoy": "Russia",
    "Eschauer": "Austria",
    "Falla": "Colombia",
    "Federer": "Switzerland",
    "Fognini": "Italy",
    "Gimeno-Traver": "Spain",
    "Gonzalez": "Chile",
    "Hanescu": "Romania",
    "Hernandez": "Spain",
    "Ito": "Japan",
    "Karlovic": "Croatia",
    "Kiefer": "Germany",
    "Krajinovic": "Serbia",
    "Lapentti": "Ecuador",
    "Llodra": "France",
    "Lopez": "Spain",
    "Lorenzi": "Italy",
    "Martin": "Spain", 
    "Massu": "Chile",
    "Monaco": "Argentina",
    "Montanes": "Spain",
    "Nadal": "Spain",
    "Nalbandian": "Argentina",
    "Nieminen": "Finland",
    "Paire": "France",
    "Phau": "Germany",
    "Pico": "Argentina",
    "Pospisil": "Canada",
    "Ram": "United States",
    "Roddick": "United States",
    "Rufin": "France",
    "Russell": "United States",
    "Sela": "Israel",
    "Serra": "France",
    "Simon": "France",
    "Smyczek": "United States",
    "Spadea": "United States",
    "Stepanek": "Czech Republic",
    "Stoppini": "Italy",
    "Tabilo": "Chile",
    "Tomic": "Australia",
    "Troicki": "Serbia",
    "Volandri": "Italy",
    "Wawrinka": "Switzerland",
    "Youzhny": "Russia",
    "Kohlschreiber": "Germany",       # Philipp Kohlschreiber
    "Hewitt": "Australia",            # Lleyton Hewitt (you wrote "hewitt l")
    "Clement": "France",              # Arnaud Clément ("celement a" typo)
    "Gasquet": "France",              # Richard Gasquet ("gaquet r" typo)
    "Corretja": "Spain",              # Alex Corretja
    "Zverev": "Germany",
    "Ferrer": "Spain",
    "Norrie": "United Kingdom",  # Cameron Norrie (often appears as "Norri C.")
    "Korda": "United States",
    "Moya": "Spain",           # Carlos Moyá
    "Lajovic": "Serbia",       # Dušan Lajović (you wrote "lajovc d")
    "Djokovic": "Serbia",      # Novak Djokovic (you wrote "djokpvic")
    "Sinner": "Italy",
    "Estrella": "Dominican Republic",
    "Ruusuvuori": "Finland" 
}


# Extract last name
def extract_last_name(full_name):
    return full_name.split()[0] if isinstance(full_name, str) else ""

# Country name to flag emoji
def country_to_flag(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        code = country.alpha_2
        return ''.join(chr(0x1F1E6 + ord(c) - ord('A')) for c in code.upper())
    except:
        return ''

# Count tournaments by location
location_counts = df["Location"].value_counts().reset_index()
location_counts.columns = ["Location", "Occurrences"]

# Top winner per location
top_players = (
    df.groupby("Location")["Winner"]
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)
top_players.columns = ["Location", "TopPlayer"]

# Add last name, country, flag
top_players["LastName"] = top_players["TopPlayer"].apply(extract_last_name)
top_players["Country"] = top_players["LastName"].map(name_to_country).fillna("Unknown")
top_players["Flag"] = top_players["Country"].apply(lambda c: country_to_flag(c) if c != "Unknown" else "")

# Merge top players into location counts
location_counts = location_counts.merge(top_players, on="Location", how="left")

# Most frequent surface per location
surface_info = (
    df.groupby("Location")["Surface"]
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)
location_counts = location_counts.merge(surface_info, on="Location", how="left")

# Surface color mapping
surface_colors = {
    "Hard": "#4A90E2",    # Bright faded blue (soft sky blue)
    "Clay": "#CC6600",    # Burnt orange / terracotta (classic clay color)
    "Grass": "#2E8B57",   # Grass green (sea green, rich and natural)
    "Carpet": "#7F3FBF"   # Purple (unchanged but softened if needed)
}

# Load or initialize geocoding cache
cache_file = "geocoded_locations_mens.csv"
if os.path.exists(cache_file):
    cache = pd.read_csv(cache_file)
else:
    cache = pd.DataFrame(columns=["Location", "Latitude", "Longitude"])

# Geocoder setup
geolocator = Nominatim(user_agent="tennis_mapper")

# Geocode with cache
def geocode_location(town):
    if town in cache["Location"].values:
        row = cache[cache["Location"] == town].iloc[0]
        return pd.Series([row["Latitude"], row["Longitude"]])
    else:
        try:
            location = geolocator.geocode(town)
            time.sleep(1)
            if location:
                lat, lon = location.latitude, location.longitude
                cache.loc[len(cache)] = [town, lat, lon]
                cache.to_csv(cache_file, index=False)
                return pd.Series([lat, lon])
        except Exception as e:
            print(f"Error geocoding {town}: {e}")
        return pd.Series([None, None])

# Apply geocoding
location_counts[["Latitude", "Longitude"]] = location_counts["Location"].apply(geocode_location)
location_counts.dropna(subset=["Latitude", "Longitude"], inplace=True)

# Create map
m = folium.Map(location=[20, 0], zoom_start=2, control_scale=True)
m.add_child(MiniMap(toggle_display=True, position="bottomright"))
marker_cluster = MarkerCluster(name="Tournament Locations").add_to(m)

# Normalize radius
max_occurrences = location_counts["Occurrences"].max()

# Add markers
for _, row in location_counts.iterrows():
    popup_html = (
        f"<div style='width: 220px;'>"
        f"<b>{row['Location']}</b><br>"
        f"Surface: {row['Surface']}<br>"
        f"Matches: {int(row['Occurrences'])}<br>"
        f"Top Player: {row['Flag']} {row['TopPlayer']}"
        f"</div>"
    )

    radius = (row["Occurrences"] / max_occurrences) * 20
    marker_color = surface_colors.get(row["Surface"], "gray")

    folium.CircleMarker(
        location=[row["Latitude"], row["Longitude"]],
        radius=radius,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=popup_html,
        tooltip=row["Location"]
    ).add_to(marker_cluster)

# Save map
m.save("tennis_map_mens_with_flags.html")
m



Columns (14,20,21,27,46,47,51) have mixed types. Specify dtype option on import or set low_memory=False.



In [10]:
import pandas as pd
import plotly.graph_objects as go

# Load your CSV data
file_path = "/Users/karine/SmashData-1/Milestone 1/match_tennis_68_ajd.csv"
df = pd.read_csv(file_path)

# --- Step 1: Clean & Compute Statistics ---
# Focus on top 5 players by total wins
top_players = df['Winner'].value_counts().head(5).index.tolist()

# Ensure WRank is numeric
df['WRank'] = pd.to_numeric(df['WRank'], errors='coerce')

# Prepare statistics for radar plot
player_stats = []
for player in top_players:
    subset = df[df['Winner'] == player]

    avg_aces = pd.to_numeric(subset['W1'], errors='coerce').mean(skipna=True)  # proxy for ace if missing real ace column
    avg_duration = subset.apply(
        lambda row: sum([
            pd.to_numeric(row.get(f"W{i}"), errors='coerce') +
            pd.to_numeric(row.get(f"L{i}"), errors='coerce')
            for i in range(1, 6)
        ]), axis=1).mean() * 5  # estimate minutes

    win_sets = pd.to_numeric(subset['Wsets'], errors='coerce').mean(skipna=True)
    rank_avg = subset['WRank'].mean(skipna=True)

    player_stats.append({
        "Player": player,
        "Estimated Minutes": avg_duration,
        "Avg Sets Won": win_sets,
        "Avg W1 Games": avg_aces,
        "Avg WRank": rank_avg
    })

# Convert to DataFrame
stats_df = pd.DataFrame(player_stats).set_index("Player")

# Normalize data to 0–100
def normalize(series):
    return 100 * (series - series.min()) / (series.max() - series.min())

normalized = stats_df.apply(normalize)

# --- Step 2: Plot Radar Chart ---
categories = normalized.columns.tolist()

fig = go.Figure()
for player in normalized.index:
    fig.add_trace(go.Scatterpolar(
        r=normalized.loc[player].tolist() + [normalized.loc[player].tolist()[0]],
        theta=categories + [categories[0]],
        fill='toself',
        name=player
    ))

fig.update_layout(
    title="Top 5 Players – Normalized Match Performance",
    polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
    showlegend=True,
    template="plotly_white",
    height=600
)

fig.show()

  df = pd.read_csv(file_path)


In [2]:
import pandas as pd
import os

# Set the folder containing the CSVs
folder_path = "/Users/karine/SmashData-1/csv_data/wta_womens_tour"
all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Read and concatenate all files
df_list = [pd.read_csv(file) for file in all_files]
wta_df = pd.concat(df_list, ignore_index=True)

# Optional: save merged file for later use
wta_df.to_csv("merged_wta_matches.csv", index=False)


In [4]:
import pandas as pd
from geopy.geocoders import Nominatim
import folium
from folium.plugins import MarkerCluster, MiniMap
import pycountry
import time
import os

# Load and inspect WTA data
df = pd.read_csv("merged_wta_matches.csv")

# Basic cleaning: drop missing or empty location/winner entries
df = df[df["Location"].notna() & df["Winner"].notna()]
df = df[df["Location"].str.strip() != ""]

# Optional: Print to debug
print("Unique Locations:", df["Location"].nunique())
print("Unique Surfaces:", df["Surface"].unique())

# Manual mapping of last names to countries (extend as needed)
name_to_country = {
    "Williams": "United States",     # Venus or Serena
    "Halep": "Romania",
    "Osaka": "Japan",
    "Swiatek": "Poland",
    "Barty": "Australia",
    "Kvitova": "Czech Republic",
    "Wozniacki": "Denmark",
    "Azarenka": "Belarus",
    "Kerber": "Germany",
    "Radwanska": "Poland",
    "Sharapova": "Russia",
    "Muguruza": "Spain",
    "Jabeur": "Tunisia",
    "Sakkari": "Greece",
    "Sabalenka": "Belarus",
    "Gauff": "United States",
    "Rybakina": "Kazakhstan",
    "Kasatkina": "Russia",
    "Zheng": "China"
}

# Utility functions
def extract_last_name(full_name):
    return full_name.split()[0] if isinstance(full_name, str) else ""

def country_to_flag(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        code = country.alpha_2
        return ''.join(chr(0x1F1E6 + ord(c) - ord('A')) for c in code.upper())
    except:
        return ''

# Compute tournament statistics
location_counts = df["Location"].value_counts().reset_index()
location_counts.columns = ["Location", "Occurrences"]

top_players = df.groupby("Location")["Winner"] \
                .agg(lambda x: x.value_counts().index[0]) \
                .reset_index()
top_players.columns = ["Location", "TopPlayer"]

top_players["LastName"] = top_players["TopPlayer"].apply(extract_last_name)
top_players["Country"] = top_players["LastName"].map(name_to_country).fillna("Unknown")
top_players["Flag"] = top_players["Country"].apply(lambda c: country_to_flag(c) if c != "Unknown" else "")

location_counts = location_counts.merge(top_players, on="Location", how="left")

# Surface info
surface_info = df.groupby("Location")["Surface"] \
                 .agg(lambda x: x.value_counts().index[0]) \
                 .reset_index()
location_counts = location_counts.merge(surface_info, on="Location", how="left")

# Color mapping
surface_colors = {
    "Hard": "#4A90E2",    # Soft blue
    "Clay": "#CC6600",    # Burnt orange
    "Grass": "#2E8B57",   # Grass green
    "Carpet": "#7F3FBF"   # Purple
}

# Geocode cache
cache_file = "geocoded_locations_womens.csv"
if os.path.exists(cache_file):
    cache = pd.read_csv(cache_file)
else:
    cache = pd.DataFrame(columns=["Location", "Latitude", "Longitude"])

geolocator = Nominatim(user_agent="tennis_mapper_wta")

def geocode_location(town):
    if town in cache["Location"].values:
        row = cache[cache["Location"] == town].iloc[0]
        return pd.Series([row["Latitude"], row["Longitude"]])
    else:
        try:
            location = geolocator.geocode(town)
            time.sleep(1)
            if location:
                lat, lon = location.latitude, location.longitude
                cache.loc[len(cache)] = [town, lat, lon]
                cache.to_csv(cache_file, index=False)
                return pd.Series([lat, lon])
        except Exception as e:
            print(f"Error geocoding {town}: {e}")
        return pd.Series([None, None])

# Geocode each location
location_counts[["Latitude", "Longitude"]] = location_counts["Location"].apply(geocode_location)
location_counts.dropna(subset=["Latitude", "Longitude"], inplace=True)

# Create the map
m = folium.Map(location=[20, 0], zoom_start=2, control_scale=True)
m.add_child(MiniMap(toggle_display=True, position="bottomright"))
marker_cluster = MarkerCluster(name="WTA Tournaments").add_to(m)

max_occurrences = location_counts["Occurrences"].max()

# Add markers
for _, row in location_counts.iterrows():
    popup_html = (
        f"<div style='width: 220px;'>"
        f"<b>{row['Location']}</b><br>"
        f"Surface: {row['Surface']}<br>"
        f"Matches: {int(row['Occurrences'])}<br>"
        f"Top Player: {row['Flag']} {row['TopPlayer']}"
        f"</div>"
    )

    radius = (row["Occurrences"] / max_occurrences) * 20
    marker_color = surface_colors.get(row["Surface"], "gray")

    folium.CircleMarker(
        location=[row["Latitude"], row["Longitude"]],
        radius=radius,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=popup_html,
        tooltip=row["Location"]
    ).add_to(marker_cluster)

# Save output
m.save("tennis_map_womens_with_flags.html")
m



Columns (25,26,28) have mixed types. Specify dtype option on import or set low_memory=False.



Unique Locations: 132
Unique Surfaces: ['Hard' 'Carpet' 'Clay' 'Grass' 'Greenset']


In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load your CSV data
file_path = "/Users/karine/SmashData-1/Milestone 1/match_tennis_68_ajd.csv"
df = pd.read_csv(file_path)

# --- Step 1: Clean & Compute Statistics ---
# Focus on top 10 players by total wins
top_players = df['Winner'].value_counts().head(10).index.tolist()

# Ensure WRank is numeric
df['WRank'] = pd.to_numeric(df['WRank'], errors='coerce')

# Prepare statistics for radar plot
player_stats = []
for player in top_players:
    subset = df[df['Winner'] == player]

    avg_aces = pd.to_numeric(subset['W1'], errors='coerce').mean(skipna=True)  # proxy for ace if missing real ace column
    win_sets = pd.to_numeric(subset['Wsets'], errors='coerce').mean(skipna=True)
    rank_avg = subset['WRank'].mean(skipna=True)

    player_stats.append({
        "Player": player,
        "Avg Sets Won": win_sets,
        "Avg W1 Games": avg_aces,
        "Avg WRank": rank_avg
    })

# Convert to DataFrame
stats_df = pd.DataFrame(player_stats)

# Normalize selected columns to 0–100 for visual comparison
features = ['Avg Sets Won', 'Avg W1 Games', 'Avg WRank']
for feature in features:
    min_val = stats_df[feature].min()
    max_val = stats_df[feature].max()
    stats_df[feature] = 100 * (stats_df[feature] - min_val) / (max_val - min_val)

# Melt the DataFrame for plotly express compatibility
df_melted = stats_df.melt(id_vars="Player", var_name="Metric", value_name="Value")

# Create animated radar plot with dropdown menu
fig = px.line_polar(
    df_melted,
    r="Value",
    theta="Metric",
    color="Player",
    line_close=True,
    template="plotly_dark",
    height=650,
    animation_frame="Player"
)

fig.update_traces(fill="toself")
fig.update_layout(
    title="Interactive Radar Chart: Top 10 Tennis Players",
    polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
    showlegend=False,
    updatemenus=[{
        "buttons": [
            {
                "args": [None, {"frame": {"duration": 1000, "redraw": True}, "fromcurrent": True}],
                "label": "Play",
                "method": "animate"
            },
            {
                "args": [[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate"}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "type": "buttons",
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }]
)

fig.show()

  df = pd.read_csv(file_path)
