In [7]:
import pandas as pd
import json
import numpy as np

import sys
# Add ../../src to the module search path so the import below can find
# `calculator` there -- presumably a project-local module; confirm its location.
# The entry is a relative path, so it depends on the kernel's working directory.
sys.path.append('../../src')
import calculator

In [8]:
# Remote locations of the two raw inputs. Both files sit in the same "raw"
# directory, so the shared prefix is written out once.
# NOTE(review): "final_proejct" is spelled this way in the original path --
# presumably it matches the directory name on the server; confirm before
# "correcting" it.
raw_data_root = "http://115.78.93.252/TTU/pool/donghuynh0/data_visualization_2025/final_proejct/raw"

url = f"{raw_data_root}/women_results.csv"
continent_url = f"{raw_data_root}/continent_mapping.json"

In [9]:
# Load the raw inputs: the match results table and the lookup that is later
# used to map team names to continents.
# typ='series' makes read_json return a Series instead of a DataFrame.

continent_mapping = pd.read_json(continent_url, typ='series')
df = pd.read_csv(url)

df.head()

Unnamed: 0.1,Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament
0,0,1969-11-01,Italy,France,1,0,Euro
1,1,1969-11-01,Denmark,England,4,3,Euro
2,2,1969-11-02,England,France,2,0,Euro
3,3,1969-11-02,Italy,Denmark,3,1,Euro
4,4,1975-08-25,Thailand,Australia,3,2,AFC Championship


In [10]:
# Remove the unnecessary 'Unnamed: 0' column (it appears to be a row index
# that was written into the CSV; its values mirror the DataFrame index).
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.

df = df.drop(columns='Unnamed: 0', errors='ignore')

df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament
0,1969-11-01,Italy,France,1,0,Euro
1,1969-11-01,Denmark,England,4,3,Euro
2,1969-11-02,England,France,2,0,Euro
3,1969-11-02,Italy,Denmark,3,1,Euro
4,1975-08-25,Thailand,Australia,3,2,AFC Championship


In [11]:
# Attach a continent to each side of the match by looking the team name up
# in continent_mapping. Team names missing from the mapping become NaN.
# 'home' is processed before 'away' so the new columns keep their order.

for side in ('home', 'away'):
    df[f'{side}_continent'] = df[f'{side}_team'].map(continent_mapping)

df.head(5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,home_continent,away_continent
0,1969-11-01,Italy,France,1,0,Euro,Europe,Europe
1,1969-11-01,Denmark,England,4,3,Euro,Europe,Europe
2,1969-11-02,England,France,2,0,Euro,Europe,Europe
3,1969-11-02,Italy,Denmark,3,1,Euro,Europe,Europe
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Asia,Oceania


In [12]:
# Goal difference column: the absolute margin between the two scores, so the
# value is never negative regardless of which side scored more.

score_gap = df['home_score'] - df['away_score']
df['goal_difference'] = score_gap.abs()

df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,home_continent,away_continent,goal_difference
0,1969-11-01,Italy,France,1,0,Euro,Europe,Europe,1
1,1969-11-01,Denmark,England,4,3,Euro,Europe,Europe,1
2,1969-11-02,England,France,2,0,Euro,Europe,Europe,2
3,1969-11-02,Italy,Denmark,3,1,Euro,Europe,Europe,2
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Asia,Oceania,1


In [13]:
# Winner column: the name of the team with the higher score, or the literal
# 'Draw' when neither side outscored the other.

home_wins = df['home_score'] > df['away_score']
away_wins = df['away_score'] > df['home_score']

# Resolve the "home did not win" case first, then layer the home win on top.
away_or_draw = np.where(away_wins, df['away_team'], 'Draw')
df['winner'] = np.where(home_wins, df['home_team'], away_or_draw)
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,home_continent,away_continent,goal_difference,winner
0,1969-11-01,Italy,France,1,0,Euro,Europe,Europe,1,Italy
1,1969-11-01,Denmark,England,4,3,Euro,Europe,Europe,1,Denmark
2,1969-11-02,England,France,2,0,Euro,Europe,Europe,2,England
3,1969-11-02,Italy,Denmark,3,1,Euro,Europe,Europe,2,Italy
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Asia,Oceania,1,Thailand


In [14]:
# Year column: parse 'date' into datetimes once, store the parsed values back
# on the frame, and take the calendar year from the same parsed series.

parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year

df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,home_continent,away_continent,goal_difference,winner,year
0,1969-11-01,Italy,France,1,0,Euro,Europe,Europe,1,Italy,1969
1,1969-11-01,Denmark,England,4,3,Euro,Europe,Europe,1,Denmark,1969
2,1969-11-02,England,France,2,0,Euro,Europe,Europe,2,England,1969
3,1969-11-02,Italy,Denmark,3,1,Euro,Europe,Europe,2,Italy,1969
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Asia,Oceania,1,Thailand,1975


In [15]:
# Win percentage per country, for its home and away matches.

# Every team name that appears on either side of a match, de-duplicated in
# order of first appearance.
team_names = df[['home_team', 'away_team']].values.ravel()
all_countries = pd.unique(team_names)

# One calculator.win_percentage result per country, collected into a frame.
results = []
for country in all_countries:
    results.append(calculator.win_percentage(df, country))

win_percent = pd.DataFrame(results)
win_percent.head(5)

Unnamed: 0,country,home_win_percent,away_win_percent,home matches,away_matches,total matches
0,Italy,62.39,48.72,109,117,226
1,France,64.76,48.6,105,107,212
2,Denmark,59.17,43.71,120,151,271
3,England,67.59,49.5,108,101,209
4,Thailand,70.59,34.0,51,50,101


In [12]:
# Summarize matches per country: once for the full data set and once for each
# per-continent frame, in the same order as the names on the left.
# NOTE(review): asia_df, africa_df, europe_df, north_america_df,
# south_america_df and oceania_df are not created in any cell shown here --
# confirm the cell that defines them runs before this one on a fresh kernel.

world, asia, africa, europe, north_america, south_america, oceania = (
    calculator.country_match_summary(frame)
    for frame in (
        df,
        asia_df,
        africa_df,
        europe_df,
        north_america_df,
        south_america_df,
        oceania_df,
    )
)

In [14]:
# Order every table so the rows with the most matches come first.

def _most_matches_first(frame, column="total_matches"):
    """Return ``frame`` sorted by ``column`` in descending order."""
    return frame.sort_values(by=column, ascending=False)


# win_percent names its column "total matches" (with a space), unlike the
# summary tables, so the column is passed explicitly here.
win_percent = _most_matches_first(win_percent, column="total matches")
world = _most_matches_first(world)
asia = _most_matches_first(asia)
africa = _most_matches_first(africa)
europe = _most_matches_first(europe)
north_america = _most_matches_first(north_america)
south_america = _most_matches_first(south_america)
# NOTE(review): only oceania is cut down to its first 5 rows after sorting --
# confirm this is intended, since the truncated frame is what gets saved.
oceania = _most_matches_first(oceania).head(5)

In [15]:
# Write every table to CSV in the current working directory, without the
# DataFrame index. The dict keeps file name and frame side by side and
# preserves the original write order.

csv_outputs = {
    'women_results.csv': df,
    'women_win_percent.csv': win_percent,
    'women_world.csv': world,
    'women_asia.csv': asia,
    'women_africa.csv': africa,
    'women_europe.csv': europe,
    'women_north_america.csv': north_america,
    'women_south_america.csv': south_america,
    'women_oceania.csv': oceania,
}

for filename, frame in csv_outputs.items():
    frame.to_csv(filename, index=False)