In [1]:
# !pip install openpyxl
import numpy as np
import pandas as pd
import warnings
import plotly.express as px

warnings.filterwarnings('ignore')

In [2]:
# Load the data from the provided files
athletes_df = pd.read_excel('data/Athletes.xlsx')
coaches_df = pd.read_excel('data/Coaches.xlsx')
entries_gender_df = pd.read_excel('data/EntriesGender.xlsx')
medals_df = pd.read_excel('data/Medals.xlsx')
teams_df = pd.read_excel('data/Teams.xlsx')
events_df = pd.read_csv('data/events.csv')

In [3]:
# Analyze the shape of each dataframe
print('Athletes DataFrame Shape:', athletes_df.shape)
print('Coaches DataFrame Shape:', coaches_df.shape)
print('Entries Gender DataFrame Shape:', entries_gender_df.shape)
print('Medals DataFrame Shape:', medals_df.shape)
print('Teams DataFrame Shape:', teams_df.shape)
print('Events DataFrame Shape:', events_df.shape)

Athletes DataFrame Shape: (11085, 3)
Coaches DataFrame Shape: (394, 4)
Entries Gender DataFrame Shape: (46, 4)
Medals DataFrame Shape: (93, 7)
Teams DataFrame Shape: (743, 4)
Events DataFrame Shape: (329, 5)


In [4]:
print("Athletes DataFrame:")
athletes_df.head()

Athletes DataFrame:


Unnamed: 0,Name,NOC,Discipline
0,AALERUD Katrine,Norway,Cycling Road
1,ABAD Nestor,Spain,Artistic Gymnastics
2,ABAGNALE Giovanni,Italy,Rowing
3,ABALDE Alberto,Spain,Basketball
4,ABALDE Tamara,Spain,Basketball


In [5]:
print("Coaches DataFrame:")
coaches_df.head()

Coaches DataFrame:


Unnamed: 0,Name,NOC,Discipline,Event
0,ABDELMAGID Wael,Egypt,Football,
1,ABE Junya,Japan,Volleyball,
2,ABE Katsuhiko,Japan,Basketball,
3,ADAMA Cherif,Côte d'Ivoire,Football,
4,AGEBA Yuya,Japan,Volleyball,


In [6]:
print("Entries Gender DataFrame:")
entries_gender_df.head()

Entries Gender DataFrame:


Unnamed: 0,Discipline,Female,Male,Total
0,3x3 Basketball,32,32,64
1,Archery,64,64,128
2,Artistic Gymnastics,98,98,196
3,Artistic Swimming,105,0,105
4,Athletics,969,1072,2041


In [7]:
print("Medals DataFrame:")
medals_df.head()

Medals DataFrame:


Unnamed: 0,Rank,Team/NOC,Gold,Silver,Bronze,Total,Rank by Total
0,1,United States of America,39,41,33,113,1
1,2,People's Republic of China,38,32,18,88,2
2,3,Japan,27,14,17,58,5
3,4,Great Britain,22,21,22,65,4
4,5,ROC,20,28,23,71,3


In [8]:
print("Teams DataFrame:")
teams_df.head()

Teams DataFrame:


Unnamed: 0,Name,Discipline,NOC,Event
0,Belgium,3x3 Basketball,Belgium,Men
1,China,3x3 Basketball,People's Republic of China,Men
2,China,3x3 Basketball,People's Republic of China,Women
3,France,3x3 Basketball,France,Women
4,Italy,3x3 Basketball,Italy,Women


In [9]:
print("Events DataFrame:")
events_df.head()

Events DataFrame:


Unnamed: 0,event,tag,sport,sport_code,sport_url
0,Men's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
1,Women's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
2,Men's Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
3,Women's Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
4,Mixed Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery


##### If we look at the Coaches dataframe, we can see that there are multiple NaN values indicating no coaches for Teams. This can be the case for other datasets too, where Missing Values are not present in the top 5 results, but are present in the dataset. Let's confirm how many values are missing in this datasets

In [10]:
def check_missing(df):
    """Function to check missing values in the given dataframe"""
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    missing_values = missing_values.sort_values(ascending=False)
    return missing_values

In [11]:
print(check_missing(athletes_df))
print(check_missing(coaches_df))
print(check_missing(entries_gender_df))
print(check_missing(medals_df))
print(check_missing(teams_df))
print(check_missing(events_df))

Series([], dtype: int64)
Event    145
dtype: int64
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)


##### By this way we have confirmed that other datasets do not contains any missing values and the data provided is clean, Only Coaches dataframe has NaN Values.

### ---------------------------------------------- ANALYZING TIME --------------------------------------------------

### 0. Total no. of athletes, coaches, countries, sports joining

In [12]:
total_athletes = athletes_df['Name'].nunique()
total_coaches = coaches_df['Name'].nunique()
total_countries = athletes_df['NOC'].nunique()
total_sports = athletes_df['Discipline'].nunique()

print(f"Total number of athletes: {total_athletes}")
print(f"Total number of coaches: {total_coaches}")
print(f"Total number of countries: {total_countries}")
print(f"Total number of sports: {total_sports}")


Total number of athletes: 11062
Total number of coaches: 381
Total number of countries: 206
Total number of sports: 46


---

### 1. Number of athletes & coaches in each country

In [13]:
import plotly.express as px

athletes_per_country = athletes_df['NOC'].value_counts().reset_index()
print(athletes_per_country.head(10))

fig = px.bar(athletes_per_country.head(40), x='NOC', y='count', title='Highest Number of Athletes Participating')
fig.update_layout(xaxis_title='Country', yaxis_title='Number of Athletes')
fig.show()

fig = px.bar(athletes_per_country.tail(40), x='NOC', y='count', title='Least Number of Athletes Participating per Country')
fig.update_layout(xaxis_title='Country', yaxis_title='Number of Athletes')
fig.show()

                          NOC  count
0    United States of America    615
1                       Japan    586
2                   Australia    470
3  People's Republic of China    401
4                     Germany    400
5                      France    377
6                      Canada    368
7               Great Britain    366
8                       Italy    356
9                       Spain    324


In [14]:
coaches_per_country = coaches_df['NOC'].value_counts().reset_index()
coaches_per_country.columns = ['Country', 'Number of Coaches']

# Sorting the dataframe to get both highest and lowest
coaches_per_country_sorted = coaches_per_country.sort_values(by='Number of Coaches', ascending=False)

# Selecting the top and bottom countries
top_countries = coaches_per_country_sorted.head(10)
bottom_countries = coaches_per_country_sorted.tail(10)

# Creating a bar chart for the top countries
fig_top = px.bar(top_countries, x='Country', y='Number of Coaches', 
                 title='Top 10 Countries by Number of Coaches',
                 labels={'Number of Coaches': 'Number of Coaches'}, text='Number of Coaches')
fig_top.update_traces(texttemplate='%{text}', textposition='outside')
fig_top.update_layout(yaxis_range=[0, top_countries['Number of Coaches'].max() + 2])
fig_top.show()

# Creating a bar chart for the bottom countries
fig_bottom = px.bar(bottom_countries, x='Country', y='Number of Coaches', 
                    title='Bottom 10 Countries by Number of Coaches',
                    labels={'Number of Coaches': 'Number of Coaches'}, text='Number of Coaches')
fig_bottom.update_traces(texttemplate='%{text}', textposition='outside')
fig_bottom.update_layout(yaxis_range=[0, bottom_countries['Number of Coaches'].max() + 2])
fig_bottom.show()

##### ->Insight Gained: USA has the highest number of participants, 2nd comes Japan and 3rd Austraila. While in contrast Myanmar, Dominica, Tuvalu have the least number of participants.
##### -> Insight Gained: Japan leading the chart with most no. of Coaches, followed by USA on 2nd no. and Spain on 3rd. While in contrast Liechtenstien, Croatia, Slovakia Having the least no. of Coaches for Teams.
---

### 2. Number of Athletes & Coaches in Each Discipline

In [15]:
# 1. Count the number of athletes in each discipline
discipline_counts = athletes_df['Discipline'].value_counts().reset_index()
discipline_counts.columns = ['Discipline', 'Number of Athletes']

# 2. Identify the discipline with the most and least athletes
most_athletes_discipline = discipline_counts.loc[discipline_counts['Number of Athletes'].idxmax()]
least_athletes_discipline = discipline_counts.loc[discipline_counts['Number of Athletes'].idxmin()]

print(f"Discipline with the most athletes: {most_athletes_discipline['Discipline']} with {most_athletes_discipline['Number of Athletes']} athletes")
print(f"Discipline with the least athletes: {least_athletes_discipline['Discipline']} with {least_athletes_discipline['Number of Athletes']} athletes")

# 3. Visualize the data using Plotly
fig = px.bar(discipline_counts, x='Discipline', y='Number of Athletes', title='Number of Athletes in Each Discipline')
fig.update_layout(xaxis_title='Discipline', yaxis_title='Number of Athletes')
fig.show()


# 1. Count the number of coaches in each discipline
discipline_coaches_counts = coaches_df['Discipline'].value_counts().reset_index()
discipline_coaches_counts.columns = ['Discipline', 'Number of Coaches']

# 2. Identify the discipline with the most and least coaches
most_coaches_discipline = discipline_coaches_counts.loc[discipline_coaches_counts['Number of Coaches'].idxmax()]
least_coaches_discipline = discipline_coaches_counts.loc[discipline_coaches_counts['Number of Coaches'].idxmin()]

print(f"Discipline with the most coaches: {most_coaches_discipline['Discipline']} with {most_coaches_discipline['Number of Coaches']} coaches")
print(f"Discipline with the least coaches: {least_coaches_discipline['Discipline']} with {least_coaches_discipline['Number of Coaches']} coaches")

# 3. Visualize the data using Plotly
fig = px.bar(discipline_coaches_counts, x='Discipline', y='Number of Coaches', title='Number of Coaches in Each Discipline')
fig.update_layout(xaxis_title='Discipline', yaxis_title='Number of Coaches')
fig.show()

Discipline with the most athletes: Athletics with 2068 athletes
Discipline with the least athletes: Cycling BMX Freestyle with 19 athletes


Discipline with the most coaches: Basketball with 74 coaches
Discipline with the least coaches: Water Polo with 22 coaches


##### ->Insight Gained: Athletics has the highest number of participants, 2nd comes Swimming and 3rd Football. While in contrast Sport Climbing, Trampoline Gymnastics, Cyding BMX have the least number of participants.

##### -> Insight Gained: Basketball leading the chart with most no. of Coaches, followed by Artistic Swimming on 2nd no. and Football on 3rd. While in contrast Baseball, Rubgy, Water Polo Having the least no. of Coaches for Teams.
---

### 3. How many athletes per coach

In [16]:
# Calculate the number of athletes and coaches per country
athletes_per_country = athletes_df['NOC'].value_counts().reset_index()
athletes_per_country.columns = ['Country', 'Number of Athletes']

coaches_per_country = coaches_df['NOC'].value_counts().reset_index()
coaches_per_country.columns = ['Country', 'Number of Coaches']

# Merge the dataframes to compute the ratio
ratio_df = pd.merge(athletes_per_country, coaches_per_country, on='Country', how='outer').fillna(0)
ratio_df['Athletes per Coach'] = ratio_df['Number of Coaches'] / ratio_df['Number of Athletes']

# Replace infinite values with NaN
ratio_df['Athletes per Coach'].replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values in the 'Athletes per Coach' column
ratio_df.dropna(subset=['Athletes per Coach'], inplace=True)

# Identify the country with the highest and lowest ratio of athletes to coaches
highest_ratio_country = ratio_df.loc[ratio_df['Athletes per Coach'].idxmax()]
lowest_ratio_country = ratio_df.loc[ratio_df['Athletes per Coach'].idxmin()]

print(f"Country with the highest athletes per coach ratio: {highest_ratio_country['Country']} with a ratio of {highest_ratio_country['Athletes per Coach']:.2f}")
print(f"Country with the lowest athletes per coach ratio: {lowest_ratio_country['Country']} with a ratio of {lowest_ratio_country['Athletes per Coach']:.2f}")

Country with the highest athletes per coach ratio: San Marino with a ratio of 0.50
Country with the lowest athletes per coach ratio: Afghanistan with a ratio of 0.00


### 4. Countries winning Medals

In [17]:
import plotly.express as px

# Calculate the total number of medals won by each country
medals_per_country = medals_df[['Team/NOC', 'Total']]
medals_per_country.columns = ['Country', 'Total Medals']

# Sort the dataframe by 'Total Medals' for better visualization
medals_per_country_sorted = medals_per_country.sort_values(by='Total Medals', ascending=False)

# Creating a bar chart for all countries that have won medals
fig = px.bar(medals_per_country_sorted, x='Country', y='Total Medals', 
             title='Countries Winning Medals',
             labels={'Total Medals': 'Total Medals'}, text='Total Medals')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(xaxis_title='Country', yaxis_title='Total Medals', 
                  yaxis_range=[0, medals_per_country_sorted['Total Medals'].max() + 10])
fig.show()

In [18]:
# Calculate the total number of gold medals won by each country
gold_medals_per_country = medals_df[['Team/NOC', 'Gold']]
gold_medals_per_country.columns = ['Country', 'Gold Medals']

# Sort the dataframe by 'Gold Medals' for better visualization
gold_medals_per_country_sorted = gold_medals_per_country.sort_values(by='Gold Medals', ascending=False)

# Creating a bar chart for all countries that have won gold medals
fig_gold = px.bar(gold_medals_per_country_sorted, x='Country', y='Gold Medals', 
                  title='Countries Winning Gold Medals',
                  labels={'Gold Medals': 'Gold Medals'}, text='Gold Medals')
fig_gold.update_traces(texttemplate='%{text}', textposition='outside')
fig_gold.update_layout(xaxis_title='Country', yaxis_title='Gold Medals', 
                       yaxis_range=[0, gold_medals_per_country_sorted['Gold Medals'].max() + 5])
fig_gold.show()

In [19]:
# Calculate the total number of silver medals won by each country
silver_medals_per_country = medals_df[['Team/NOC', 'Silver']]
silver_medals_per_country.columns = ['Country', 'Silver Medals']

# Sort the dataframe by 'Silver Medals' for better visualization
silver_medals_per_country_sorted = silver_medals_per_country.sort_values(by='Silver Medals', ascending=False)

# Creating a bar chart for all countries that have won silver medals
fig_silver = px.bar(silver_medals_per_country_sorted, x='Country', y='Silver Medals', 
                    title='Countries Winning Silver Medals',
                    labels={'Silver Medals': 'Silver Medals'}, text='Silver Medals')
fig_silver.update_traces(texttemplate='%{text}', textposition='outside')
fig_silver.update_layout(xaxis_title='Country', yaxis_title='Silver Medals', 
                         yaxis_range=[0, silver_medals_per_country_sorted['Silver Medals'].max() + 5])
fig_silver.show()

In [20]:
# Calculate the total number of bronze medals won by each country
bronze_medals_per_country = medals_df[['Team/NOC', 'Bronze']]
bronze_medals_per_country.columns = ['Country', 'Bronze Medals']

# Sort the dataframe by 'Bronze Medals' for better visualization
bronze_medals_per_country_sorted = bronze_medals_per_country.sort_values(by='Bronze Medals', ascending=False)

# Creating a bar chart for all countries that have won bronze medals
fig_bronze = px.bar(bronze_medals_per_country_sorted, x='Country', y='Bronze Medals', 
                    title='Countries Winning Bronze Medals',
                    labels={'Bronze Medals': 'Bronze Medals'}, text='Bronze Medals')
fig_bronze.update_traces(texttemplate='%{text}', textposition='outside')
fig_bronze.update_layout(xaxis_title='Country', yaxis_title='Bronze Medals', 
                         yaxis_range=[0, bronze_medals_per_country_sorted['Bronze Medals'].max() + 5])
fig_bronze.show()

---
### 5. Percentage of medals earned by top countries

In [21]:
# Calculate the total number of medals won by each country
medals_per_country = medals_df[['Team/NOC', 'Total']]
medals_per_country.columns = ['Country', 'Total Medals']

# Calculate the percentage of total medals earned by each country
total_medals = medals_per_country['Total Medals'].sum()
medals_per_country['Percentage of Total Medals'] = (medals_per_country['Total Medals'] / total_medals) * 100

# Sort the dataframe by 'Percentage of Total Medals' for better visualization
medals_per_country_sorted = medals_per_country.sort_values(by='Percentage of Total Medals', ascending=False)

# Creating a bar chart for the top countries by percentage of total medals
fig = px.bar(medals_per_country_sorted.head(10), x='Country', y='Percentage of Total Medals', 
             title='Percentage of Total Medals Earned by Top Countries',
             labels={'Percentage of Total Medals': 'Percentage of Total Medals'}, text='Percentage of Total Medals')
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(xaxis_title='Country', yaxis_title='Percentage of Total Medals', 
                  yaxis_range=[0, medals_per_country_sorted['Percentage of Total Medals'].max() + 5])
fig.show()

---
### 6. Top Disciplines with gender participation

In [22]:
# Use the EntriesGender dataframe to get the number of male and female athletes in each discipline
gender_participation = entries_gender_df[['Discipline', 'Male', 'Female']]

# Calculate the total participation in each discipline
gender_participation['Total'] = gender_participation['Male'] + gender_participation['Female']

# Sort the dataframe by 'Total' for better visualization
gender_participation_sorted = gender_participation.sort_values(by='Total', ascending=False)

# Creating a bar chart for the top games with gender participation
fig = px.bar(gender_participation_sorted.head(10), x='Discipline', y=['Male', 'Female'], 
             title='Top Games with Gender Participation',
             labels={'value': 'Number of Athletes', 'variable': 'Gender'}, 
             barmode='group', text_auto=True)
fig.update_layout(xaxis_title='Discipline', yaxis_title='Number of Athletes')
fig.show()

---
### 7. Teams vs Disciplines

In [23]:
# Calculate the number of teams in each discipline
teams_per_discipline = teams_df['Discipline'].value_counts().reset_index()
teams_per_discipline.columns = ['Discipline', 'Number of Teams']

# Sort the dataframe by 'Number of Teams' for better visualization
teams_per_discipline_sorted = teams_per_discipline.sort_values(by='Number of Teams', ascending=False)

# Creating a bar chart for the number of teams in each discipline
fig = px.bar(teams_per_discipline_sorted, x='Discipline', y='Number of Teams', 
             title='Number of Teams in Each Discipline',
             labels={'Number of Teams': 'Number of Teams'}, text='Number of Teams')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(xaxis_title='Discipline', yaxis_title='Number of Teams', 
                  yaxis_range=[0, teams_per_discipline_sorted['Number of Teams'].max() + 5])
fig.show()

---
### 8. Number of events participated by a sports team

In [24]:
# Calculate the number of events participated by each team
events_per_team = teams_df.groupby('NOC')['Event'].nunique().reset_index()
events_per_team.columns = ['Team', 'Number of Events']

# Sort the dataframe by 'Number of Events' for better visualization
events_per_team_sorted = events_per_team.sort_values(by='Number of Events', ascending=False)

# Creating a bar chart for the number of events participated by each team
fig = px.bar(events_per_team_sorted, x='Team', y='Number of Events', 
             title='Number of Events Participated by Each Sports Team',
             labels={'Number of Events': 'Number of Events'}, text='Number of Events')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(xaxis_title='Team', yaxis_title='Number of Events', 
                  yaxis_range=[0, events_per_team_sorted['Number of Events'].max() + 5])
fig.show()

---

### 9. Does every Athlete belong to a team?

In [25]:
# Check if every athlete belongs to a team
athletes_with_team = athletes_df['NOC'].notna().all()

print(f"Does every Athlete belong to a team? {'Yes' if athletes_with_team else 'No'}")

Does every Athlete belong to a team? Yes


---
### 10. Team has coach?

In [26]:
# Check if each team has at least one coach
teams_with_coach = coaches_df['NOC'].value_counts().reset_index()
teams_with_coach.columns = ['Team', 'Number of Coaches']

# Merge with the list of teams to see if any team is missing a coach
all_teams = athletes_df['NOC'].unique()
teams_with_coach_dict = dict(zip(teams_with_coach['Team'], teams_with_coach['Number of Coaches']))

teams_missing_coach = [team for team in all_teams if team not in teams_with_coach_dict]

print(f"Teams missing a coach: {teams_missing_coach}")

Teams missing a coach: ['Sudan', 'Azerbaijan', 'Qatar', 'Malaysia', 'Singapore', 'Maldives', 'Uzbekistan', 'Indonesia', 'Ethiopia', 'Malta', 'Sri Lanka', 'Morocco', 'Mauritania', 'Libya', 'Nauru', 'Switzerland', 'Guyana', 'Georgia', 'Jordan', 'Palestine', 'Cyprus', 'El Salvador', 'Federated States of Micronesia', 'Lithuania', 'Congo', 'Monaco', 'Rwanda', 'Armenia', 'Samoa', 'Brunei Darussalam', 'Bangladesh', 'Benin', 'Trinidad and Tobago', 'Senegal', 'Algeria', 'Tajikistan', 'Pakistan', 'Kyrgyzstan', 'Latvia', 'Oman', 'Kuwait', 'Iraq', 'Refugee Olympic Team', 'Yemen', 'Cuba', 'Niger', 'Djibouti', 'Somalia', 'Bulgaria', 'Bermuda', 'Jamaica', 'Estonia', 'United Arab Emirates', 'Paraguay', 'Costa Rica', 'Cape Verde', 'Uganda', 'Peru', 'Ghana', 'Ecuador', 'Botswana', 'Bahamas', 'Philippines', 'Guam', 'Madagascar', 'Haiti', 'Afghanistan', 'Uruguay', 'Panama', 'Finland', 'Cameroon', 'Syrian Arab Republic', 'Turkmenistan', 'Hong Kong, China', 'Togo', 'Seychelles', 'Mongolia', 'Guinea', 'Nicar

---
### 11. Distribution of Event Types

In [27]:
# Calculate the distribution of event types
event_type_distribution = teams_df['Event'].value_counts().reset_index()
event_type_distribution.columns = ['Event Type', 'Number of Occurrences']

# Creating a bar chart for the distribution of event types
fig = px.bar(event_type_distribution, x='Event Type', y='Number of Occurrences', 
             title='Distribution of Event Types',
             labels={'Number of Occurrences': 'Number of Occurrences'}, text='Number of Occurrences')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(xaxis_title='Event Type', yaxis_title='Number of Occurrences', 
                  yaxis_range=[0, event_type_distribution['Number of Occurrences'].max() + 5])
fig.show()

In [28]:
import pandas as pd

# Ensure columns are as expected
print("EntriesGender DataFrame columns:", entries_gender_df.columns)
print("Medals DataFrame columns:", medals_df.columns)

# Analysis 1: Number of Coaches vs. Number of Medals
coaches_count = coaches_df.groupby('NOC').size().reset_index(name='Num_Coaches')
medals_coaches = pd.merge(medals_df, coaches_count, left_on='Team/NOC', right_on='NOC', how='left')
coaches_medals_corr = medals_coaches[['Num_Coaches', 'Total']].corr()

# Analysis 2: Number of Participants vs. Number of Medals
participants_count = athletes_df.groupby('NOC').size().reset_index(name='Num_Participants')
medals_participants = pd.merge(medals_df, participants_count, left_on='Team/NOC', right_on='NOC', how='left')
participants_medals_corr = medals_participants[['Num_Participants', 'Total']].corr()

# Analysis 3: Gender Distribution vs. Number of Medals
# Extract unique NOC and Discipline from 'athletes_df'
gender_noc = athletes_df[['NOC', 'Discipline']].drop_duplicates()
gender_noc = pd.merge(gender_noc, entries_gender_df, on='Discipline', how='left')

# Aggregate by NOC to get total Female, Male and Total entries
gender_noc_agg = gender_noc.groupby('NOC').sum().reset_index()
gender_medals = pd.merge(medals_df, gender_noc_agg, left_on='Team/NOC', right_on='NOC', how='left')
gender_medals_corr = gender_medals[['Female', 'Male', 'Total_x', 'Total_y']].corr()

# Analysis 4: Event Participation vs. Medal Wins
event_count = teams_df.groupby('NOC').size().reset_index(name='Num_Events')
medals_events = pd.merge(medals_df, event_count, left_on='Team/NOC', right_on='NOC', how='left')
events_medals_corr = medals_events[['Num_Events', 'Total']].corr()

# Display correlations
print("Correlation between Number of Coaches and Medals:")
print(coaches_medals_corr)

print("\nCorrelation between Number of Participants and Medals:")
print(participants_medals_corr)

print("\nCorrelation between Gender Distribution and Medals:")
print(gender_medals_corr)

print("\nCorrelation between Event Participation and Medals:")
print(events_medals_corr)


EntriesGender DataFrame columns: Index(['Discipline', 'Female', 'Male', 'Total'], dtype='object')
Medals DataFrame columns: Index(['Rank', 'Team/NOC', 'Gold', 'Silver', 'Bronze', 'Total',
       'Rank by Total'],
      dtype='object')
Correlation between Number of Coaches and Medals:
             Num_Coaches     Total
Num_Coaches     1.000000  0.639795
Total           0.639795  1.000000

Correlation between Number of Participants and Medals:
                  Num_Participants     Total
Num_Participants          1.000000  0.875115
Total                     0.875115  1.000000

Correlation between Gender Distribution and Medals:
           Female      Male   Total_x   Total_y
Female   1.000000  0.997727  0.686650  0.999390
Male     0.997727  1.000000  0.672092  0.999472
Total_x  0.686650  0.672092  1.000000  0.679495
Total_y  0.999390  0.999472  0.679495  1.000000

Correlation between Event Participation and Medals:
            Num_Events    Total
Num_Events     1.00000  0.86864
Total    

### Interpretation: There is a moderate positive correlation (0.64) between the number of coaches and the total number of medals won by a country. This suggests that having more coaches is associated with winning more medals, though other factors are also at play.

### Interpretation: There is a strong positive correlation (0.88) between the number of participants and the total number of medals won. This indicates that countries with more participants tend to win more medals.

# Based upon the above observations, we can conclude that "The higher the no. of participants the higher chances of getting medals" and also "coaches play a vital role in winning games, so more the no. of coaches in a team the more the chances of winning the medals". Based upon the data, my calculations can be inaccurate but based on them I belive USA, Japan, & France have the highest chance of winning Paris Oympics 2024