## 🧰 Preparing tool box

In [1]:
import pandas as pd
import numpy as np
import plotly.offline as pyo
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Enable Plotly rendering inside the notebook.
# NOTE(review): per the Plotly docs, `connected=True` loads plotly.js from a CDN
# instead of embedding it in the notebook, so rendering needs internet access.
pyo.init_notebook_mode(connected=True)

# Use the "simple_white" template as the default for every figure created below
pio.templates.default = "simple_white"

In [None]:
# Load the raw dataset into `df` (path is relative to the notebook's working directory)
df = pd.read_csv("data/athlete_events.csv")

## 🇧🇷 Brazil at the Olympics

In [None]:
# Independent copy of the rows whose NOC code is "BRA"
df_brazil = df.loc[df["NOC"].eq("BRA")].copy()

3848 lines


**2. Is there any correlation between an athlete's height and his/her performance for any sports? Analyze it by gender.**

You may choose sports where Brazil has at least 5 medalists and 5 non-medalists.

In [5]:
# Count the non-null values of every column, per sport
df_brazil_grouped = df_brazil.groupby(by="Sport", as_index=False).count()

# "Medal" is only filled for podium finishes, so its count is the number of
# medal-winning entries. "ID" counts every entry (assumes ID is never null —
# TODO confirm). Counts are athlete-event rows, not unique athletes.
medal_entries = df_brazil_grouped["Medal"]
non_medal_entries = df_brazil_grouped["ID"] - medal_entries

# Selection criteria: at least 5 medalists AND at least 5 non-medalists.
# (Requiring only ">= 10 entries" would let through a sport with, say,
# 8 medal entries and just 2 non-medal entries.)
medals_filter = medal_entries >= 5
non_medalists_filter = non_medal_entries >= 5

# Sports satisfying both criteria
sports_list = df_brazil_grouped.loc[
    medals_filter & non_medalists_filter, "Sport"
].to_numpy()

print(sports_list)

['Athletics' 'Basketball' 'Beach Volleyball' 'Boxing' 'Equestrianism'
 'Football' 'Judo' 'Sailing' 'Shooting' 'Swimming' 'Volleyball']


From the list above, **Boxing** and **Judo** will not be considered. These sports are divided into weight categories (which may impact the height difference between those categories), and none of the categories has earned more than 5 medals.

**Athletics** will also not be considered, for its high variability between events.

Sports **Basketball**, **Football**, and **Volleyball** will also be analyzed per Team Height.

### 📝 Defining plotting standards

In [6]:
# Dataframe
# Work on a copy so the medal relabelling below never touches df_brazil
df_brazil_for_plotting = df_brazil.copy()

# Rows without a medal have NaN in "Medal"; give them an explicit category.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form does not update the frame under
# pandas Copy-on-Write.
df_brazil_for_plotting["Medal"] = df_brazil_for_plotting["Medal"].fillna("No medal")

# Team-level frame: mean of the numeric columns for each
# Year / City / Sex / Medal / Sport / Event combination.
# numeric_only=True is needed on pandas >= 2.0, where text columns such as
# "Name" would otherwise make .mean() raise.
df_brazil_for_plotting_grouped = df_brazil_for_plotting.groupby(
    ["Year", "City", "Sex", "Medal", "Sport", "Event"], as_index=False
).mean(numeric_only=True)

# Defining plots colors and categories orders
color_info = {
    "No medal": "seagreen",
    "Bronze": "lightsalmon",
    "Silver": "grey",
    "Gold": "gold",
}

category_orders = {"Medal": ["No medal", "Bronze", "Silver", "Gold"], "Sex": ["M", "F"]}

# Defining Hover
# hover_data mappings: every column of the frame is a key, and the value is
# True only for the columns that should be shown in the hover box.

# For individuals
athletes_chosen_columns = ("Height", "Name", "City", "Year", "Event")

athletes_chosen_columns_for_hover = {
    column: column in athletes_chosen_columns
    for column in df_brazil_for_plotting.columns
}

# For teams
teams_chosen_columns = ("Height", "City", "Year", "Event")

teams_chosen_columns_for_hover = {
    column: column in teams_chosen_columns
    for column in df_brazil_for_plotting_grouped.columns
}

### 🏀 Basketball

In [7]:
# Sport analyzed in this section; the following cells compare it against the "Sport" column
analyzed_sport = "Basketball"

#### Individual analysis

In [8]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="🏀 Athlete Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=athletes_chosen_columns_for_hover,
    )
    # Custom hover box.
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # dataset's column order changes.
    .update_traces(
        hovertemplate=(
            "%{customdata[1]}<br>"
            "Height: %{y}cm.<br>"
            "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
        )
    )
    .update_layout(
        yaxis={"tickfont_size": 10, "title": {"font_size": 12, "text": "Height (cm)"}}
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [9]:
# Render the figure built in the previous cell
fig.show()

#### Team Analysis

In [10]:
# Restrict the team-averaged frame to the sport selected above
df_brazil_for_sport_plotting_grouped = df_brazil_for_plotting_grouped.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting_grouped,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="🏀 Team Average Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=teams_chosen_columns_for_hover,
    )
    # Custom hover box; the average height is shown without decimals (":.0f").
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # grouped frame's column order changes.
    .update_traces(
        hovertemplate=(
            "Olympic Games: %{customdata[1]}, %{customdata[0]}.<br>"
            "Average Height: %{y:.0f}cm.<br><extra></extra>"
        )
    )
    .update_layout(
        yaxis={
            "tickfont_size": 10,
            "title": {"font_size": 12, "text": "Average Height (cm)"},
        }
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [11]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

The male visualization may indicate that medalists were, in general, shorter (which is a bit surprising). Also, no Brazilian athlete taller than 2m has ever earned a medal in Basketball.

By comparing the teams' average height, we also observe that shorter teams earned more medals - this is inverted for the females.

### 🏖 Beach Volleyball

In [12]:
# Sport analyzed in this section; the following cell compares it against the "Sport" column
analyzed_sport = "Beach Volleyball"

In [13]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="🏖 Athlete Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=athletes_chosen_columns_for_hover,
    )
    # Custom hover box.
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # dataset's column order changes.
    .update_traces(
        hovertemplate=(
            "%{customdata[1]}<br>"
            "Height: %{y}cm.<br>"
            "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
        )
    )
    .update_layout(
        yaxis={"tickfont_size": 10, "title": {"font_size": 12, "text": "Height (cm)"}}
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [14]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

No correlation was observed.

### 🏇🏻 Equestrianism

In [15]:
# Sport analyzed in this section; the following cell compares it against the "Sport" column
analyzed_sport = "Equestrianism"

In [16]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting[
    df_brazil_for_plotting["Sport"] == analyzed_sport
]

# Creating figure (male athletes only, hence no facet by "Sex")
fig = px.strip(
    data_frame=df_brazil_for_sport_plotting[df_brazil_for_sport_plotting["Sex"] == "M"],
    x="Medal",
    y="Height",
    color="Medal",
    hover_data=athletes_chosen_columns_for_hover,
    category_orders=category_orders,
    color_discrete_map=color_info,
    title="🏇 Athlete Height (Male) x Performance (Medal)",
)

# Custom hover box.
# NOTE(review): customdata[...] indices are positional; re-check them if the
# dataset's column order changes.
fig.update_traces(
    hovertemplate="%{customdata[1]}<br>"
    "Height: %{y}cm.<br>"
    "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
)

fig.update_layout(
    yaxis=dict(tickfont_size=10, title=dict(font_size=12, text="Height (cm)"))
)

fig.update_xaxes(
    matches=None, tickfont_size=10, title=dict(font_size=12, text="Performance (Medal)")
)

# The facet-title relabelling used in the other sections is intentionally
# absent: without facet_col the figure has no facet annotations to rename.

# Footnote explaining why no female panel is shown
fig.add_annotation(
    text="<i>Warning: There were no female medalists.</i>",
    font_color="grey",
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.2,
    showarrow=False,
);

In [17]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

No correlation was observed.

### ⚽️ Football

In [18]:
# Sport analyzed in this section; the following cells compare it against the "Sport" column
analyzed_sport = "Football"

#### Individual Analysis

In [19]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="⚽️ Athlete Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=athletes_chosen_columns_for_hover,
    )
    # Custom hover box.
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # dataset's column order changes.
    .update_traces(
        hovertemplate=(
            "%{customdata[1]}<br>"
            "Height: %{y}cm.<br>"
            "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
        )
    )
    .update_layout(
        yaxis={"tickfont_size": 10, "title": {"font_size": 12, "text": "Height (cm)"}}
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [20]:
# Render the figure built in the previous cell
fig.show()

#### Team Analysis

In [21]:
# Restrict the team-averaged frame to the sport selected above
df_brazil_for_sport_plotting_grouped = df_brazil_for_plotting_grouped.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting_grouped,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="⚽️ Team Average Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=teams_chosen_columns_for_hover,
    )
    # Custom hover box; the average height is shown without decimals (":.0f").
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # grouped frame's column order changes.
    .update_traces(
        hovertemplate=(
            "Olympic Games: %{customdata[1]}, %{customdata[0]}.<br>"
            "Average Height: %{y:.0f}cm.<br><extra></extra>"
        )
    )
    .update_layout(
        yaxis={
            "tickfont_size": 10,
            "title": {"font_size": 12, "text": "Average Height (cm)"},
        }
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [22]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

The visualization may indicate a slight tendency for taller teams to perform better (earn medals). This is reinforced in the Team visualization, where the medals are more concentrated in the upper part of the Y axis.

### ⛵️ Sailing

In [23]:
# Sport analyzed in this section; the following cell compares it against the "Sport" column
analyzed_sport = "Sailing"

In [24]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="⛵️ Athlete Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=athletes_chosen_columns_for_hover,
    )
    # Custom hover box.
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # dataset's column order changes.
    .update_traces(
        hovertemplate=(
            "%{customdata[1]}<br>"
            "Height: %{y}cm.<br>"
            "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
        )
    )
    .update_layout(
        yaxis={"tickfont_size": 10, "title": {"font_size": 12, "text": "Height (cm)"}}
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [25]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

No correlation was observed.

### 🔫 Shooting

In [26]:
# Sport analyzed in this section; the following cell compares it against the "Sport" column
analyzed_sport = "Shooting"

In [27]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting[
    df_brazil_for_plotting["Sport"] == analyzed_sport
]

# Creating figure (male athletes only, hence no facet by "Sex")
fig = px.strip(
    data_frame=df_brazil_for_sport_plotting[df_brazil_for_sport_plotting["Sex"] == "M"],
    x="Medal",
    y="Height",
    color="Medal",
    hover_data=athletes_chosen_columns_for_hover,
    category_orders=category_orders,
    color_discrete_map=color_info,
    title="🔫 Athlete Height (Male) x Performance (Medal)",
)

# Custom hover box.
# NOTE(review): customdata[...] indices are positional; re-check them if the
# dataset's column order changes.
fig.update_traces(
    hovertemplate="%{customdata[1]}<br>"
    "Height: %{y}cm.<br>"
    "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
)

fig.update_layout(
    yaxis=dict(tickfont_size=10, title=dict(font_size=12, text="Height (cm)"))
)

fig.update_xaxes(
    matches=None, tickfont_size=10, title=dict(font_size=12, text="Performance (Medal)")
)

# The facet-title relabelling used in the other sections is intentionally
# absent: without facet_col the figure has no facet annotations to rename.

# Footnote explaining why no female panel is shown
fig.add_annotation(
    text="<i>Warning: There were no female medalists.</i>",
    font_color="grey",
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.2,
    showarrow=False,
);

In [28]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

No correlation was observed.

### 🏊‍♂️ Swimming

In [29]:
# Sport analyzed in this section; the following cell compares it against the "Sport" column
analyzed_sport = "Swimming"

In [30]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting[
    df_brazil_for_plotting["Sport"] == analyzed_sport
]

# Creating figure
fig = px.strip(
    data_frame=df_brazil_for_sport_plotting,
    x="Medal",
    y="Height",
    color="Medal",
    facet_col="Sex",
    hover_data=athletes_chosen_columns_for_hover,
    category_orders=category_orders,
    color_discrete_map=color_info,
    title="🏊‍♂️ Athlete Height x Performance (Medal)",
)

# Custom hover box. The label is capitalised as "Height:" so it matches the
# hover text of the other sports' figures.
# NOTE(review): customdata[...] indices are positional; re-check them if the
# dataset's column order changes.
fig.update_traces(
    hovertemplate="%{customdata[1]}<br>"
    "Height: %{y}cm.<br>"
    "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
)

fig.update_layout(
    yaxis=dict(tickfont_size=10, title=dict(font_size=12, text="Height (cm)"))
)

# matches=None unlinks the x axes of the facets
fig.update_xaxes(
    matches=None, tickfont_size=10, title=dict(font_size=12, text="Performance (Medal)")
)

# Replace each facet title: the text after "=" is the Sex value
fig.for_each_annotation(
    lambda annotation: (
        annotation.update(text="👨 Male", font_size=13)
        if annotation.text.split("=")[-1] == "M"
        else annotation.update(text="🙎‍♀️ Female", font_size=13)
    )
);

In [31]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

The male visualization may indicate a medal concentration in the upper part of the Y axis. Every medalist (but one) was over 1.80m tall.

No correlation was observed in the female visualization.

### 🏐 Volleyball

In [32]:
# Sport analyzed in this section; the following cells compare it against the "Sport" column
analyzed_sport = "Volleyball"

#### Individual analysis

In [33]:
# Restrict the plotting frame to the sport selected above
df_brazil_for_sport_plotting = df_brazil_for_plotting.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="🏐 Athlete Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=athletes_chosen_columns_for_hover,
    )
    # Custom hover box.
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # dataset's column order changes.
    .update_traces(
        hovertemplate=(
            "%{customdata[1]}<br>"
            "Height: %{y}cm.<br>"
            "Competing at: %{customdata[10]}, %{customdata[8]}.<extra></extra>"
        )
    )
    .update_layout(
        yaxis={"tickfont_size": 10, "title": {"font_size": 12, "text": "Height (cm)"}}
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [34]:
# Render the figure built in the previous cell
fig.show()

#### Team Analysis

In [35]:
# Restrict the team-averaged frame to the sport selected above
df_brazil_for_sport_plotting_grouped = df_brazil_for_plotting_grouped.loc[
    lambda frame: frame["Sport"] == analyzed_sport
]

# Build the strip plot and style it in one method chain
# (each Plotly update method returns the figure itself)
fig = (
    px.strip(
        data_frame=df_brazil_for_sport_plotting_grouped,
        x="Medal",
        y="Height",
        color="Medal",
        facet_col="Sex",
        title="🏐 Team Average Height x Performance (Medal)",
        category_orders=category_orders,
        color_discrete_map=color_info,
        hover_data=teams_chosen_columns_for_hover,
    )
    # Custom hover box; the average height is shown without decimals (":.0f").
    # NOTE(review): customdata[...] indices are positional; re-check them if the
    # grouped frame's column order changes.
    .update_traces(
        hovertemplate=(
            "Competing at: %{customdata[1]}, %{customdata[0]}.<br>"
            "Team Average Height: %{y:.0f}cm.<br><extra></extra>"
        )
    )
    .update_layout(
        yaxis={
            "tickfont_size": 10,
            "title": {"font_size": 12, "text": "Average Height (cm)"},
        }
    )
    # matches=None unlinks the x axes of the facets
    .update_xaxes(
        matches=None,
        tickfont_size=10,
        title={"font_size": 12, "text": "Performance (Medal)"},
    )
    # Replace each facet title: the text after "=" is the Sex value
    .for_each_annotation(
        lambda facet_title: facet_title.update(
            text=(
                "👨 Male"
                if facet_title.text.split("=")[-1] == "M"
                else "🙎‍♀️ Female"
            ),
            font_size=13,
        )
    )
)

In [36]:
# Render the figure built in the previous cell
fig.show()

#### Conclusion

The visualization indicates a correlation, especially in the team average comparison, between height and medal performance.

### 📝 Considerations

Each sport has its own dynamics, which are constantly evolving. For more robust conclusions, the athlete's final ranking (not only the medals, but also the 4th, 5th, ..., nth positions) would provide a better basis for analyzing the visualizations.

## 🏅 Medals by Sport

**3. Create a visualization to show, by sport, the proportion of gold, silver, and bronze medals Brazil has earned.**

In [37]:
# Collapse the athlete-level rows into a single record per earned medal
# (Medal / Sport / Event / Year), then count those records per medal and sport.
# groupby leaves out rows whose "Medal" is NaN. After the second count the
# "Event" column holds the number of medals for each Medal / Sport pair.
# No explicit copy is needed: groupby does not modify df_brazil.
df_brazil_medal_plot = (
    df_brazil.groupby(by=["Medal", "Sport", "Event", "Year"], as_index=False)
    .count()
    .groupby(by=["Medal", "Sport"], as_index=False)
    .count()
)

# Make "Medal" an ordered categorical so sorting follows Bronze < Silver < Gold
df_brazil_medal_plot["Medal"] = pd.Categorical(
    values=df_brazil_medal_plot["Medal"],
    categories=("Bronze", "Silver", "Gold"),
    ordered=True,
)

# Gold first, and within each medal the sports with the most medals first
df_brazil_medal_plot = df_brazil_medal_plot.sort_values(
    ["Medal", "Event"], ascending=False
)

# Colors for this chart. A dedicated name is used so the global `color_info`
# (which also carries the "No medal" color used by the strip plots) is not
# overwritten when this cell runs.
medal_colors = {"Bronze": "lightsalmon", "Silver": "grey", "Gold": "gold"}

# Creating figure: horizontal stacked bars, one bar per sport
fig = px.bar(
    data_frame=df_brazil_medal_plot,
    x="Event",
    y="Sport",
    color="Medal",
    color_discrete_map=medal_colors,
    orientation="h",
    text_auto=True,
    title="🏅 Olympic Medals by Sport",
)

# Customizing figure: medal counts centred inside each bar segment
fig.update_traces(textposition="inside", insidetextanchor="middle", textangle=0)

# The counts are printed on the bars, so the x axis and hover are switched off
fig.update_layout(
    yaxis=dict(autorange="reversed", title_text=None, ticks=""),
    xaxis_visible=False,
    hovermode=False,
);

In [38]:
# Render the figure built in the previous cell
fig.show()

## 📈 Medal Time Series

**4. Create a visualization of Brazil's evolution in earned medals through time.**

In [39]:
# Country (NOC code) whose medal history is plotted
chosen_country = "BRA"

# Whether Winter Games rows are kept in the visualization
include_winter = False

# X-axis tick spacing: every 24 months when Winter Games are included,
# every 48 months otherwise (set below)
dtick = "M24"

# Copying dataframe and applying filters
df_time_plot = df[df["NOC"] == chosen_country].copy()

if not include_winter:
    df_time_plot = df_time_plot[df_time_plot["Season"] != "Winter"].copy()
    dtick = "M48"

# Medal classes drawn on the chart, in legend order
medal_columns = ["Gold", "Silver", "Bronze"]

# Collapse to a single record per earned medal (Year / Medal / Event), then
# count those records per Year and Medal; "Event" ends up holding the count.
df_time_plot = df_time_plot.groupby(
    by=["Year", "Medal", "Event"], as_index=False
).count()
df_time_plot = df_time_plot.groupby(by=["Year", "Medal"], as_index=False).count()

# One row per year, one column per medal class. reindex guarantees that all
# three medal columns exist even if the chosen country never won one of them,
# and years without a given medal get 0.
df_time_plot = (
    df_time_plot.pivot(index="Year", columns="Medal", values="Event")
    .reindex(columns=medal_columns)
    .fillna(0)
)

# Creating figure
fig = px.line(
    data_frame=df_time_plot,
    x=df_time_plot.index,
    y=medal_columns,
    color_discrete_map=color_info,
    markers=True,
    title="🇧🇷 Brazil's Medal Timeline",
)

# Customizing figure
fig.update_traces(hovertemplate="Year: %{x|%Y}.<br>" "Medals: %{y}.<br>", marker_size=6)

fig.update_layout(legend_title_text="Medal")

fig.update_yaxes(
    tickfont_size=10, title=dict(font_size=12, text="Medals"), rangemode="tozero"
)

# Use the tick spacing chosen above instead of a hard-coded value, so that
# switching include_winter actually changes the axis.
# NOTE(review): the axis title "Ano" is Portuguese for "Year"; left unchanged.
fig.update_xaxes(
    tickfont_size=10,
    title=dict(font_size=12, text="Ano"),
    type="date",
    dtick=dtick,
    tickangle=45,
);

In [40]:
# Render the figure built in the previous cell
fig.show()