# Women's Semifinals: IFSC World Championships in Bern 2023

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

In [None]:
# Directory holding the input CSV files (relative to the working directory).
DATA_DIR_DATA = Path('data')

# Directory that receives the rendered figures. It is created up front because
# matplotlib's savefig does not create missing parent directories and would
# otherwise fail with FileNotFoundError on a fresh checkout.
OUTPUT_DIR_PATH = Path('output')
OUTPUT_DIR_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the semicolon-separated results table and drop its last line.
# `skipfooter` is not supported by the default C parser, so the python engine
# is requested explicitly; otherwise pandas emits a ParserWarning and falls
# back to the python engine on its own.
df = pd.read_csv(
    DATA_DIR_DATA / "women_data.csv",
    delimiter=";",
    encoding="ISO-8859-1",
    skipfooter=1,
    engine="python",
)

## Height distribution

In [None]:
# Histogram of the athletes' heights in 10 bins; the returned Axes is bound to
# `_` so the notebook does not echo its repr.
_ = sns.histplot(df["height [cm]"], bins=10)

## Scatter matrix of athlete height and total score

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def create_height_total_score_scatter_plot(df: pd.DataFrame, top_n: int = 3) -> None:
    """Draw a height / total-score pair plot and label the best athletes.

    A seaborn pair plot of the columns ``"height [cm]"`` and ``"Total Score"``
    is drawn, coloured by ``"height category"``, with KDE curves on the
    diagonal. The ``top_n`` rows with the largest ``"Total Score"`` are then
    labelled with their ``"name"`` value in both off-diagonal scatter plots.

    The labels are shifted by a small fixed horizontal offset and a random
    vertical jitter (``np.random.uniform``) to reduce overlap, so label
    positions differ between runs unless the caller seeds NumPy's global RNG.

    The pair plot's figure is left as the current pyplot figure, so the caller
    can follow up with ``plt.savefig`` / ``plt.show``.

    Args:
        df: Table with the columns ``"height [cm]"``, ``"Total Score"``,
            ``"height category"`` and ``"name"``.
        top_n: Number of highest-scoring athletes to label. Defaults to 3.
    """
    # sns.pairplot is a figure-level function: it creates its own figure whose
    # size follows from `height`, so no separate plt.figure() call is needed
    # (one would only leave a stray empty figure behind).
    pairplot = sns.pairplot(
        df,
        vars=["height [cm]", "Total Score"],
        hue="height category",
        diag_kind="kde",
        height=3.5,
        plot_kws={'alpha': 0.5},
    )

    # Highest-scoring athletes that get a name label
    top = df.nlargest(top_n, 'Total Score')

    # Label offsets, in data units of the respective axis
    vertical_offset_scale = 2.0  # half-width of the random jitter
    horizontal_offset_constant = 0.5  # small constant shift to the right

    # axes[row][col]: the row selects the y variable, the column the x variable.
    # Only the two off-diagonal scatter plots are annotated; the diagonal holds
    # the KDE plots.
    height_on_y_ax = pairplot.axes[0][1]  # x: Total Score, y: height [cm]
    score_on_y_ax = pairplot.axes[1][0]  # x: height [cm], y: Total Score

    text_style = dict(horizontalalignment='left', size='small', color='black', weight='normal')

    for _, athlete in top.iterrows():
        x_height = athlete['height [cm]'] + horizontal_offset_constant
        y_score = athlete['Total Score'] + np.random.uniform(-vertical_offset_scale, vertical_offset_scale)
        name = athlete['name']

        # Transposed plot: the score is on the x axis, so it gets the
        # horizontal shift and the height gets its own vertical jitter.
        height_on_y_ax.text(
            y_score + horizontal_offset_constant,
            x_height + np.random.uniform(-vertical_offset_scale, vertical_offset_scale),
            name,
            **text_style,
        )

        score_on_y_ax.text(x_height, y_score, name, **text_style)

# Build the annotated pair plot, write the current figure to the output
# directory, then display it.
create_height_total_score_scatter_plot(df)
plt.savefig(OUTPUT_DIR_PATH.joinpath("height_score_correlation.png"))
plt.show()

## Number of Tops per boulder per height category

In [None]:
# Columns holding the per-boulder top results
boulders = ['W1 25', 'W2 25', 'W3 25', 'W4 25']

# One sub-frame per height category
df_tall = df[df['height category'] == 'tall']
df_short = df[df['height category'] == 'short']

# Per-boulder column sums for each category; the boulder labels drop the
# " 25" suffix so the plot shows the short names (W1 ... W4).
tops_sum = {
    'Boulder': [column.replace(" 25", "") for column in boulders],
    'Tall': [df_tall[column].sum() for column in boulders],
    'Short': [df_short[column].sum() for column in boulders],
}

# Tabular form used for plotting
tops_sum_df = pd.DataFrame(tops_sum)

In [None]:
# Grouped bar chart: one group per boulder, one bar per height category
ax = tops_sum_df.plot(
    x='Boulder',
    kind='bar',
    figsize=(10, 6),
    width=0.8,
    rot=0,
)

# Title intentionally left disabled:
#ax.set_title('Total Tops per Boulder by Height Category')
ax.set_xlabel('Boulder')
ax.set_ylabel('Total Tops')

# Integer ticks from 0 to 5 on the y axis
ax.set_yticks(range(0, 6))

# No background grid
ax.grid(False)

ax.legend(title='Height Category')
plt.tight_layout()

plt.savefig(OUTPUT_DIR_PATH / "tops_per_boulder_by_heightcategory.png")
plt.show()