# Women's Semifinals: IFSC World Championships, Bern 2023

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from pandas.plotting import scatter_matrix

In [None]:
# Input folder holding the CSV and output folder receiving the saved figures.
DATA_DIR_DATA = Path('data')
OUTPUT_DIR_PATH = Path('output/data_exploration')

# plt.savefig() does not create missing directories and would fail with
# FileNotFoundError on a fresh checkout, so make sure the folder exists.
OUTPUT_DIR_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the semicolon-separated results table. skipfooter=1 drops the last
# line of the file and is only supported by the pure-Python parser engine.
women_csv_path = DATA_DIR_DATA / "women_data.csv"
df = pd.read_csv(
    women_csv_path,
    sep=";",
    encoding="ISO-8859-1",
    engine="python",
    skipfooter=1,
)

In [None]:
# Display the first rows of the loaded table (DataFrame.head() defaults to 5 rows).
df.head()

In [None]:
def set_matplotlib_rc() -> None:
    """Apply the notebook-wide matplotlib defaults for fonts and axes frames.

    Disables LaTeX text rendering, selects the STIX fonts, uses bold 12 pt
    text with 13 pt legend entries, and draws the axes frame 2 pt wide.
    """
    rc_overrides = {
        'text.usetex': False,
        'font.family': 'stixgeneral',
        'mathtext.fontset': 'stix',
        'font.size': 12,
        'legend.fontsize': 13,
        'font.weight': 'bold',
        'axes.linewidth': 2,
    }
    matplotlib.rcParams.update(rc_overrides)


# Apply the defaults once, before any figure is created.
set_matplotlib_rc()

## Height distribution

In [None]:
def height_distribution_by_category(df):
    """Draw a stacked histogram of athlete heights, coloured by height category.

    A new 8x6 inch figure is created and left as the current figure so the
    caller can save or show it.

    Args:
        df: DataFrame with a "height [cm]" column and a "height category"
            column. The palette only defines colours for the values "short"
            and "tall".
    """
    sns.set_style("white")
    plt.figure(figsize=(8, 6))

    # Bin edges every 2 cm: 150, 152, ..., 176 (np.arange excludes the stop value).
    bins = np.arange(150, 178, 2)
    ax = sns.histplot(data=df, x="height [cm]", hue="height category", multiple="stack", bins=bins,
                      palette={"short": "orange", "tall": "teal"}, edgecolor=".2", linewidth=.5, alpha=0.7)

    ax.set_xlabel('Height [cm]', fontsize=16)
    ax.set_ylabel('Number of Athletes', fontsize=16)
    ax.set_xticks(np.arange(150, 176, 5))  # ticks at 150, 155, ..., 175

    # Restyle the legend seaborn built from the hue mapping instead of
    # replacing it with plt.legend([...]): a labels-only legend is matched to
    # the plotted artists purely by draw order, so its entries can end up with
    # the wrong colour for their label.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Height Category')
        legend.get_title().set_fontsize(13)
        for entry in legend.get_texts():
            entry.set_fontsize(12)


height_distribution_by_category(df)
plt.savefig(OUTPUT_DIR_PATH / "height_distribution.png")
plt.show()

## Correlation between athlete height and total score

### Correlation matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def create_height_score_correlation_matrix(df: pd.DataFrame) -> None:
    """Draw a 2x2 pair plot of height vs. total score and label the top three athletes.

    seaborn's pairplot creates its own figure, which is left as the current
    figure so the caller can save or show it. The diagonal panels show KDEs
    per height category; the off-diagonal scatter panels are annotated with
    the names of the three athletes with the highest "Total Score".

    Args:
        df: DataFrame with the columns "height [cm]", "Total Score",
            "height category" (string labels) and "name".
    """
    variables = ["height [cm]", "Total Score"]

    # Build the palette from the categories present in the data:
    # 'tall' (case-insensitive) is teal, everything else orange.
    palette = {
        category: 'teal' if category.lower() == 'tall' else 'orange'
        for category in df['height category'].unique()
    }

    # No plt.figure() call here: pairplot always opens a figure of its own, so
    # a preceding plt.figure() would only leave an empty extra figure behind.
    pairplot = sns.pairplot(df, vars=variables, hue="height category", diag_kind="kde",
                            palette=palette, height=3.5, plot_kws={'alpha': 0.5})

    top = df.nlargest(3, 'Total Score')

    # Label offsets are given in points relative to the data point, so they
    # look the same in both panels and the saved figure is reproducible.
    # Labels are staggered vertically by rank to keep close scores apart.
    for rank, (_, athlete) in enumerate(top.iterrows()):
        label_offset = (6, 10 - 12 * rank)
        for row, y_var in enumerate(variables):
            for col, x_var in enumerate(variables):
                if row == col:  # diagonal panels hold the KDEs, not points
                    continue
                # Panel [row][col] plots variables[col] on x and variables[row] on y.
                pairplot.axes[row][col].annotate(
                    athlete['name'],
                    xy=(athlete[x_var], athlete[y_var]),
                    xytext=label_offset,
                    textcoords='offset points',
                    horizontalalignment='left',
                    size=13,
                    color='black',
                    weight='normal',
                )

In [None]:
# Draw the pair plot, write the current figure to the output folder, then display it.
create_height_score_correlation_matrix(df)
correlation_matrix_path = OUTPUT_DIR_PATH / "height_score_correlation_matrix.png"
plt.savefig(correlation_matrix_path)
plt.show()

### Scatter plot

In [None]:
def create_height_score_scatter_plot(df: pd.DataFrame, height_threshold_cm: float = 161) -> None:
    """Scatter total score against height, coloured by a height threshold.

    Athletes strictly taller than ``height_threshold_cm`` are drawn as 'tall'
    (teal), all others as 'short' (orange); a dashed vertical line marks the
    threshold. A new 8x6 inch figure is created and left as the current
    figure. The input DataFrame is not modified.

    Args:
        df: DataFrame with the columns "height [cm]" and "Total Score".
        height_threshold_cm: Height above which an athlete counts as tall.
    """
    sns.set_style("whitegrid")
    plt.figure(figsize=(8, 6))

    # Classify on a local mask instead of writing a helper column into the
    # caller's DataFrame.
    is_tall = df['height [cm]'] > height_threshold_cm
    groups = (
        ('short', ~is_tall, 'orange'),
        ('tall', is_tall, 'teal'),
    )
    for category, mask, color in groups:
        subset = df[mask]
        plt.scatter(subset['height [cm]'], subset['Total Score'], alpha=0.7, color=color,
                    label=category, s=100, edgecolor='k', linewidth=0.5)

    # Dashed marker for the tall/short boundary.
    plt.axvline(x=height_threshold_cm, color='dimgrey', linestyle='--',
                label=f'Threshold (tall: > {height_threshold_cm:g} cm)', linewidth=2)

    # Fixed tick ranges: heights 145-180 cm in 5 cm steps, scores 0-100 in steps of 10.
    plt.xticks(np.arange(145, 181, 5))
    plt.yticks(np.arange(0, 101, 10))

    plt.xlabel('Height [cm]', fontsize=14)
    plt.ylabel('Score', fontsize=14)

    plt.legend(frameon=True, fontsize=12, loc='upper right')
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()

In [None]:
# Draw the scatter plot, write the current figure to the output folder, then display it.
create_height_score_scatter_plot(df)
scatter_plot_path = OUTPUT_DIR_PATH / "height_score_scatter_plot.png"
plt.savefig(scatter_plot_path)
plt.show()

### Violin plot

#### Individual violin plots for height categories

In [None]:
def create_violin_plots(df: pd.DataFrame) -> None:
    """Draw one violin of "Total Score" per height category (short left, tall right).

    A new 10x6 inch figure is created and left as the current figure. The
    input DataFrame is not modified.

    Args:
        df: DataFrame with the columns "height category" (values "short" /
            "tall") and "Total Score".
    """
    sns.set_style("whitegrid")

    category_order = ['short', 'tall']  # short on the left, tall on the right

    # Apply the ordered categorical dtype on a copy so the caller's column
    # keeps its original dtype.
    plot_df = df.assign(**{
        'height category': pd.Categorical(df['height category'], categories=category_order, ordered=True)
    })

    plt.figure(figsize=(10, 6))

    # The palette is attached through `hue` (same variable as x, not dodged):
    # passing `palette` without `hue` is deprecated since seaborn 0.13.
    ax = sns.violinplot(x='height category', y='Total Score', hue='height category', data=plot_df,
                        order=category_order, hue_order=category_order, dodge=False,
                        split=False, inner='quartile',
                        palette={'short': 'orange', 'tall': 'teal'}, cut=0)

    # The hue legend only repeats the x-axis labels; drop it where seaborn adds one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Leave some room around the score range on the y-axis; tick every 10 points.
    plt.ylim(-10, 110)
    plt.yticks(np.arange(0, 101, 10))

    plt.xlabel('Height Category', fontsize=14)
    plt.ylabel('Score', fontsize=14)


create_violin_plots(df)
plt.savefig(OUTPUT_DIR_PATH / "height_score_violin_plot.png")
plt.show()

#### Split violin plot

In [None]:
def create_split_violin_plot(df: pd.DataFrame) -> None:
    """Draw a split violin plot of "Total Score" by height category.

    A new 10x6 inch figure is created and left as the current figure. The
    input DataFrame is not modified.

    Args:
        df: DataFrame with the columns "height category" (values "short" /
            "tall") and "Total Score".
    """
    sns.set_style("whitegrid")

    # Apply the ordered categorical dtype on a copy so the caller's column
    # keeps its original dtype.
    plot_df = df.assign(**{
        'height category': pd.Categorical(df['height category'], categories=['short', 'tall'], ordered=True)
    })

    plt.figure(figsize=(10, 6))

    # NOTE(review): x and hue are the same variable, so each category gets a
    # half violin at its own x position. If a single mirrored violin was
    # intended, x would need to be a constant column - confirm the intent.
    sns.violinplot(data=plot_df, x="height category", y="Total Score", hue="height category",
                   split=True, inner="quart", palette={'short': 'orange', 'tall': 'teal'}, cut=0)

    # Leave some room around the score range on the y-axis; tick every 10 points.
    plt.ylim(-10, 110)
    plt.yticks(np.arange(0, 101, 10))

    plt.xlabel('Height Category', fontsize=14)
    plt.ylabel('Score', fontsize=14)


create_split_violin_plot(df)
plt.show()

## Looking at Tops and Zones per height category

### Number of Tops per boulder per height category

In [None]:
def tops_per_boulder_per_height_category(df):
    """Draw a grouped bar chart of tops per boulder, split by height category.

    For each of the columns 'W1 25' ... 'W4 25' the values are summed
    separately over the rows whose 'height category' is 'short' and 'tall'.
    The chart is drawn on a new 10x6 inch figure that stays current.
    (Presumably the columns hold one flag per athlete - verify against the CSV.)
    """
    top_columns = ['W1 25', 'W2 25', 'W3 25', 'W4 25']

    category_col = df['height category']
    short_rows = df[category_col == 'short']
    tall_rows = df[category_col == 'tall']

    # One row per boulder; the ' 25' suffix is dropped for the axis labels.
    tops_sum_df = pd.DataFrame({
        'Boulder': [column.replace(" 25", "") for column in top_columns],
        'short': [short_rows[column].sum() for column in top_columns],
        'tall': [tall_rows[column].sum() for column in top_columns],
    })

    # Column order is short, tall - the colour list follows that order.
    ax = tops_sum_df.plot(x='Boulder', kind='bar', figsize=(10, 6), width=0.8, rot=0,
                          color=['orange', 'teal'])

    ax.set_xlabel('Boulder')
    ax.set_ylabel('Number of Tops')

    # Fixed y-range 0-6 with ticks at 0, 1, ..., 5.
    ax.set_ylim(0, 6)
    ax.set_yticks(np.arange(0, 6, 1))

    # Horizontal grid lines only.
    ax.yaxis.grid(True)
    ax.xaxis.grid(False)

    ax.legend(title='Athlete Height Category')
    plt.tight_layout()


tops_per_boulder_per_height_category(df)
tops_figure_path = OUTPUT_DIR_PATH / "tops_per_boulder_by_heightcategory.png"
plt.savefig(tops_figure_path)
plt.show()

### One bar plot per boulder: How many reached Zone 1, Zone 2 and the Top per height category

In [None]:
def zones_and_tops_per_height_category_2x2(df):
    """Draw a 2x2 grid of bar charts: Zone 1 / Zone 2 / Top sums per boulder.

    Each panel covers one boulder (W1-W4) and shows, side by side for the
    'short' and 'tall' rows of 'height category', the column sums of that
    boulder's '5', '10' and '25' columns. The 20x12 inch figure stays current.
    (Presumably the columns hold one flag per athlete - verify against the CSV.)
    """
    category_col = df['height category']
    short_rows = df[category_col == 'short']
    tall_rows = df[category_col == 'tall']

    # Columns per boulder, in the order Zone 1, Zone 2, Top.
    boulders_zones = {
        'W1': ['W1 5', 'W1 10', 'W1 25'],
        'W2': ['W2 5', 'W2 10', 'W2 25'],
        'W3': ['W3 5', 'W3 10', 'W3 25'],
        'W4': ['W4 5', 'W4 10', 'W4 25'],
    }
    stage_labels = ['Zone 1', 'Zone 2', 'Top']

    fig, axs = plt.subplots(2, 2, figsize=(20, 12))
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    bar_width = 0.35
    positions = np.arange(len(stage_labels))  # one group of bars per stage

    # Walk the panels row by row, pairing each with one boulder.
    for panel, (boulder, columns) in zip(axs.flatten(), boulders_zones.items()):
        short_counts = [short_rows[column].sum() for column in columns]
        tall_counts = [tall_rows[column].sum() for column in columns]

        # Short bars sit left of the group centre, tall bars right of it.
        panel.bar(positions - bar_width / 2, short_counts, bar_width, label='short', color='orange')
        panel.bar(positions + bar_width / 2, tall_counts, bar_width, label='tall', color='teal')

        panel.set_title(f'Zone Achievement by Height Category for {boulder}')
        panel.set_ylabel('Number of Athletes')
        panel.set_ylim(0, 14)
        panel.set_xticks(positions)
        panel.set_xticklabels(stage_labels)
        panel.legend(title='Height Category')

        # Horizontal grid lines only.
        panel.yaxis.grid(True)
        panel.xaxis.grid(False)

    plt.tight_layout()


zones_and_tops_per_height_category_2x2(df)
zones_figure_path = OUTPUT_DIR_PATH / "zones_and_tops_per_height_category.png"
plt.savefig(zones_figure_path)
plt.show()