In [2]:
import plotly.graph_objects as go
import plotly.io as pio

# Define the updated base template: cream background, Open Sans fonts,
# a custom colorway and a thick default line for scatter traces.
base = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor='#FFF5CC',
        plot_bgcolor='#FFF5CC',
        height=800,
        width=800 * 1.618,  # 1.618:1 width-to-height ratio
        xaxis=dict(
            anchor='y',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            # The axis-title font lives under `title.font`; the flat
            # `titlefont` attribute is deprecated and is rejected by newer
            # Plotly releases.
            title=dict(
                font=dict(
                    size=26,
                    family='Open Sans, sans-serif'  # Use Open Sans font
                )
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the x-axis line
        ),
        yaxis=dict(
            anchor='x',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'
            ),
            # Same `title.font` form as the x-axis (replaces `titlefont`).
            title=dict(
                font=dict(
                    size=26,
                    family='Open Sans, sans-serif'
                )
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the y-axis line
        ),
        font=dict(
            color='#333333',
            size=28,
            family='Open Sans, sans-serif'
        ),
        # Updated colorway to ensure more distinguishable colors
        colorway=["#470945", # H: Violet
                  "#E67E5A", # H: Orange (Sienna)
                  "#297FB9", # H: Blue (Steel)
                  "#163748", # D: Charcoal
                  "#4F1787", # H: Purple
                  "#EFE04E", # H: Yellow (Maize)
                  "#214F70", # D: Indigo
                  "#DF14AA", # H: Pink (Cerise)
                  "#100B1A", # D: Black
                  "#12C4CF", # H: Teal
                  "#14193D", # D: Space
                  "#CC5500"],# H: Burnt orange (was labelled "Cream", which does not match this hex)
        title=go.layout.Title(
            text='',
            font=dict(
                size=34,
                color='#333333',
                family='Open Sans, sans-serif'
            ),
            x=0.05,
        )
    ),
    data=dict(
        scatter=[
            go.Scatter(
                line=dict(width=6)  # Set the line width for scatter plots
            )
        ]
    )
)

# Register the template under the name 'base' and make it the default.
pio.templates['base'] = base
pio.templates.default = 'base'

In [3]:
import plotly.graph_objects as go
import plotly.io as pio

# Transparent-background variant of the base template: same fonts, axes and
# scatter line width, but fully transparent paper and plot areas.
base_trans = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
        plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot area
        height=800,
        width=800 * 1.618,  # 1.618:1 width-to-height ratio
        xaxis=dict(
            anchor='y',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            # The axis-title font lives under `title.font`; the flat
            # `titlefont` attribute is deprecated and is rejected by newer
            # Plotly releases.
            title=dict(
                font=dict(
                    size=26,
                    family='Open Sans, sans-serif'  # Use Open Sans font
                )
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the x-axis line
        ),
        yaxis=dict(
            anchor='x',
            showgrid=True,
            gridcolor='#888888',  # Darker grid lines
            tickfont=dict(
                size=24,
                family='Open Sans, sans-serif'
            ),
            # Same `title.font` form as the x-axis (replaces `titlefont`).
            title=dict(
                font=dict(
                    size=26,
                    family='Open Sans, sans-serif'
                )
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the y-axis line
        ),
        font=dict(
            color='#333333',
            size=28,
            family='Open Sans, sans-serif'
        ),
        # Updated colorway to ensure more distinguishable colors.
        # NOTE(review): this list omits "#DF14AA" (Pink), which the 'base'
        # template's colorway includes — confirm that is intentional.
        colorway=["#470945", # H: Violet
                  "#E67E5A", # H: Orange (Sienna)
                  "#297FB9", # H: Blue (Steel)
                  "#163748", # D: Charcoal
                  "#4F1787", # H: Purple
                  "#EFE04E", # H: Yellow (Maize)
                  "#214F70", # D: Indigo
                  "#100B1A", # D: Black
                  "#12C4CF", # H: Teal
                  "#14193D", # D: Space
                  "#CC5500"],# H: Burnt orange (was labelled "Cream", which does not match this hex)
        title=go.layout.Title(
            text='',
            font=dict(
                size=34,
                color='#333333',
                family='Open Sans, sans-serif'
            ),
            x=0.05,
        )
    ),
    data=dict(
        scatter=[
            go.Scatter(
                line=dict(width=6)  # Set the line width for scatter plots
            )
        ]
    )
)

# Register the template under the name 'base_trans' and make it the default
# (this replaces whatever default was set before this cell ran).
pio.templates['base_trans'] = base_trans
pio.templates.default = 'base_trans'

In [4]:
import pandas as pd

# Load the CSV files into dataframes (ordered alphabetically).
# The files are read in the order listed; each result is bound to the
# variable at the same position in the unpacking target below.
_csv_paths = (
    'd_archery.csv',
    'd_athletics100M.csv',
    'd_athletics800M.csv',
    'd_baseball.csv',
    'd_basketball.csv',
    'd_chess.csv',
    'd_discus.csv',
    'd_diving10M.csv',
    'd_dressage.csv',
    'd_football.csv',
    'd_formulaone.csv',
    'd_golf.csv',
    'd_gridiron.csv',
    'd_gymnastics.csv',
    'd_highjump.csv',
    'd_hockey.csv',
    'd_javelin.csv',
    'd_longjump.csv',
    'd_marathon.csv',
    'd_mma.csv',
    'd_swimming100m.csv',
    'd_tennis.csv',
)
(
    d_archery, d_athletics100M, d_athletics800M, d_baseball,
    d_basketball, d_chess, d_discus, d_diving10M,
    d_dressage, d_football, d_formulaone, d_golf,
    d_gridiron, d_gymnastics, d_highjump, d_hockey,
    d_javelin, d_longjump, d_marathon, d_mma,
    d_swimming100m, d_tennis,
) = map(pd.read_csv, _csv_paths)


In [5]:
import pandas as pd

# Year parser that tolerates missing and non-numeric values
def extract_year(date_value):
    """Return the year contained in ``date_value`` as an int, or None.

    Parameters:
    - date_value: an integer-like value, a string holding an integer, or a
      hyphen-separated string such as '2014-05' (the text before the first
      hyphen is used as the year).

    Returns:
    The year as an ``int``, or ``None`` when the value cannot be converted
    (non-numeric text, NaN, infinity, ``None``, ``pd.NA`` and similar).
    """
    try:
        # If the value contains a hyphen (e.g., '2014-05'), keep the part before it
        if isinstance(date_value, str) and '-' in date_value:
            return int(date_value.split('-')[0])
        # Otherwise convert directly (ints, floats, numeric strings)
        return int(date_value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None / pd.NA and
        # other non-convertible objects; OverflowError: +/- infinity.
        return None

# Normalise the year columns of one dataframe and derive the age at peak
def process_dataframe(df):
    """Clean 'peak_date' and 'birth_year' in place and add 'age_at_peak'.

    - 'peak_date' is passed through ``extract_year`` and then coerced to a
      numeric dtype, so unparseable entries become NaN instead of leaving an
      object column of ``None`` values that cannot be subtracted.
    - 'birth_year' is coerced to numeric (invalid entries become NaN).
    - 'age_at_peak' is ``peak_date - birth_year`` stored as nullable 'Int64',
      so rows with a missing year get ``<NA>``.

    The dataframe is modified in place; nothing is returned.
    """
    df['peak_date'] = pd.to_numeric(df['peak_date'].apply(extract_year), errors='coerce')
    df['birth_year'] = pd.to_numeric(df['birth_year'], errors='coerce')
    df['age_at_peak'] = (df['peak_date'] - df['birth_year']).astype('Int64')

# Every discipline frame, in alphabetical order; each one is cleaned in place.
datasets = [
    d_archery, d_athletics100M, d_athletics800M, d_baseball,
    d_basketball, d_chess, d_discus, d_diving10M,
    d_dressage, d_football, d_formulaone, d_golf,
    d_gridiron, d_gymnastics, d_highjump, d_hockey,
    d_javelin, d_longjump, d_marathon, d_mma,
    d_swimming100m, d_tennis,
]

# process_dataframe mutates its argument, so the d_* variables are updated too.
for dataset in datasets:
    process_dataframe(dataset)

print("Processing complete for all datasets.")

Processing complete for all datasets.


In [6]:
# Map each discipline label to its dataframe (alphabetical order); the label
# is written into a new 'discipline' column of that frame.
_frames_by_discipline = {
    'archery': d_archery,
    'athletics100M': d_athletics100M,
    'athletics800M': d_athletics800M,
    'baseball': d_baseball,
    'basketball': d_basketball,
    'chess': d_chess,
    'discus': d_discus,
    'diving10M': d_diving10M,
    'dressage': d_dressage,
    'football': d_football,
    'formulaone': d_formulaone,
    'golf': d_golf,
    'gridiron': d_gridiron,
    'gymnastics': d_gymnastics,
    'highjump': d_highjump,
    'hockey': d_hockey,
    'javelin': d_javelin,
    'longjump': d_longjump,
    'marathon': d_marathon,
    'mma': d_mma,
    'swimming100m': d_swimming100m,
    'tennis': d_tennis,
}
for _discipline_label, _discipline_frame in _frames_by_discipline.items():
    _discipline_frame['discipline'] = _discipline_label

# Stack all frames (same order as above) into one dataframe with a fresh index
d_athletes = pd.concat(list(_frames_by_discipline.values()), ignore_index=True)

print("Concatenation complete with discipline and plot column added.")

Concatenation complete with discipline and plot column added.


In [7]:
# import time
# import wikipediaapi
# import re

# # Initialize Wikipedia API with user-agent
# wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

# # Function to fetch birth year from Wikipedia
# def fetch_birth_year(player_name):
#     """
#     Fetch the birth year of a player from their Wikipedia page.
#     """
#     try:
#         # Get the Wikipedia page
#         page = wiki_wiki.page(player_name)
#         if page.exists():
#             # Extract the page summary
#             summary = page.summary

#             # Use regex to find the birth year in the summary
#             birth_year_match = re.search(r'\b\d{4}\b', summary)

#             if birth_year_match:
#                 return int(birth_year_match.group(0))  # Return the first match found (likely the birth year)
#         return None
#     except Exception as e:
#         print(f"Error fetching data for {player_name}: {e}")
#         return None

# # Take a sample of the dataframe (replace with full dataframe when ready)
# sample_df = d_tennis  # You can add .sample() to reduce the sample size for testing

# # Add a new column for the actual birth year from Wikipedia
# sample_df['actual_birth_year'] = None

# # Process the data row by row with a delay and print the mismatches
# for index, row in sample_df.iterrows():
#     actual_birth_year = fetch_birth_year(row['player'])
#     sample_df.at[index, 'actual_birth_year'] = actual_birth_year
    
#     # Convert birth_year to int if it's not already
#     try:
#         birth_year = int(row['birth_year'])
#     except ValueError:
#         birth_year = None

#     # Check if the birth years match
#     birth_year_match = birth_year == actual_birth_year
    
#     # Only print mismatched results
#     if not birth_year_match:
#         print({
#             'player': row['player'],
#             'birth_year': birth_year,
#             'actual_birth_year': actual_birth_year,
#         })
    
#     # Delay to avoid API limits (e.g., 1 second delay)
#     time.sleep(1)

# # Optionally save the mismatches to a CSV file if needed
# # mismatches_df = sample_df[sample_df['birth_year'] != sample_df['actual_birth_year']]
# # mismatches_df.to_csv('birth_year_mismatches.csv', index=False)


In [8]:
import plotly.express as px

def plot_trend_chart(df, width=800, height=600, gender='both', window=None, group_by='discipline', y_min=0, y_max=80, line_thickness=2, show_minimal=True):
    """
    Draw one line per `group_by` value: mean 'age_at_peak' for each 'peak_date'.

    All lines are drawn in the single colour '#fff5cc'. Prints a message and
    returns without plotting if `group_by` is not a column of `df`.

    NOTE: a later cell of this notebook defines another `plot_trend_chart`
    with a shorter signature; whichever definition ran last is the one in use.

    Parameters:
    - df: dataframe with 'peak_date', 'age_at_peak', 'gender' and the `group_by` column
    - width / height: figure size in pixels (defaults 800 x 600)
    - gender: 'M' or 'F' keeps only those rows; any other value keeps every row
    - window: rolling-mean window applied within each group (None or 0 disables smoothing)
    - group_by: column used to split the data into lines (default 'discipline')
    - y_min / y_max: y-axis range; only applied when `show_minimal` is False
    - line_thickness: line width (default 2)
    - show_minimal: True draws only the lines on a transparent background with no
      grid, tick labels or legend; False uses the 'base' template with axis titles
    """
    # Bail out early when the grouping column is absent
    if group_by not in df.columns:
        print(f"Column '{group_by}' not found in dataframe.")
        return

    # Optional gender filter ('both' or anything else keeps all rows)
    subset = df[df['gender'] == gender] if gender in ['M', 'F'] else df

    # Mean age per (year, group), ordered by group then year so the legend is alphabetical
    yearly_mean = (
        subset.groupby(['peak_date', group_by])['age_at_peak']
        .mean()
        .reset_index()
        .sort_values(by=[group_by, 'peak_date'])
    )

    # Smooth each group's series independently
    if window:
        per_group_ages = yearly_mean.groupby(group_by)['age_at_peak']
        yearly_mean['age_at_peak'] = per_group_ages.transform(
            lambda ages: ages.rolling(window=window, min_periods=1).mean()
        )

    fig = px.line(yearly_mean, x='peak_date', y='age_at_peak', color=group_by, line_group=group_by)

    # One colour for every line, no markers
    fig.update_traces(line=dict(color='#fff5cc', width=line_thickness), marker=dict(size=0))

    if show_minimal:
        # Lines only: transparent background, no grid, tick labels or legend
        hidden_axis = dict(showgrid=False, showticklabels=False, zeroline=False)
        layout_options = dict(
            template='plotly_white',
            xaxis=dict(hidden_axis),
            yaxis=dict(hidden_axis),
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            showlegend=False,
            width=width,
            height=height,
        )
    else:
        # Full chart: titled axes, fixed y-range, 'base' template
        layout_options = dict(
            xaxis_title='Year',
            yaxis_title='Average Age',
            template='base',
            width=width,
            height=height,
            yaxis=dict(range=[y_min, y_max]),
            legend=dict(traceorder='normal'),
        )
    fig.update_layout(**layout_options)

    fig.show()

# # Example usage:
# plot_trend_chart(d_chess[d_chess['peak_date'] >= 1995], width=1200, height=800, gender='M', window=3, group_by='discipline', y_min=0, y_max=60, line_thickness=8, show_minimal=True)

# Settings shared by every minimal (line-only) chart drawn in this cell
_minimal_line_style = dict(width=1200, height=800, window=3, group_by='gender', y_min=0, y_max=60, line_thickness=8, show_minimal=True)

# Golf from 1985 onwards: men first, then women
_golf_since_1985 = d_golf[d_golf['peak_date'] >= 1985]
for _golf_gender in ('M', 'F'):
    plot_trend_chart(_golf_since_1985, gender=_golf_gender, **_minimal_line_style)

excluded_disciplines = ['basketball', 'mma', 'dressage']

# All athletes outside the excluded disciplines: women only, then everyone
_athletes_kept = d_athletes[~d_athletes['discipline'].isin(excluded_disciplines)]
for _athlete_gender in ('F', 'both'):
    plot_trend_chart(_athletes_kept, gender=_athlete_gender, **_minimal_line_style)


In [9]:
def plot_histogram(df, group_column, x_start=None, x_end=None, height=600, width=800, gender='both'):
    """
    Show an overlaid, percent-normalised histogram of 'age_at_peak' with one
    trace per value of `group_column`, using one-year-wide bins.

    Prints a message and returns without plotting if `group_column` is not a
    column of `df`.

    NOTE: a later cell of this notebook defines another `plot_histogram`;
    whichever definition ran last is the one in use.

    Parameters:
    - df: dataframe with 'age_at_peak', 'gender' and the `group_column` column
    - group_column: column whose values become separate coloured traces
    - x_start / x_end: x-axis range; only applied when both are given
    - height / width: figure size in pixels (defaults 600 x 800)
    - gender: 'M' or 'F' keeps only those rows; any other value keeps every row
    """
    # Check if the provided group_column exists in the dataframe
    if group_column not in df.columns:
        print(f"Column '{group_column}' not found in dataframe.")
        return

    # Filter the dataframe based on the gender selection
    if gender in ['M', 'F']:
        filtered_df = df[df['gender'] == gender]
    else:
        filtered_df = df  # Keep all if 'both' is selected

    has_x_range = x_start is not None and x_end is not None

    # Build the histogram; the bin width is forced to 1 in update_traces below
    fig = px.histogram(
        filtered_df,
        x='age_at_peak',
        color=group_column,
        nbins=x_end - x_start + 1 if has_x_range else None,
        title=f'Age Distribution by {group_column.capitalize()}',
        labels={'age_at_peak': 'Age', group_column: group_column.capitalize()},
        barmode='overlay',
        histnorm='percent'
    )

    # One-year bins, fully opaque bars with a black outline
    fig.update_traces(
        xbins=dict(size=1),
        marker_line_width=1.5,  # Set the outline width
        marker_line_color='black',  # Set the outline color to black
        opacity=1
    )

    # Apply the 'base' template, remove gaps between bars, hide grid lines and tick labels
    fig.update_layout(
        template='base',
        height=height,
        width=width,
        bargap=0,
        bargroupgap=0,
        xaxis=dict(showgrid=False, showticklabels=False),  # Hide x-axis gridlines and labels
        yaxis=dict(showgrid=False, showticklabels=False)   # Hide y-axis gridlines and labels
    )

    # Set x-axis range if specified
    if has_x_range:
        fig.update_xaxes(range=[x_start, x_end], dtick=1)

    # Display the plot
    fig.show()

# Add a 'plot' column holding the same constant value ('plot') on every row,
# so grouping by it yields a single histogram trace covering all athletes.
d_athletes['plot'] = 'plot'

# One histogram per grouping column: everyone together, by discipline, by gender
for _histogram_group in ('plot', 'discipline', 'gender'):
    plot_histogram(d_athletes, _histogram_group, x_start=10, x_end=60, height=650, width=1200, gender='both')


In [10]:
import plotly.express as px

def plot_histogram(df, group_column, x_start=None, x_end=None, y_min=None, y_max=None, color='#470945', height=600, width=800, gender='both', output_image='histogram.png'):
    """
    Show a stacked, percent-normalised histogram of 'age_at_peak' in a single
    colour on the transparent 'base_trans' template, optionally saving it.

    Prints a message and returns without plotting if `group_column` is not a
    column of `df`.

    NOTE: this redefines the `plot_histogram` from an earlier cell of this
    notebook; whichever definition ran last is the one in use.

    Parameters:
    - df: dataframe with 'age_at_peak', 'gender' and the `group_column` column
    - group_column: column whose values become separate (stacked) traces
    - x_start / x_end: x-axis range; only applied when both are given
    - y_min / y_max: y-axis range passed to the layout as [y_min, y_max]
    - color: fill colour used for every bar (default '#470945')
    - height / width: figure size in pixels (defaults 600 x 800)
    - gender: 'M' or 'F' keeps only those rows; any other value keeps every row
    - output_image: path the figure is written to via `fig.write_image`;
      pass None (or an empty string) to skip the export and only display
    """
    # Check if the provided group_column exists in the dataframe
    if group_column not in df.columns:
        print(f"Column '{group_column}' not found in dataframe.")
        return

    # Filter the dataframe based on the gender selection
    if gender in ['M', 'F']:
        filtered_df = df[df['gender'] == gender]
    else:
        filtered_df = df  # Keep all if 'both' is selected

    has_x_range = x_start is not None and x_end is not None

    # Plot histogram with each category as a separate trace
    fig = px.histogram(
        filtered_df,
        x='age_at_peak',
        color=group_column,
        nbins=x_end - x_start + 1 if has_x_range else None,
        labels={'age_at_peak': ''},  # Hide axis titles
        barmode='stack',
        histnorm='percent'
    )

    # One-year bins; every bar in the requested colour with a black outline
    fig.update_traces(
        xbins=dict(size=1),
        marker=dict(color=color, line=dict(width=1.5, color='black')),
        opacity=1
    )

    # Transparent template, no legend, no gaps, hidden grid/ticks/titles
    fig.update_layout(
        template='base_trans',
        showlegend=False,  # Hide legend
        height=height,
        width=width,
        bargap=0,
        bargroupgap=0,
        xaxis=dict(showgrid=False, showticklabels=False, title=''),  # Hide x-axis title and ticks
        yaxis=dict(showgrid=False, showticklabels=False, range=[y_min, y_max], title='')  # Hide y-axis title and ticks
    )

    # Set x-axis range if specified
    if has_x_range:
        fig.update_xaxes(range=[x_start, x_end], dtick=1)

    # Save the plot only when a target path was supplied, so the function can
    # also be used for display-only runs without writing a file.
    if output_image:
        fig.write_image(output_image)

    # Display the plot
    fig.show()

# plot_histogram(d_athletes, 'plot', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='1.png')
# plot_histogram(d_swimming100m, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='2.png')
# plot_histogram(d_highjump, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='3.png')
# plot_histogram(d_athletics100M, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='4.png')
# plot_histogram(d_basketball, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='5.png')
# plot_histogram(d_tennis, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='6.png')
# plot_histogram(d_football, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='7.png')
# plot_histogram(d_chess, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='8.png')
# plot_histogram(d_gridiron, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='9.png')
# plot_histogram(d_golf, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='10.png')
# plot_histogram(d_mma, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='11.png')
# plot_histogram(d_dressage, 'discipline', x_start=10, x_end=60, y_min=0, y_max=15, color='#fff5cc', height=650, width=1200, gender='both', output_image='12.png')

In [11]:
# Five-number summary (min, Q1, median, Q3, max) of 'age_at_peak' per discipline
_age_by_discipline = d_athletes.groupby(['discipline'])['age_at_peak']

# describe() yields count/mean/std plus the requested percentiles; keep only the five we need
_age_describe = _age_by_discipline.describe(percentiles=[0.25, 0.5, 0.75])
_five_number = _age_describe.loc[:, ['min', '25%', '50%', '75%', 'max']]

age_stats = (
    _five_number
    .rename(columns={'25%': 'Q1', '50%': 'Median', '75%': 'Q3'})  # Friendlier column names
    .reset_index()  # Turn the 'discipline' index back into a column
    .sort_values(by='Median', ascending=False)  # Highest median first
)

from IPython.display import HTML

# Wrap a DataFrame's HTML table in a size-limited, scrollable container
def display_scrollable_table(df, max_height=400, max_width=800):
    """Return an ``HTML`` object holding `df` (without its index) inside a scrollable div.

    Parameters:
    - df: dataframe to render with ``DataFrame.to_html(index=False)``
    - max_height / max_width: pixel limits of the container (defaults 400 x 800)

    NOTE: a later cell of this notebook defines another
    `display_scrollable_table` taking only `df`; whichever ran last is in use.
    """
    table_markup = df.to_html(index=False)
    container_style = (
        f"max-height: {max_height}px; max-width: {max_width}px; "
        "overflow: auto; border: 1px solid #ddd; padding: 10px;"
    )
    wrapped = f"""
    <div style="{container_style}">
        {table_markup}
    </div>
    """
    return HTML(wrapped)

# Render age_stats through display_scrollable_table. The function returns an
# HTML object; as the last expression of the cell, that object is what the
# notebook shows as the cell's output.
display_scrollable_table(age_stats)


discipline,min,Q1,Median,Q3,max
dressage,21.0,30.0,37.0,45.0,60.0
mma,22.0,30.0,32.0,35.0,44.0
golf,17.0,27.0,31.0,35.0,51.0
formulaone,23.0,28.0,31.0,34.0,46.0
baseball,22.0,26.0,29.0,31.0,40.0
marathon,19.0,26.0,29.0,31.5,40.0
chess,17.0,25.0,28.0,33.0,45.0
gridiron,21.0,26.0,28.0,33.0,40.0
basketball,20.0,25.0,27.0,30.0,39.0
discus,19.0,24.0,27.0,30.0,40.0


In [12]:
# Disciplines ordered by ascending median age; this sets the left-to-right plot order
discipline_order = age_stats.set_index(['discipline']).sort_values('Median').index.get_level_values('discipline').unique()

# Long-format copy of the five summary statistics (one row per discipline/statistic).
# Kept so the name stays available; it is no longer used to draw the boxes.
age_stats_melted = age_stats.melt(id_vars=['discipline'],
                                  value_vars=['min', 'Q1', 'Median', 'Q3', 'max'],
                                  var_name='Statistic',
                                  value_name='Age')

# Summary statistics in plot order, as plain floats for Plotly
_ordered_stats = age_stats.set_index('discipline').loc[discipline_order].astype(float)

# Draw the boxes from the precomputed statistics. Feeding the five summary
# numbers to px.box as if they were raw samples makes Plotly recompute
# quartiles from those five points, so the box edges end up interpolated
# between min/Q1 and Q3/max instead of sitting on the real Q1 and Q3.
fig = go.Figure(
    go.Box(
        x=list(discipline_order),
        q1=_ordered_stats['Q1'].tolist(),
        median=_ordered_stats['Median'].tolist(),
        q3=_ordered_stats['Q3'].tolist(),
        lowerfence=_ordered_stats['min'].tolist(),   # whiskers reach the observed minimum
        upperfence=_ordered_stats['max'].tolist(),   # ... and the observed maximum
    )
)

# Title, axis labels, size and category order
fig.update_layout(
    title='Age at Peak Performance Distribution by Discipline',
    xaxis_title="Discipline",
    yaxis_title="Age",
    template="base",
    height=600,
    width=1200,
    xaxis=dict(categoryorder='array', categoryarray=list(discipline_order)),
)
fig.show()


In [13]:
import plotly.express as px

def plot_trend_chart(df, width=800, height=600, gender='both', window=None, group_by='discipline', y_min=0, y_max=80):
    """
    Plot mean 'age_at_peak' per 'peak_date' as one marked line per `group_by` value.

    Uses the 'base' template, a fixed y-axis range and a descriptive title.
    Prints a message and returns without plotting if `group_by` is not a
    column of `df`.

    NOTE: this redefines the `plot_trend_chart` from an earlier cell of this
    notebook and does not accept that version's `line_thickness` or
    `show_minimal` arguments.

    Parameters:
    - df: dataframe with 'peak_date', 'age_at_peak', 'gender' and the `group_by` column
    - width / height: figure size in pixels (defaults 800 x 600)
    - gender: 'M' or 'F' keeps only those rows; any other value keeps every row
    - window: rolling-mean window applied within each group (None or 0 disables smoothing)
    - group_by: column used to split the data into lines (default 'discipline')
    - y_min / y_max: y-axis range (defaults 0 and 80)
    """
    # Nothing to plot without the grouping column
    if group_by not in df.columns:
        print(f"Column '{group_by}' not found in dataframe.")
        return

    # Optional gender filter ('both' or anything else keeps all rows)
    subset = df[df['gender'] == gender] if gender in ['M', 'F'] else df

    # Mean age per (year, group), ordered by group then year so the legend is alphabetical
    yearly_mean = (
        subset.groupby(['peak_date', group_by])['age_at_peak']
        .mean()
        .reset_index()
        .sort_values(by=[group_by, 'peak_date'])
    )

    # Smooth each group's series independently
    if window:
        per_group_ages = yearly_mean.groupby(group_by)['age_at_peak']
        yearly_mean['age_at_peak'] = per_group_ages.transform(
            lambda ages: ages.rolling(window=window, min_periods=1).mean()
        )

    chart_title = f'Average Age at Peak Performance by {group_by.capitalize()} Over Time (Gender: {gender})'
    fig = px.line(
        yearly_mean,
        x='peak_date',
        y='age_at_peak',
        color=group_by,
        title=chart_title,
        labels={'peak_date': 'Year', 'age_at_peak': 'Average Age'},
        markers=True,
        line_group=group_by,
    )

    # Axis titles, template, size, y-range and legend order
    layout_options = dict(
        xaxis_title='Year',
        yaxis_title='Average Age',
        template='base',
        width=width,
        height=height,
        yaxis=dict(range=[y_min, y_max]),
        legend=dict(traceorder='normal'),
    )
    fig.update_layout(**layout_options)

    fig.show()

# d_olympics = d_athletes[d_athletes['discipline'].isin(['archery', 'athletics100M', 'athletics800M', 'diving10M', 'dressage', 'gymnastics', 'highjump', 'javelin', 'longjump', 'marathon', 'swimming100m'])]
# plot_trend_chart(d_olympics, width=1000, height=700, gender='', window=15, group_by='gender', y_min=20, y_max=35)

# Figure size shared by the three charts below
_trend_size = dict(width=1000, height=700)

# Gender comparison from 1950 onwards, heavily smoothed (15-point rolling window)
_athletes_since_1950 = d_athletes[d_athletes['peak_date'] >= 1950]
plot_trend_chart(_athletes_since_1950, gender='both', window=15, group_by='gender', y_min=20, y_max=35, **_trend_size)

# Per-discipline trends: women with window=1, then everyone with window=3
for _trend_gender, _trend_window in (('F', 1), ('both', 3)):
    plot_trend_chart(d_athletes, gender=_trend_gender, window=_trend_window, group_by='discipline', y_min=10, y_max=60, **_trend_size)

In [14]:
# Tennis trend with a 5-point rolling window: women first, then men
for _tennis_gender in ('F', 'M'):
    plot_trend_chart(d_tennis, width=1000, height=700, gender=_tennis_gender, window=5, group_by='discipline', y_min=10, y_max=60)


In [19]:
import pandas as pd
import numpy as np
from scipy.stats import linregress

def calculate_slope_and_avg(df, gender='both'):
    """
    Calculate, for each discipline, the linear-regression slope of
    'age_at_peak' against 'peak_date' together with the mean age.

    Parameters:
    - df: The dataframe containing the data (e.g., d_athletes); needs the
      columns 'discipline', 'gender', 'peak_date' and 'age_at_peak'
    - gender: 'M' or 'F' keeps only those rows; any other value (default
      'both') keeps every row

    Returns:
    A dataframe with columns: discipline, gender, avg_slope, avg_age, sorted
    by avg_slope in decreasing order. 'gender' holds the `gender` argument as
    passed. Disciplines with fewer than two distinct years among rows that
    have both a year and an age are left out. If no discipline qualifies, an
    empty dataframe with the same four columns is returned.
    """
    columns = ['discipline', 'gender', 'avg_slope', 'avg_age']

    # Filter the dataframe based on the gender selection
    if gender in ['M', 'F']:
        filtered_df = df[df['gender'] == gender]
    else:
        filtered_df = df  # Keep all if 'both' is selected

    results = []
    for discipline, group in filtered_df.groupby('discipline'):
        # Regress only on rows where both values are present; missing values
        # (NaN / <NA>) would otherwise break or poison the fit.
        points = group[['peak_date', 'age_at_peak']].dropna().astype(float)

        # A slope needs at least two distinct x values
        if points['peak_date'].nunique() < 2:
            continue

        slope = linregress(points['peak_date'], points['age_at_peak']).slope

        results.append({
            'discipline': discipline,
            'gender': gender,
            'avg_slope': slope,
            # Mean over every available age in the group (missing values skipped)
            'avg_age': group['age_at_peak'].mean(),
        })

    # Passing `columns` keeps the schema stable even when `results` is empty,
    # so the sort below cannot fail on a missing 'avg_slope' column.
    result_df = pd.DataFrame(results, columns=columns)

    # Order by avg_slope in decreasing order
    return result_df.sort_values(by='avg_slope', ascending=False)

# Example usage: per-discipline slope and mean age, computed separately for
# men and for women.
male_slope_avg = calculate_slope_and_avg(d_athletes, gender='M')
female_slope_avg = calculate_slope_and_avg(d_athletes, gender='F')

# Concatenate results for both genders
# NOTE(review): `combined_results` is built here but not used again in this cell.
combined_results = pd.concat([male_slope_avg, female_slope_avg], ignore_index=True)
# NOTE(review): `comb_slope` is computed with gender='F', i.e. the same call
# as `female_slope_avg` above — confirm whether 'both' (or `combined_results`)
# was intended.
comb_slope = calculate_slope_and_avg(d_athletes, gender='F')

# Display the results (the female-only table, given the call above)
print(comb_slope)


       discipline gender  avg_slope    avg_age
15            mma      F   0.567115  32.323529
1   athletics100M      F   0.066918  24.521739
11       highjump      F   0.065009  25.197183
13       longjump      F   0.061341  25.816667
5          discus      F   0.058821  27.855072
12        javelin      F   0.054075  25.893939
16   swimming100m      F   0.049820  20.734177
3      basketball      F   0.038429  28.113475
14       marathon      F   0.029545  29.363636
7        dressage      F   0.019724  33.962025
6       diving10M      F  -0.008095  20.717949
17         tennis      F  -0.008565  25.170124
2   athletics800M      F  -0.010303  25.055556
10     gymnastics      F  -0.090752  20.586207
0         archery      F  -0.100916  24.404762
4           chess      F  -0.122246  27.964912
8        football      F  -0.197674  27.466667
9            golf      F  -0.225016  29.307692


In [16]:
import pandas as pd
from IPython.display import display, HTML

# Assuming d_athletes is your DataFrame
def display_scrollable_table(df):
    """Display `df`, sorted by ascending 'age_at_peak', in a 500px-tall scrollable block.

    The table is rendered without its index and with the CSS classes
    'table table-striped'. The result is shown via ``display``; nothing is
    returned.

    NOTE: this redefines the `display_scrollable_table` from an earlier cell
    of this notebook, which returned the HTML object and accepted size limits.
    """
    # Youngest athletes first
    ordered = df.sort_values(by='age_at_peak', ascending=True)

    # Plain HTML table for the sorted rows
    table_markup = ordered.to_html(classes='table table-striped', index=False)

    # Fixed-height container that scrolls when the table overflows
    scroll_box = HTML(f'''
    <div style="height:500px; overflow:auto; width:100%;">
        {table_markup}
    </div>
    ''')
    display(scroll_box)
    
# Example usage: number of distinct players, then the full table
display(d_athletes['player'].nunique())
display_scrollable_table(d_athletes)

# Formula One rows whose age at peak lies between 26 and 29 inclusive
_age_from_26 = d_athletes['age_at_peak'] >= 26
_age_up_to_29 = d_athletes['age_at_peak'] <= 29
_is_formulaone = d_athletes['discipline'].isin(['formulaone'])
display_scrollable_table(d_athletes[_age_from_26 & _age_up_to_29 & _is_formulaone])

3142

player,detail,peak_date,birth_year,gender,age_at_peak,discipline,plot
Quan Hongchan,Gold,2020,2007,F,13,diving10M,plot
Sirvard Emirzyan,Silver,1980,1966,F,14,diving10M,plot
Willy den Ouden,Silver,1932,1918,F,14,swimming100m,plot
Fu Mingxia,Gold,1992,1978,F,14,diving10M,plot
Xiong Ni,Silver,1988,1974,M,14,diving10M,plot
Franziska van Almsick,Bronze,1992,1978,F,14,swimming100m,plot
Shannon Miller,Silver,1992,1977,F,15,gymnastics,plot
Chen Yuxi,Silver,2020,2005,F,15,diving10M,plot
Shirley Babashoff,Silver,1972,1957,F,15,swimming100m,plot
Faith Leech,Bronze,1956,1941,F,15,swimming100m,plot


player,detail,peak_date,birth_year,gender,age_at_peak,discipline,plot
Max Verstappen,Champion,2023,1997,M,26,formulaone,plot
Sebastian Vettel,Champion,2013,1987,M,26,formulaone,plot
Emerson Fittipaldi,Champion,1972,1946,M,26,formulaone,plot
Niki Lauda,Champion,1975,1949,M,26,formulaone,plot
Jacques Villeneuve,Champion,1997,1971,M,26,formulaone,plot
Michael Schumacher,Champion,1995,1969,M,26,formulaone,plot
Jim Clark,Champion,1963,1936,M,27,formulaone,plot
Jochen Rindt,Champion,1970,1942,M,28,formulaone,plot
Emerson Fittipaldi,Champion,1974,1946,M,28,formulaone,plot
Niki Lauda,Champion,1977,1949,M,28,formulaone,plot
