In [None]:
# default_exp leaderboard

# Leaderboard

> Visualising the public leaderboard.

In [None]:
#hide
# Development convenience: with mode 2, IPython re-imports modified modules before
# each cell runs, so edits to the imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
import typing
from fastcore.all import *
from ashrae import loading

In [None]:
#hide
# Make pandas' `.plot(...)` accessor render with plotly; the `leaderboard.plot(...)` cell
# further down relies on this setting.
pd.options.plotting.backend = "plotly"

In [None]:
%%time
# `csvs` is indexed by dataset name below (e.g. 'public-leaderboard') and the value is handed
# to `pd.read_csv` — presumably a name -> CSV path mapping; confirm in `ashrae.loading`.
csvs = loading.get_csvs()

In [None]:
# Read the public leaderboard export, parsing `SubmissionDate` as datetimes so it can be
# used as a time axis later, then preview the first rows.
leaderboard = pd.read_csv(csvs['public-leaderboard'], parse_dates=['SubmissionDate'])
leaderboard.head()

In [None]:
# (number of distinct teams, number of leaderboard rows)
leaderboard['TeamId'].nunique(), len(leaderboard)

In [None]:
#export
@typed
def get_leaderboard_distribution(df:pd.DataFrame) -> pd.DataFrame:
    """Count rows per distinct `Score` and add the cumulative share of rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a `Score` column; all other columns are ignored and `df`
        itself is not modified.

    Returns
    -------
    pd.DataFrame
        One row per distinct score, ordered by ascending `Score`, with columns
        `Score`, `Count` (number of rows of `df` having that score) and
        `Cumulative share (%)` (running total of `Count` as a percentage of all
        counted rows).

    Raises
    ------
    KeyError
        If `df` has no `Score` column.
    """
    # `groupby` already returns the groups ordered by key, so sorting the whole
    # frame beforehand is unnecessary work; `sort=True` states the ordering explicitly.
    counts = df.groupby('Score', sort=True).size()
    return (counts.to_frame('Count')
                  .reset_index()
                  .assign(**{'Cumulative share (%)': lambda x: 100 * x['Count'].cumsum()/x['Count'].sum()}))

In [None]:
%%time
dis = get_leaderboard_distribution(leaderboard)
dis.head()

In [None]:
# Summary statistics over the rows of `dis`, i.e. one value per distinct score (not weighted by `Count`).
# NOTE(review): for percentiles over all leaderboard rows, describe `leaderboard['Score']` instead —
# confirm which of the two was intended.
dis['Score'].describe(percentiles=[.05, .1, .25, .5, .75, .95])

Public scores:

| Segment | Score |
| --- | --- |
| top 50% | 1.44 | 
| top 5% | 0.98 | 

With the best private leaderboard score at 1.23, there seems to be some overfitting / leakage leading to those public scores.

Line plot of the above

In [None]:
# Cumulative share of leaderboard rows (y) at or below each distinct score (x).
px.line(dis, x='Score', y='Cumulative share (%)', title='Cumulative distribution of public leaderboard scores')

Looking at the temporal trend of the scores to get an idea of jumps

In [None]:
# One point per leaderboard row; drawn with plotly because the pandas plotting backend
# was switched to "plotly" near the top of the notebook.
leaderboard.plot(kind='scatter', x='SubmissionDate', y='Score', title='Trend of the public score over time')

Finding:
- Three clusters of scores appear over time: around 1.243, around 1.118 (from 2019-10-25 onwards) and around 0.979 (from 2019-11-20 onwards)

In [None]:
#hide
# nbdev footer: convert the notebooks' `#export` cells into library modules.
# NOTE(review): called without arguments, so presumably every project notebook is exported,
# not just this one — confirm against the nbdev version in use.
from nbdev.export import *
notebook2script()