- Testing the scoring function.
- Simulations using the training data votes.

In [19]:
import numpy as np
import pandas as pd
import pandas.api.types

import kaggle_metric_utilities

from typing import Optional


In [20]:
class ParticipantVisibleError(Exception):
    """Raised by `score` when the solution/submission inputs are rejected.

    NOTE(review): the name suggests the hosting platform shows the message to
    the participant - confirm against the platform documentation.
    """


def kl_divergence(solution: pd.DataFrame, submission: pd.DataFrame, epsilon: float, micro_average: bool, sample_weights: Optional[pd.Series]):
    """Kullback-Leibler divergence of the predictions from the targets.

    Every cell contributes ``y * log(y / p)``, where ``y`` is the value in
    `solution` and ``p`` the matching value in `submission` clipped to
    ``[epsilon, 1 - epsilon]``. Cells with ``y == 0`` contribute exactly zero.

    Neither `solution` nor `submission` is modified; all work happens on
    private float copies.

    Parameters
    ----------
    solution : pd.DataFrame
        Target probabilities, one column per class. Values must be castable
        to float.
    submission : pd.DataFrame
        Predicted probabilities. Must contain every column of `solution` and
        share its index (rows are matched by index label).
    epsilon : float
        Clipping bound applied to the predictions.
    micro_average : bool
        If True, sum the cell losses per row and average over rows (using
        `sample_weights` when given). If False, average the per-column means;
        `sample_weights` is then ignored.
    sample_weights : Optional[pd.Series]
        Per-row weights forwarded to ``np.average``; ``None`` for a plain mean.

    Returns
    -------
    The averaged divergence, as returned by ``np.average``.
    """
    # Private float copy of the targets; it is overwritten column by column
    # with the per-cell divergence so the caller's frame stays intact.
    divergence = solution.astype(float)

    for col in divergence.columns:
        # Clip both the min and max following Kaggle conventions for related metrics like log loss
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
        # The clipped values live in a temporary Series and are never written back to `submission`.
        predicted = submission[col].astype(float).clip(lower=epsilon, upper=1 - epsilon)

        y_nonzero_indices = divergence[col] != 0
        observed = divergence.loc[y_nonzero_indices, col]
        divergence.loc[y_nonzero_indices, col] = observed * np.log(observed / predicted.loc[y_nonzero_indices])
        # Set the loss equal to zero where y_true equals zero following the scipy convention:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr
        divergence.loc[~y_nonzero_indices, col] = 0

    if micro_average:
        return np.average(divergence.sum(axis=1), weights=sample_weights)
    else:
        return np.average(divergence.mean())


def score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        row_id_column_name: str,
        epsilon: float=10**-15,
        micro_average: bool=True,
        sample_weights_column_name: Optional[str]=None
    ) -> float:
    ''' The Kullback–Leibler divergence.
    The KL divergence is technically undefined/infinite where the target equals zero.

    This implementation always assigns those cases a score of zero; effectively removing them from consideration.
    The predictions in each row must add to one so any probability assigned to a case where y == 0 reduces
    another prediction where y > 0, so crucially there is an important indirect effect.

    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence

    The row id column (and the sample weights column, if any) is removed from new frames,
    not from the frames passed in by the caller.

    solution: pd.DataFrame. Row id column, one column per class and, optionally, the sample weights column.
    submission: pd.DataFrame. Row id column plus at least every class column present in `solution`.
    epsilon: KL divergence is undefined for p=0 or p=1. Submission probabilities are clipped to max(eps, min(1 - eps, p)).
    row_id_column_name: str. Name of the id column; it is dropped from both frames before scoring.
    micro_average: bool. Row-wise average if True, column-wise average if False.
    sample_weights_column_name: Optional[str]. Name of a `solution` column holding per-row weights.
        Only allowed together with `micro_average=True`.

    Raises
    ------
    ParticipantVisibleError
        If the sample weights column is missing from `solution`, if sample weights are combined with
        `micro_average=False`, or if `submission` lacks one of the `solution` columns.
    KeyError
        If `row_id_column_name` is not a column of `solution` or `submission`.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> score(pd.DataFrame({'id': range(4), 'ham': [0, 1, 1, 0], 'spam': [1, 0, 0, 1]}), pd.DataFrame({'id': range(4), 'ham': [.1, .9, .8, .35], 'spam': [.9, .1, .2, .65]}), row_id_column_name=row_id_column_name)
    0.216161...
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> score(solution, submission, 'id')
    0.0
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0.2, 0.3, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.7, 0.2, 0]})
    >>> score(solution, submission, 'id')
    0.160531...
    '''
    # `drop` returns new frames, so the caller's objects keep their id column
    # and can be passed to `score` again without a defensive `.copy()`.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(f'{sample_weights_column_name} not found in solution columns')
        # Safe to pop: `solution` is now a local frame, not the caller's.
        sample_weights = solution.pop(sample_weights_column_name)

    if sample_weights_column_name and not micro_average:
        raise ParticipantVisibleError('Sample weights are only valid if `micro_average` is `True`')

    # Every remaining solution column is a class that the submission must predict.
    for col in solution.columns:
        if col not in submission.columns:
            raise ParticipantVisibleError(f'Missing submission column {col}')

    # NOTE(review): presumably these raise when a frame does not hold valid
    # probabilities - confirm against kaggle_metric_utilities.
    kaggle_metric_utilities.verify_valid_probabilities(solution, 'solution')
    kaggle_metric_utilities.verify_valid_probabilities(submission, 'submission')

    return kaggle_metric_utilities.safe_call_score(kl_divergence, solution, submission, epsilon=epsilon, micro_average=micro_average, sample_weights=sample_weights)

Example given in docstrings:

In [14]:
row_id_column_name = "id"

# Same call as the first docstring example: one-hot targets against soft predictions.
score(
    pd.DataFrame({'id': range(4), 'ham': [0, 1, 1, 0], 'spam': [1, 0, 0, 1]}),
    pd.DataFrame({'id': range(4), 'ham': [.1, .9, .8, .35], 'spam': [.9, .1, .2, .65]}),
    row_id_column_name=row_id_column_name,
)

0.21616187468057918

Some simple examples with 6 classes:

In [73]:
id_name = range(6)

# Ground truth: one-hot votes, sample i belongs entirely to class i.
d = dict(
    id_name=id_name,
    seizure=[1, 0, 0, 0, 0, 0],
    lpd=[0, 1, 0, 0, 0, 0],
    gpd=[0, 0, 1, 0, 0, 0],
    lrda=[0, 0, 0, 1, 0, 0],
    grda=[0, 0, 0, 0, 1, 0],
    other=[0, 0, 0, 0, 0, 1],
)
solution = pd.DataFrame(d)

# Predictions: identical to the ground truth, so every row is a hit.
d = dict(
    id_name=id_name,
    seizure=[1, 0, 0, 0, 0, 0],
    lpd=[0, 1, 0, 0, 0, 0],
    gpd=[0, 0, 1, 0, 0, 0],
    lrda=[0, 0, 0, 1, 0, 0],
    grda=[0, 0, 0, 0, 1, 0],
    other=[0, 0, 0, 0, 0, 1],
)
submission = pd.DataFrame(d)

print("all correct")
score(solution, submission, 'id_name')


all correct


0.0

Always use floats for the probabilities (write 1.0 and 0.0, not 1 and 0).

In [72]:
id_name = range(6)

# Ground truth: sample i belongs entirely to class i.
d = dict(
    id_name=id_name,
    seizure=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    lpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
)
solution = pd.DataFrame(d)

# Predictions: the first five samples are assigned to the class after the
# true one; only the last sample ('other') is predicted correctly.
d = dict(
    id_name=id_name,
    seizure=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    lpd=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
)
submission = pd.DataFrame(d)

print("One hit (1)")
score(solution, submission, 'id_name')


One hit (1)


28.78231366242557

In [74]:
id_name = range(6)

# Ground truth: sample i belongs entirely to class i.
d = dict(
    id_name=id_name,
    seizure=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    lpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
)
solution = pd.DataFrame(d)

# Predictions: the first four samples are assigned to the class after the
# true one; the last two ('grda' and 'other') are predicted correctly.
d = dict(
    id_name=id_name,
    seizure=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    lpd=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
)
submission = pd.DataFrame(d)

print("2 hits (1)")
score(solution, submission, 'id_name')


2 hits (1)


23.025850929940457

In [76]:
id_name = range(6)

# Ground truth: samples 0-4 are one-hot; the last sample is split 50/50
# between 'seizure' and 'other'.
d = dict(
    id_name=id_name,
    seizure=[1.0, 0.0, 0.0, 0.0, 0.0, 0.5],
    lpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 0.0, 0.5],
)
solution = pd.DataFrame(d)

# Predictions: the first four samples are wrong; 'grda' is right and the
# 50/50 split of the last sample is reproduced exactly.
d = dict(
    id_name=id_name,
    seizure=[0.0, 0.0, 0.0, 0.0, 0.0, 0.5],
    lpd=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    gpd=[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    lrda=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
    grda=[0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
    other=[0.0, 0.0, 0.0, 0.0, 0.0, 0.5],
)
submission = pd.DataFrame(d)

print("2 hits again, but changing 1 for 0.5, same score")
score(solution, submission, 'id_name')


2 hits again, but changing 1 for 0.5, same score


23.025850929940457

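Each completely wrong row adds about 5.76 to the score (−ln(1e-15) / 6 rows ≈ 34.54 / 6), so each full hit lowers the score by that amount.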

## Simulations

In [9]:
base_dir = "../../kaggle_data/hms"

df = pd.read_csv(base_dir + '/train.csv')

# Transform the raw vote counts into fractions of the total votes per row.
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
# Built-in sum adds the columns left to right, like a chain of `+`.
df['sum_votes'] = sum(df[name] for name in vote_columns)
df[vote_columns] = df[vote_columns].div(df['sum_votes'], axis=0)
df


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,sum_votes
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106795,351917269,6,12.0,2147388374,6,12.0,4195677307,10351,LRDA,0.0,0.0,0.0,1.0,0.0,0.0,3
106796,351917269,7,14.0,2147388374,7,14.0,290896675,10351,LRDA,0.0,0.0,0.0,1.0,0.0,0.0,3
106797,351917269,8,16.0,2147388374,8,16.0,461435451,10351,LRDA,0.0,0.0,0.0,1.0,0.0,0.0,3
106798,351917269,9,18.0,2147388374,9,18.0,3786213131,10351,LRDA,0.0,0.0,0.0,1.0,0.0,0.0,3


In [8]:
# Positional selection: discard the trailing column (`sum_votes` in the frame
# shown above), then keep the six columns before it. This depends on the
# column order of `df`.
columns = df.columns[:-1][-6:]
columns

Index(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       'other_vote'],
      dtype='object')

In [17]:
# It needs an id column, score() deletes it.
solution_const = pd.DataFrame({'id' : list(range(len(df)))})
solution_const = pd.concat([solution_const, df[columns]], axis=1)
solution_const

Unnamed: 0,id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1.0,0.0,0.0,0.0,0.0,0.0
3,3,1.0,0.0,0.0,0.0,0.0,0.0
4,4,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
106795,106795,0.0,0.0,0.0,1.0,0.0,0.0
106796,106796,0.0,0.0,0.0,1.0,0.0,0.0
106797,106797,0.0,0.0,0.0,1.0,0.0,0.0
106798,106798,0.0,0.0,0.0,1.0,0.0,0.0


Testing that this works as expected:

In [21]:
# Sanity check: scoring the targets against an exact copy of themselves
# should give (numerically) zero.
solution, submission = solution_const.copy(), solution_const.copy()

score(solution, submission, 'id')

5.30547308125413e-16

Nearly equal probabilities for every class (0.166 or 0.167 each), with different sample sizes:

In [36]:
# Fixed seed so the sampled rows, and therefore the printed scores, are
# reproducible on every run.
SAMPLE_SEED = 42
# Near-uniform prediction over the six classes; the values add up to 1.
class_probabilities = [0.166, 0.166, 0.167, 0.167, 0.167, 0.167]

for N in [5000, 50000, 100000]:
    # `sample` returns a new frame, so `solution_const` itself is never touched.
    solution = solution_const.sample(N, random_state=SAMPLE_SEED)
    submission = solution.copy()
    cols = submission.columns[1:]  # every column except the leading 'id'
    assert len(cols) == len(class_probabilities), "expected one probability per class column"
    for col, probability in zip(cols, class_probabilities):
        submission[col] = probability

    print(score(solution, submission, 'id'))


1.4008134244377037
1.4049901851368456
1.4027555251734336


Always predicting the same class (the first one, seizure) with probability 1, with different sample sizes:

In [43]:
# Fixed seed so the sampled rows, and therefore the printed scores, are
# reproducible on every run.
SAMPLE_SEED = 42

for N in [5000, 50000, 100000]:
    # `sample` returns a new frame, so `solution_const` itself is never touched.
    solution = solution_const.sample(N, random_state=SAMPLE_SEED)
    submission = solution.copy()
    cols = submission.columns[1:]  # every column except the leading 'id'
    assert len(cols) == 6, "expected six class columns"
    # Put all the probability mass on the first class column, none on the rest.
    submission[cols] = 0.0
    submission[cols[0]] = 1.0

    print(score(solution, submission, 'id'))


27.02049716044172
26.92139152747061
26.966591881384115
