# Stats about Train / Dev / Test sets

In [1]:
import pandas as pd

import sys
sys.path.insert(0, '../')
from utils.config import PATHS
from utils.latex import show_latex, TABLES

In [2]:
# Domain codes used throughout the notebook: they name the columns of the
# domain label tables and select the per-domain levels datasets on disk.
domains = [
    'ADM',
    'ATT',
    'BER',
    'ENR',
    'ETN',
    'FAC',
    'INS',
    'MBW',
    'STM',
]

# Domains classifier

## Load data

In [3]:
# The three pickled splits of the domains classifier live in the
# `clf_domains` folder below the `data_expr_sept` path.
datapath = PATHS.getpath('data_expr_sept')
train, test, dev = (
    pd.read_pickle(datapath / f'clf_domains/{filename}')
    for filename in ('train_excl_bck_add_pilot.pkl', 'test.pkl', 'dev.pkl')
)

## Check correct split

I.e. no note should appear in more than one set; each check below prints `False` when the two sets share no `NotitieID`.

In [4]:
# For every pair of splits, print whether any note ID of the first split
# also occurs in the second one (order: test/train, dev/train, dev/test).
for candidate, reference in ((test, train), (dev, train), (dev, test)):
    print(candidate.NotitieID.isin(reference.NotitieID).any())

False
False
False


## Number of sentences

- A sentence can be labelled with more than one domain; it is then counted once in each of those domain columns.
- The last column is the total number of sentences in the dataset (incl. all negative examples).

In [5]:
caption = "Domain classification: datasets, sentence-level"
label = "domains_datasets_sents"

# Stack the three splits, tagging every row with the split it came from.
splits = {'train': train, 'test': test, 'dev': dev}
data = pd.concat([frame.assign(dataset=name) for name, frame in splits.items()])

# One row per (dataset, sentence) and one column per domain; each row's
# `labels` entry is spread over the domain columns.
sentence_index = pd.MultiIndex.from_frame(data[['dataset', 'pad_sen_id']])
balance = pd.DataFrame(data.labels.to_list(), index=sentence_index, columns=domains)

# Number of rows per dataset, regardless of the label values.
dataset_sizes = balance.pivot_table(index='dataset', aggfunc='size')
dataset_sizes = dataset_sizes.rename('n_sentences')

# Per-domain sums per dataset, extended with the size column and a row
# holding the column totals over all datasets.
piv = balance.pivot_table(index='dataset', aggfunc='sum')
piv = piv.join(dataset_sizes)
piv.loc['total'] = piv.sum()

show_latex(piv, caption=caption, label=label)

Unnamed: 0_level_0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,n_sentences
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dev,411,22,29,105,225,119,127,96,147,21742
test,775,39,54,160,382,253,287,125,181,22082
train,4988,247,486,989,2420,2489,1967,755,3390,239153
total,6174,308,569,1254,3027,2861,2381,976,3718,282977


## Number of notes

In [6]:
caption = "Domain classification: datasets, note-level"
label = "domains_datasets_notes"

# Stack the three splits, tagging every row with the split it came from.
splits = {'train': train, 'test': test, 'dev': dev}
data = pd.concat([frame.assign(dataset=name) for name, frame in splits.items()])

# Start from one row per input row, indexed by (dataset, note), then
# collapse to one row per note: a domain counts for a note as soon as any
# of the note's rows carries it.
note_index = pd.MultiIndex.from_frame(data[['dataset', 'NotitieID']])
per_row = pd.DataFrame(data.labels.to_list(), index=note_index, columns=domains)
balance = per_row.groupby(level=[0, 1]).any()

# Number of distinct (dataset, note) rows per dataset.
dataset_sizes = balance.pivot_table(index='dataset', aggfunc='size')
dataset_sizes = dataset_sizes.rename('n_notes')

# Per-domain note counts per dataset, extended with the size column and a
# row holding the column totals over all datasets.
piv = balance.pivot_table(index='dataset', aggfunc='sum')
piv = piv.join(dataset_sizes)
piv.loc['total'] = piv.sum()

show_latex(piv, caption=caption, label=label)

Unnamed: 0_level_0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM,n_notes
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dev,188,17,25,71,128,75,78,71,83,431
test,231,27,34,92,165,95,116,64,94,431
train,2345,175,381,707,1416,1631,1260,546,1989,6821
total,2764,219,440,870,1709,1801,1454,681,2166,7683


# Levels classifiers

## Number of sentences

In [7]:
caption = "Levels classification: datasets, sentence-level"
label = "levels_datasets_sents"

# Each domain has its own `clf_levels_<DOM>_sents` folder holding
# train/dev/test pickles; the table reports the number of rows in each.
levels_root = PATHS.getpath('data_expr_sept')
split_names = ['train', 'dev', 'test']

# Build the table in a single constructor call. The pickles are read into
# temporaries only, so the notebook-level `train`/`test`/`dev`/`datapath`
# of the domains section are NOT overwritten with the last domain's levels
# data (which would make re-running the earlier cells silently wrong).
# Supplying complete integer columns also avoids the float upcast caused by
# filling an empty frame cell by cell.
table = pd.DataFrame(
    {
        dom: [
            len(pd.read_pickle(levels_root / f'clf_levels_{dom}_sents' / f'{split}.pkl'))
            for split in split_names
        ]
        for dom in domains
    },
    index=split_names,
)

table.astype(int).pipe(show_latex, caption=caption, label=label)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
train,5233,251,216,1005,2491,1086,1104,766,1420
dev,440,23,29,107,236,124,132,98,148
test,421,32,26,100,183,139,136,60,155


## Number of notes

In [8]:
caption = "Levels classification: datasets, note-level"
label = "levels_datasets_notes"

# Each domain has its own `clf_levels_<DOM>_sents` folder holding
# train/dev/test pickles; the table reports the number of distinct
# `NotitieID` values in each.
levels_root = PATHS.getpath('data_expr_sept')
split_names = ['train', 'dev', 'test']

# Build the table in a single constructor call. The pickles are read into
# temporaries only, so the notebook-level `train`/`test`/`dev`/`datapath`
# of the domains section are NOT overwritten with the last domain's levels
# data (which would make re-running the earlier cells silently wrong).
# Supplying complete integer columns also avoids the float upcast caused by
# filling an empty frame cell by cell.
table = pd.DataFrame(
    {
        dom: [
            pd.read_pickle(
                levels_root / f'clf_levels_{dom}_sents' / f'{split}.pkl'
            ).NotitieID.nunique()
            for split in split_names
        ]
        for dom in domains
    },
    index=split_names,
)

table.astype(int).pipe(show_latex, caption=caption, label=label)

Unnamed: 0,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
train,2344,175,162,705,1417,717,699,536,792
dev,189,17,25,71,128,74,77,71,83
test,200,21,22,70,123,79,74,41,84


# Save tables

In [9]:
from pathlib import Path

# Write every entry of TABLES to ./tables/<prefix>_<idx>.tex.
# NOTE(review): TABLES is imported from utils.latex alongside show_latex and
# is presumably filled by the show_latex calls above; confirm in that module.
prefix = 'ml_datasets'
out_dir = Path('./tables')

# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists instead of failing with FileNotFoundError.
out_dir.mkdir(parents=True, exist_ok=True)

# The loop variable is deliberately not called `table`, so the DataFrame of
# the previous cell is not replaced by a LaTeX string.
for idx, latex_table in enumerate(TABLES):
    with open(out_dir / f'{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
        f.write(latex_table)