# Create the C -> A dataset to be used by the fine-tuned BERT classifier

## Questions and thoughts
- Tutorial: https://huggingface.co/docs/transformers/custom_datasets
- Context texts must be limited to 512 tokens (Limit for BERT model)
- When labeling the dataset, should the labels be start, end, or start and inside? In other projects (with answer extraction) it seems they use start, end..
- Another option is to insert a highlight token around the sentence containing the answer, and then append the answers after a [SEP] token. As in (illustrative): `... <hl> sentence containing the answer <hl> ... [SEP] answer`
- There are multiple answer spans in the same context text.. Should those be labeled jointly? / should I have multiple instances of the same texts?
- My idea is to use the original text, no stopword removal or lemmatization.

In [14]:
# necessary library imports
import pandas as pd
import numpy as np
import math

In [9]:
# Load the three labeled training sets; they are combined into the final
# data structure later on.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
CA_df, CAR_df, CRA_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "./data/labeled_CA_training_data.pkl",
        "./data/labeled_CAR_data_train.pkl",
        "./data/labeled_CRA_data_train.pkl",
    )
)

In [53]:
# compute the class weights to use in the training of the C -> A model (to account for the scarce dataset)
# idea for how to scale weights:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#calculate_class_weights
# https://medium.com/gumgum-tech/handling-class-imbalance-by-introducing-sample-weighting-in-the-loss-function-3bdebd8203b4

def _inverse_or_zero(values):
    """Return 1/v for every v in *values*, using 0.0 where v is zero."""
    return np.array([1 / value if value else 0.0 for value in values])


def _normalize_weights(weights, nr_classes):
    """Rescale *weights* so they sum to *nr_classes*; all-zero input stays zero."""
    total = np.sum(weights)
    if total == 0:
        return np.zeros_like(weights)
    return weights / total * nr_classes


def get_class_distribution(labeled_df: pd.DataFrame) -> None:
    """Print the label counts and several class-weighting schemes.

    Every value in the iterables stored in the ``'labels'`` column is
    counted: a label equal to 0 counts as class 0, a label equal to 1 as
    class 1, and *any other value* as class 2.

    The following weight vectors (ordered class 0, 1, 2) are printed:

    * INS        -- 1 / count, scaled by ``num_labels / 2``
    * INS, norm  -- 1 / count, rescaled so the weights sum to 3
    * ISNS       -- 1 / sqrt(count), scaled by ``sqrt(num_labels / 2)``
    * ISNS, norm -- 1 / sqrt(count), rescaled so the weights sum to 3
    * ENS        -- 1 / ((1 - B**count) / (1 - B)) with B = 0.99999,
                    rescaled so the weights sum to 3

    A class that never occurs gets weight 0.0 in every scheme instead of
    raising ``ZeroDivisionError``; an input without any labels prints
    all-zero weights.

    Args:
        labeled_df: DataFrame with a ``'labels'`` column whose cells are
            iterables of labels.
    """
    nr_classes = 3
    samples_per_class = [0, 0, 0]
    # Iterate the column directly: iterrows() would build a full Series
    # for every row just to read this single field.
    for labels in labeled_df['labels']:
        for label in labels:
            if label == 0:
                samples_per_class[0] += 1
            elif label == 1:
                samples_per_class[1] += 1
            else:
                samples_per_class[2] += 1
    num_zeros, num_ones, num_twos = samples_per_class
    num_labels = sum(samples_per_class)
    print('num labels: ', num_labels)
    print('num zeros: ', num_zeros)
    print('num ones: ', num_ones)
    print('num twos: ', num_twos)

    # INS: weights proportional to the inverse of the class counts.
    ins_weights_raw = _inverse_or_zero(samples_per_class)
    ins_weights = ins_weights_raw * (num_labels / 2)
    ins_weights_norm = _normalize_weights(ins_weights_raw, nr_classes)

    # ISNS: weights proportional to the inverse square root of the counts.
    isns_weights_raw = _inverse_or_zero(
        [math.sqrt(count) for count in samples_per_class]
    )
    isns_weights = isns_weights_raw * math.sqrt(num_labels / 2)
    isns_weights_norm = _normalize_weights(isns_weights_raw, nr_classes)

    # ENS: inverse of (1 - B**count) / (1 - B), which is 0 for an absent class.
    B = 0.99999
    E_nc = (1.0 - np.power(B, samples_per_class)) / (1.0 - B)
    w = _normalize_weights(_inverse_or_zero(E_nc), nr_classes)

    print('INS: ',ins_weights)
    print('INS, norm: ',ins_weights_norm)
    print('ISNS: ',isns_weights)
    print('ISNS, norm: ',isns_weights_norm)
    print('ENS: ',w)




In [54]:
# Print the label counts and class weights for CA_df
# (read from labeled_CA_training_data.pkl).
get_class_distribution(CA_df)

num labels:  210487
num zeros:  204714
num ones:  1453
num twos:  4320
INS:  [ 0.51410016 72.43186511 24.3619213 ]
INS, norm:  [0.0158497  2.23307281 0.7510775 ]
ISNS:  [0.71700778 8.51069122 4.93577971]
ISNS, norm:  [0.15187112 1.80266968 1.0454592 ]
ENS:  [0.90404534 1.17975022 0.91620444]


In [50]:
# Print the label counts and class weights for CAR_df
# (read from labeled_CAR_data_train.pkl).
get_class_distribution(CAR_df)

num labels:  729271
num zeros:  669463
num ones:  3313
num twos:  56495
INS:  [  0.54466864 110.06202837   6.45429684]
INS, norm:  [0.01395859 2.82063285 0.16540856]
ISNS:  [ 0.73801669 10.49104515  2.54053082]
ISNS, norm:  [0.16079271 2.28569837 0.55350893]
ENS:  [0.08833199 2.70726576 0.20440225]


In [47]:
# Print the label counts and class weights for CRA_df
# (read from labeled_CRA_data_train.pkl).
get_class_distribution(CRA_df)

num labels:  604604
num zeros:  600883
num ones:  940
num twos:  2781
INS:  [  0.50309628 321.59787234 108.70262496]
INS, norm:  [0.00350343 2.23952082 0.75697575]
ISNS:  [ 0.7092928 17.9331501 10.4260551]
ISNS, norm:  [0.07320221 1.85078191 1.07601587]
ENS:  [0.184452   2.05582795 0.75972005]
