# Create the C -> A dataset to be used by the fine-tuned BERT classifier

## Questions and thoughts
- Tutorial: https://huggingface.co/docs/transformers/custom_datasets
- Context texts must be limited to 512 tokens (Limit for BERT model)
- When labeling the dataset, should the labels be start, end, or start and inside? In other projects (with answer extraction) it seems they use start, end..
- Another option is to insert a higlight token around the sentence containing the answer, and then append the answers after a [SEP] token. As in: 
- There are multiple answer spans in the same context text.. Should those be labeled jointly? / should I have multiple instances of the same texts?
- My idea is to use the original text, no stopword removal or lemmatization.

In [104]:
# necessary library imports
import pandas as pd
import numpy as np
import math
import pickle

In [97]:
# data imports, to be combined into the final datastructure
CA_df = pd.read_pickle("./data/CA/labeled_CA_data_train.pkl")
CA_df_eval = pd.read_pickle("./data/CA/labeled_CA_data_eval.pkl")
CAR_df = pd.read_pickle("./data/CAR/labeled_CAR_data_train.pkl")
CAR_sent_class_df = pd.read_pickle("./data/CAR_classification/labeled_CAR_data_train.pkl")
CAR_sent_class_df_eval = pd.read_pickle("./data/CAR_classification/labeled_CAR_data_eval.pkl")
CRA_df = pd.read_pickle("./data/CRA/labeled_CRA_data_train.pkl")
CRA_df_eval = pd.read_pickle("./data/CRA/labeled_CRA_data_eval.pkl")

In [108]:
# compute the class weights to use in the training of the C -> A model (to account for the scarse dataset)
# idea for how to scale weights:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#calculate_class_weights
# https://medium.com/gumgum-tech/handling-class-imbalance-by-introducing-sample-weighting-in-the-loss-function-3bdebd8203b4

def get_class_distribution(labeled_df, is_sent_class):
    if is_sent_class:
        nr_classes = 2
    else:
        nr_classes = 3
    counts = np.zeros(nr_classes)
    for idx, point in labeled_df.iterrows():
        if 'labels' in point.keys():
            labels = point['labels']
            for label in labels:
                if int(label) >= 0:
                    counts[int(label)] += 1


        else:
            label = point['label']
            if int(label) >= 0:
                counts[int(label)] += 1

    num_labels = np.sum(counts)
    ins_weights_raw = 1 / counts
    ins_weights = ins_weights_raw * (num_labels/2)
    ins_weights_norm = ins_weights_raw / np.sum(ins_weights_raw) * nr_classes

    isns_weights_raw = 1 / np.sqrt(counts)
    isns_weights = isns_weights_raw * (math.sqrt(num_labels/2))
    isns_weights_norm = isns_weights_raw / np.sum(isns_weights_raw) * nr_classes

    # ENS
    B = 0.99999
    E_nc = (1.0 - np.power(B, counts)) / (1.0 - B)
    w = 1/E_nc
    # normalize:
    w = w / np.sum(w) * nr_classes

    # norm = np.linalg.norm(weights)
    # normal_array = weights/norm
    print('fraction of each label: ', counts/num_labels)
    print('INS: ',ins_weights)
    print('INS, norm: ',ins_weights_norm)
    print('ISNS: ',isns_weights)
    print('ISNS, norm: ',isns_weights_norm)
    print('ENS: ',w)




In [109]:
CA_all = pd.concat([CA_df, CA_df_eval])
print('number of data points: ', len(CA_all))
print('number of train data points: ', len(CA_df))
print('number of eval data points: ', len(CA_df_eval))
CA_all = pd.concat([CA_df, CA_df_eval])
print('Distribution training data: ')
get_class_distribution(CA_df, False)
print('Distribution training and eval data: ')
get_class_distribution(CA_all, False)

number of data points:  498
number of train data points:  400
number of eval data points:  98
Distribution training data: 
fraction of each label:  [0.97245636 0.00689324 0.0206504 ]
INS:  [ 0.51416189 72.53486395 24.2126029 ]
INS, norm:  [0.01585914 2.23731182 0.74682904]
ISNS:  [0.71705083 8.51674022 4.92063034]
ISNS, norm:  [0.15197742 1.80510527 1.04291731]
ENS:  [0.03203581 2.21865512 0.74930907]
Distribution training and eval data: 
fraction of each label:  [0.97257313 0.00690304 0.02052383]
INS:  [ 0.51410016 72.43186511 24.3619213 ]
INS, norm:  [0.0158497  2.23307281 0.7510775 ]
ISNS:  [0.71700778 8.51069122 4.93577971]
ISNS, norm:  [0.15187112 1.80266968 1.0454592 ]
ENS:  [0.03659775 2.20955627 0.75384598]


In [94]:
get_class_distribution(CAR_df, False)

fraction of each label:  [0.91798933 0.00454289 0.07746777]
INS:  [  0.54466864 110.06202837   6.45429684]
INS, norm:  [0.01395859 2.82063285 0.16540856]
ISNS:  [ 0.73801669 10.49104515  2.54053082]
ISNS, norm:  [0.16079271 2.28569837 0.55350893]
ENS:  [0.08833199 2.70726576 0.20440225]


In [102]:
CRA_all = pd.concat([CRA_df, CRA_df_eval])
print('number of data points: ', len(CRA_all))
print('number of train data points: ', len(CRA_df))
print('number of eval data points: ', len(CRA_df_eval))
print('Distribution training data: ')
get_class_distribution(CRA_df, False)
print('Distribution training and eval data: ')
get_class_distribution(CRA_all, False)

number of data points:  1569
number of train data points:  1274
number of eval data points:  295
Distribution training data: 
fraction of each label:  [0.99317432 0.0017285  0.00509718]
INS:  [  0.5034363  289.26768868  98.09344175]
INS, norm:  [0.00389391 2.23738681 0.75871928]
ISNS:  [ 0.70953245 17.00787137  9.90421333]
ISNS, norm:  [0.07706273 1.84723486 1.0757024 ]
ENS:  [0.02798288 2.21243967 0.75957744]
Distribution training and eval data: 
fraction of each label:  [0.99315231 0.00174335 0.00510434]
INS:  [  0.50344745 286.80376516  97.95586312]
INS, norm:  [0.00392029 2.23330848 0.76277123]
ISNS:  [ 0.70954031 16.93528167  9.89726544]
ISNS, norm:  [0.07728611 1.84466211 1.07805178]
ENS:  [0.03424446 2.20221551 0.76354003]


In [103]:
CAR_class_all = pd.concat([CAR_sent_class_df, CAR_sent_class_df_eval])
print('number of data points: ', len(CAR_class_all))
print('number of train data points: ', len(CAR_sent_class_df))
print('number of eval data points: ', len(CAR_sent_class_df_eval))
print('Distribution training data: ')
get_class_distribution(CAR_sent_class_df, True)
print('Distribution training and eval data: ')
get_class_distribution(CAR_sent_class_df, True)

number of data points:  50273
number of train data points:  41554
number of eval data points:  8719
Distribution training data: 
fraction of each label:  [0.92027242 0.07972758]
INS:  [0.54331738 6.27135527]
INS, norm:  [0.15945517 1.84054483]
ISNS:  [0.73710066 2.50426741]
ISNS, norm:  [0.45480837 1.54519163]
ENS:  [0.18601511 1.81398489]
Distribution training and eval data: 
fraction of each label:  [0.92027242 0.07972758]
INS:  [0.54331738 6.27135527]
INS, norm:  [0.15945517 1.84054483]
ISNS:  [0.73710066 2.50426741]
ISNS, norm:  [0.45480837 1.54519163]
ENS:  [0.18601511 1.81398489]


In [113]:
# check class distribution of CAR class data subset
train_path = "../../model/data/CAR_classification/data_subset/CAR_class_data_train.pkl"
val_path = "../../model/data/CAR_classification/data_subset/CAR_class_data_eval.pkl"
with open(train_path, "rb") as input_file:
        train_data = pickle.load(input_file)
with open(val_path, "rb") as input_file:
    val_data = pickle.load(input_file)

CAR_sub_train = pd.DataFrame(train_data)
CAR_sub_eval = pd.DataFrame(train_data)
CAR_sub_all = pd.concat([CAR_sub_train, CAR_sub_eval])
get_class_distribution(CAR_sub_all, True)

fraction of each label:  [0.925 0.075]
INS:  [0.54054054 6.66666667]
INS, norm:  [0.15 1.85]
ISNS:  [0.73521462 2.5819889 ]
ISNS, norm:  [0.44327375 1.55672625]
ENS:  [0.15236789 1.84763211]
