# Create the C -> A dataset to be used by the fine-tuned BERT classifier

## Questions and thoughts
- Tutorial: https://huggingface.co/docs/transformers/custom_datasets
- Context texts must be limited to 512 tokens (Limit for BERT model)
- When labeling the dataset, should the labels be start, end, or start and inside? In other projects (with answer extraction) it seems they use start, end..
- Another option is to insert a higlight token around the sentence containing the answer, and then append the answers after a [SEP] token. As in: 
- There are multiple answer spans in the same context text.. Should those be labeled jointly? / should I have multiple instances of the same texts?
- My idea is to use the original text, no stopword removal or lemmatization.

In [1]:
# necessary library imports
import pandas as pd
import numpy as np
import math
import pickle

In [9]:
# data imports, to be combined into the final datastructure
CA_df = pd.read_pickle("./data/CA/labeled_CA_data_train.pkl")
CA_df_eval = pd.read_pickle("./data/CA/labeled_CA_data_eval.pkl")
CA_df_test = pd.read_pickle("./data/CA/labeled_CA_data_test.pkl")

# CAR_df = pd.read_pickle("./data/CAR/labeled_CAR_data_train.pkl")

CAR_sent_class_df = pd.read_pickle("./data/CAR_classification/labeled_CAR_data_train.pkl")
CAR_sent_class_df_eval = pd.read_pickle("./data/CAR_classification/labeled_CAR_data_eval.pkl")
CAR_sent_class_df_test = pd.read_pickle("./data/CAR_classification/labeled_CAR_data_test.pkl")

CRA_df = pd.read_pickle("./data/CRA/labeled_CRA_data_train.pkl")
CRA_df_eval = pd.read_pickle("./data/CRA/labeled_CRA_data_eval.pkl")
CRA_df_test = pd.read_pickle("./data/CRA/labeled_CRA_data_test.pkl")

In [10]:
# compute the class weights to use in the training of the C -> A model (to account for the scarse dataset)
# idea for how to scale weights:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#calculate_class_weights
# https://medium.com/gumgum-tech/handling-class-imbalance-by-introducing-sample-weighting-in-the-loss-function-3bdebd8203b4

def get_class_distribution(labeled_df, is_sent_class):
    if is_sent_class:
        nr_classes = 2
    else:
        nr_classes = 3
    counts = np.zeros(nr_classes)
    for idx, point in labeled_df.iterrows():
        if 'labels' in point.keys():
            labels = point['labels']
            for label in labels:
                if int(label) >= 0:
                    counts[int(label)] += 1


        else:
            label = point['label']
            if int(label) >= 0:
                counts[int(label)] += 1

    num_labels = np.sum(counts)
    ins_weights_raw = 1 / counts
    ins_weights = ins_weights_raw * (num_labels/2)
    ins_weights_norm = ins_weights_raw / np.sum(ins_weights_raw) * nr_classes

    isns_weights_raw = 1 / np.sqrt(counts)
    isns_weights = isns_weights_raw * (math.sqrt(num_labels/2))
    isns_weights_norm = isns_weights_raw / np.sum(isns_weights_raw) * nr_classes

    # ENS
    B = 0.99999
    E_nc = (1.0 - np.power(B, counts)) / (1.0 - B)
    w = 1/E_nc
    # normalize:
    w = w / np.sum(w) * nr_classes

    # norm = np.linalg.norm(weights)
    # normal_array = weights/norm
    print('fraction of each label: ', counts/num_labels)
    print('INS: ',ins_weights)
    print('INS, norm: ',ins_weights_norm)
    print('ISNS: ',isns_weights)
    print('ISNS, norm: ',isns_weights_norm)
    print('ENS: ',w)




In [11]:
CA_all = pd.concat([CA_df, CA_df_eval])
print('number of data points: ', len(CA_all))
print('number of train data points: ', len(CA_df))
print('number of eval data points: ', len(CA_df_eval))
print('number of test data points: ', len(CA_df_test))
CA_all = pd.concat([CA_df, CA_df_eval])
print('Distribution training data: ')
get_class_distribution(CA_df, False)
print('Distribution training and eval data: ')
get_class_distribution(CA_all, False)

number of data points:  557
number of train data points:  453
number of eval data points:  104
number of test data points:  41
Distribution training data: 
fraction of each label:  [0.9733263  0.00662795 0.02004575]
INS:  [ 0.51370234 75.43814027 24.94293903]
INS, norm:  [0.0152744 2.2430736 0.741652 ]
ISNS:  [0.71673031 8.68551324 4.99429064]
ISNS, norm:  [0.14935476 1.80991754 1.0407277 ]
ENS:  [0.03317115 2.22251493 0.74431392]
Distribution training and eval data: 
fraction of each label:  [0.97339812 0.00665904 0.01994284]
INS:  [ 0.51366444 75.08590734 25.07165879]
INS, norm:  [0.01530719 2.23755805 0.74713477]
ISNS:  [0.71670387 8.66521248 5.00716075]
ISNS, norm:  [0.14942665 1.80662299 1.04395036]
ENS:  [0.0380304  2.21191891 0.75005068]


In [12]:
# get_class_distribution(CAR_df, False)

In [13]:
CRA_all = pd.concat([CRA_df, CRA_df_eval])
print('number of data points: ', len(CRA_all))
print('number of train data points: ', len(CRA_df))
print('number of eval data points: ', len(CRA_df_eval))
print('number of test data points: ', len(CRA_df_test))
print('Distribution training data: ')
get_class_distribution(CRA_df, False)
print('Distribution training and eval data: ')
get_class_distribution(CRA_all, False)

number of data points:  1734
number of train data points:  1419
number of eval data points:  315
number of test data points:  80
Distribution training data: 
fraction of each label:  [0.9928871  0.00178927 0.00532363]
INS:  [  0.50358193 279.44283698  93.92089658]
INS, norm:  [0.00404086 2.24231559 0.75364355]
ISNS:  [ 0.70963507 16.71654381  9.69127941]
ISNS, norm:  [0.07850681 1.84934852 1.07214466]
ENS:  [0.03116612 2.21420097 0.7546329 ]
Distribution training and eval data: 
fraction of each label:  [0.99287172 0.00180052 0.00532776]
INS:  [  0.50358973 277.6971709   93.84809756]
INS, norm:  [0.00406067 2.23919922 0.75674011]
ISNS:  [ 0.70964056 16.66424828  9.68752278]
ISNS, norm:  [0.07867002 1.84738127 1.07394872]
ENS:  [0.03785552 2.20446832 0.75767616]


In [14]:
CAR_class_all = pd.concat([CAR_sent_class_df, CAR_sent_class_df_eval])
print('number of data points: ', len(CAR_class_all))
print('number of train data points: ', len(CAR_sent_class_df))
print('number of eval data points: ', len(CAR_sent_class_df_eval))
print('number of test data points: ', len(CAR_sent_class_df_test))
print('Distribution training data: ')
get_class_distribution(CAR_sent_class_df, True)
print('Distribution training and eval data: ')
get_class_distribution(CAR_sent_class_df, True)

number of data points:  54214
number of train data points:  45073
number of eval data points:  9141
number of test data points:  1894
Distribution training data: 
fraction of each label:  [0.91458301 0.08541699]
INS:  [0.54669723 5.85363636]
INS, norm:  [0.17083398 1.82916602]
ISNS:  [0.73938977 2.41942893]
ISNS, norm:  [0.46814321 1.53185679]
ENS:  [0.20111125 1.79888875]
Distribution training and eval data: 
fraction of each label:  [0.91458301 0.08541699]
INS:  [0.54669723 5.85363636]
INS, norm:  [0.17083398 1.82916602]
ISNS:  [0.73938977 2.41942893]
ISNS, norm:  [0.46814321 1.53185679]
ENS:  [0.20111125 1.79888875]


In [113]:
# check class distribution of CAR class data subset
train_path = "../../model/data/CAR_classification/data_subset/CAR_class_data_train.pkl"
val_path = "../../model/data/CAR_classification/data_subset/CAR_class_data_eval.pkl"
with open(train_path, "rb") as input_file:
        train_data = pickle.load(input_file)
with open(val_path, "rb") as input_file:
    val_data = pickle.load(input_file)

CAR_sub_train = pd.DataFrame(train_data)
CAR_sub_eval = pd.DataFrame(train_data)
CAR_sub_all = pd.concat([CAR_sub_train, CAR_sub_eval])
get_class_distribution(CAR_sub_all, True)

fraction of each label:  [0.925 0.075]
INS:  [0.54054054 6.66666667]
INS, norm:  [0.15 1.85]
ISNS:  [0.73521462 2.5819889 ]
ISNS, norm:  [0.44327375 1.55672625]
ENS:  [0.15236789 1.84763211]
