In [1]:
import utils_ecthr as utils
from transformers import AutoTokenizer
from pprint import pprint

# Tokenizer of the GPT-Neo 2.7B checkpoint; it is passed to the length
# summaries and to the tokenization example further down.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")

Load the dataset twice: once with the complete facts and once restricted to the silver rationales

In [2]:
# Full variant: allegation labels, multi-label, silver rationales switched off.
dataset_all = utils.load_ecthr_dataset(allegations = True, silver = False, is_multi_label= True)
# NOTE(review): the positional arguments are presumably
# (allegations, silver, is_multi_label), i.e. the same call as above with
# silver=True — confirm against utils_ecthr.
dataset_silver = utils.load_ecthr_dataset(True, True, True)

See how long the texts are

In [3]:
# Character and token length statistics of the "facts" column for the
# training split of the full dataset.
train_all = dataset_all["train"].to_pandas()
utils.summarize_text_column(train_all, "facts", tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (8665 > 2048). Running this sequence through the model will result in indexing errors


Character Lengths of 'facts':
count      9000.000000
mean       9798.360000
std       12082.929808
min         390.000000
25%        2911.750000
50%        5929.000000
75%       12046.250000
max      209861.000000
Name: facts, dtype: float64

Token Lengths of 'facts' using the provided tokenizer:
count     9000.000000
mean      2081.563333
std       2575.485235
min         91.000000
25%        631.000000
50%       1261.500000
75%       2523.500000
max      47806.000000
Name: facts, dtype: float64


In [4]:
# Same length statistics for the training split of the silver-rationale
# dataset, so the two summaries can be compared directly.
train_silver = dataset_silver["train"].to_pandas()
utils.summarize_text_column(train_silver, "facts", tokenizer)

Character Lengths of 'facts':
count      9000.000000
mean       5298.743889
std        7637.059861
min          38.000000
25%        1452.000000
50%        2969.500000
75%        6164.250000
max      209861.000000
Name: facts, dtype: float64

Token Lengths of 'facts' using the provided tokenizer:
count     9000.000000
mean      1125.161444
std       1630.668427
min          8.000000
25%        315.000000
50%        636.000000
75%       1308.000000
max      47806.000000
Name: facts, dtype: float64


Multi-label: look at the spread of the classes

In [5]:
# Multi-label datasets for both tasks.
# NOTE(review): the first positional flag is presumably `allegations`
# (True -> allegation labels, False -> violation labels), the second `silver`
# and the third `is_multi_label` — confirm against utils_ecthr.
allegations_multi_variable = utils.load_ecthr_dataset(True, False, True)
violation_multi_variable = utils.load_ecthr_dataset(False, False, True)

In [6]:
# Same arguments as `allegations_multi_variable`, plus frequency_threshold=150.
# NOTE(review): presumably labels with fewer than 150 training occurrences are
# dropped — confirm in utils_ecthr.
allegations_multi_variable_reduced = utils.load_ecthr_dataset(True, False, True, frequency_threshold=150)

In [7]:
import pandas as pd


# The multi-labels are one-hot encoded; for every position, count how often it appears.
def return_label_distribution(df):
    """Count how often each label position is set in the ``labels`` column.

    Args:
        df: Table-like object whose ``labels`` column holds one sequence of
            0/1 flags per row (one flag per class position).

    Returns:
        A ``pd.DataFrame`` with the columns ``label`` (position index) and
        ``count`` (number of rows in which that position equals 1). Rows are
        in the order in which each position was first seen active.
    """
    occurrences = {}
    for encoded in df["labels"]:
        # Positions whose flag is set for this row.
        active_positions = [pos for pos, flag in enumerate(encoded) if flag == 1]
        for pos in active_positions:
            occurrences[pos] = occurrences.get(pos, 0) + 1
    return pd.DataFrame(occurrences.items(), columns=["label", "count"])

In [8]:
# Per-label counts in the training split of the allegation dataset.
return_label_distribution(allegations_multi_variable["train"].to_pandas())

Unnamed: 0,label,count
0,7,1056
1,12,1665
2,5,5437
3,13,444
4,22,1558
5,8,81
6,9,441
7,18,547
8,10,162
9,1,623


In [9]:
# Per-label counts in the training split of the violation dataset.
return_label_distribution(violation_multi_variable["train"].to_pandas())

Unnamed: 0,label,count
0,7,710
1,5,4704
2,9,291
3,1,505
4,4,1368
5,12,1238
6,2,1349
7,10,110
8,22,1421
9,8,41


In [10]:
# Per-label counts in the training split of the frequency-thresholded
# allegation dataset.
return_label_distribution(allegations_multi_variable_reduced["train"].to_pandas())

Unnamed: 0,label,count
0,7,1056
1,12,1665
2,5,5437
3,13,444
4,22,1558
5,9,441
6,18,547
7,10,162
8,1,623
9,4,1623


In [11]:
# Label counts with and without the frequency threshold, computed on the
# respective training splits.
reduced_train = allegations_multi_variable_reduced["train"].to_pandas()
non_reduced_train = allegations_multi_variable["train"].to_pandas()
reduced = return_label_distribution(reduced_train)
non_reduced = return_label_distribution(non_reduced_train)

# Outer join on the label index so labels present in only one of the two
# tables are kept (their missing count shows up as NaN).
joined = pd.merge(
    reduced,
    non_reduced,
    how="outer",
    on="label",
    suffixes=("_reduced", "_non_reduced"),
)

In [12]:
# Side-by-side counts; a missing value in one count column means the label
# occurs in only one of the two merged tables.
joined

Unnamed: 0,label,count_reduced,count_non_reduced
0,1,623.0,623
1,2,1740.0,1740
2,3,,26
3,4,1623.0,1623
4,5,5437.0,5437
5,6,,72
6,7,1056.0,1056
7,8,,81
8,9,441.0,441
9,10,162.0,162


Binary: look at the spread of the classes

In [13]:
# Binary-label variants of both tasks.
# NOTE(review): presumably (allegations, silver, is_multi_label); the third
# flag being False would select the binary labels, and silver=True is used here
# unlike the multi-label cells above — confirm against utils_ecthr.
allegation_binary = utils.load_ecthr_dataset(True, True, False)
violation_binary = utils.load_ecthr_dataset(False, True, False)

Binary classification for allegations doesn't seem sensible (nearly every training example is positive, see the counts below), but you do you


In [14]:
def print_label_distribution_binary(df):
    """Pretty-print how often each value occurs in the ``labels`` column.

    Args:
        df: Table-like object whose ``labels`` column holds one hashable
            label per row.

    Prints a list of ``(label, count)`` tuples ordered by descending count;
    labels with equal counts keep the order in which they were first seen.
    Nothing is returned.
    """
    tally = {}
    for value in df["labels"]:
        tally[value] = tally.get(value, 0) + 1
    # Negated count as key keeps the sort stable for ties.
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    pprint(ranked)

In [15]:
# Label counts for the binary allegation task (training split).
print_label_distribution_binary(allegation_binary["train"].to_pandas())

[(1, 8955), (0, 45)]


In [16]:
# Label counts for the binary violation task (training split).
print_label_distribution_binary(violation_binary["train"].to_pandas())

[(1, 8238), (0, 762)]


In [17]:
# NOTE(review): same arguments as `dataset_silver` in cell 2 — presumably an
# identical reload; confirm whether the earlier object could be reused.
example_data = utils.load_ecthr_dataset(True, True, True)

# NOTE(review): 3000 is presumably the maximum token length; the tokenizer
# warning printed earlier reports a 2048-token limit for this model, so confirm
# the intended value before feeding these ids to the model.
thing = utils.tokenize_dataset(example_data, tokenizer, 3000)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
# Tokenized training split: shows its columns and row count.
thing["train"]

Dataset({
    features: ['facts', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 9000
})

In [19]:
# Token count of the first tokenized training example. The row is indexed
# before the column so only that one example is read, instead of building the
# complete `input_ids` column just to take its first element.
len(thing["train"][0]["input_ids"])

2018