## Fix duplicate data

In [1]:
import os

import pandas as pd
import numpy as np

from main import load_datasets, FeverLoader, PubhealthLoader, ClimateFeverLoader
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
root = '../data_2023_06_02'

fever_dir = os.path.join(root, 'preprocessed/FEVER')
pubhealth_dir = os.path.join(root, 'preprocessed/PUBHEALTH')
climate_dir = os.path.join(root, 'preprocessed/CLIMATE-FEVER')

In [5]:
climate_params = {
    'dev_size': 200,
    'test_size': 200,
    'random_state': 392
}


"""
Load and prepare datasets for experiments
"""
#===================================================
# Load preprocessed datasets
#===================================================
fever_train_ds, fever_dev_ds, fever_test_ds = FeverLoader.load(fever_dir)
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = PubhealthLoader.load(pubhealth_dir)
climate_train_ds, climate_dev_ds, climate_test_ds = ClimateFeverLoader.load(climate_dir, climate_params)

### Check for duplicates

In [55]:
ds = fever_test_ds
df = pd.DataFrame(ds)

In [58]:
df.shape

(9999, 3)

In [57]:
df.drop_duplicates().shape

(9944, 3)

In [53]:
df[df.duplicated(keep=False)].sort_values(by=["claim", "label"]).head().iloc[3].to_dict()

IndexError: single positional indexer is out-of-bounds

## Testing some code

In [12]:
import os
from main import load_datasets

import numpy as np
from sklearn.utils import class_weight

In [4]:
root = '../data_2023_06_02'

fever_dir = os.path.join(root, 'preprocessed/FEVER')
pubhealth_dir = os.path.join(root, 'preprocessed/PUBHEALTH')
climate_dir = os.path.join(root, 'preprocessed/CLIMATE-FEVER')

ds1, ds2, ds3, ds_test = load_datasets(fever_dir, pubhealth_dir, climate_dir)

In [19]:
class_weights = class_weight.compute_class_weight(
    "balanced", 
    classes=np.array([0,1,2]), 
    y=ds3["train"]["label"]
)

In [20]:
class_weights

array([0.70474138, 1.81666667, 0.97032641])

In [29]:
fmt = "../model/{}/{}"

In [31]:
model_dir = "../model/v3/"

In [37]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [38]:
DATASETS = ["FEVER", "PUBHEALTH", "CLIMATE"]
MODELS =  ["BERT", "RoBERTa", "ALBERT", "SciBERT", "BioBERT"]

In [None]:
model_dir "../models/v3/"

In [40]:
[os.path.join(model_dir, f"{model_name}_{ds_name}") for model_name in MODELS for ds_name in DATASETS]

['../model/v3/BERT_FEVER',
 '../model/v3/BERT_PUBHEALTH',
 '../model/v3/BERT_CLIMATE',
 '../model/v3/RoBERTa_FEVER',
 '../model/v3/RoBERTa_PUBHEALTH',
 '../model/v3/RoBERTa_CLIMATE',
 '../model/v3/ALBERT_FEVER',
 '../model/v3/ALBERT_PUBHEALTH',
 '../model/v3/ALBERT_CLIMATE',
 '../model/v3/SciBERT_FEVER',
 '../model/v3/SciBERT_PUBHEALTH',
 '../model/v3/SciBERT_CLIMATE',
 '../model/v3/BioBERT_FEVER',
 '../model/v3/BioBERT_PUBHEALTH',
 '../model/v3/BioBERT_CLIMATE']

In [114]:
import json
import os

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

In [42]:
climate_path = '../data_2023_06_02/raw/CLIMATE-FEVER/climate-fever-dataset-r1.jsonl'

In [139]:
def read_json(fp):
    with open(fp, "r", encoding="utf-8") as f:
        data = []
        for line in f.readlines():
            data.append(json.loads(line.strip()))
        return data

In [44]:
def read_json2(fp):
    with open(fp, "r") as f:
        data = [json.loads(item) for item in list(f)]
    return data

In [103]:
ls1 = read_json1(climate_path)
ls2 = read_json2(climate_path)

In [51]:
len(ls1)

1535

In [53]:
df = pd.DataFrame.from_records(ls1)

In [55]:
df['claim_label'].unique()

array(['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO', 'DISPUTED'], dtype=object)

In [57]:
def drop_labels(sample):
    return sample['claim_label'] in ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO']

In [63]:
ls2 = list(filter(drop_labels, ls1))

In [66]:
print(len(ls1),len(ls2))

1535 1381


In [67]:
def standardize_labels(sample):
    label = sample['claim_label']
    if label == "NOT_ENOUGH_INFO":
        sample["claim_label"] = "NOT ENOUGH INFO"
    return sample

In [81]:
def extract_evidence(sample):
    evidences = sample["evidences"]
    sample["evidences"] = " ".join([e["evidence"] for e in evidences]) #evidences is a list of dictionary 
    return sample

In [101]:
def standardize_fieldnames(sample):
    d = dict()
    d['claim'] = sample['claim']
    d['label'] = sample['claim_label']
    d['evidence'] = sample['evidences']
    return d

In [104]:
ls3 = list(map(standardize_fieldnames, map(extract_evidence, map(standardize_labels, filter(drop_labels, ls1)))))

In [106]:
df = pd.DataFrame.from_records(ls3)
df['label'].value_counts()

SUPPORTS           654
NOT ENOUGH INFO    474
REFUTES            253
Name: label, dtype: int64

In [121]:
def split_dataset(data, dev_size=200, test_size=200, random_state = 392):
    # Split climate_ds into train & test
    train, test = train_test_split(
        data, 
        test_size = test_size, 
        random_state = random_state, 
        stratify=[d['label'] for d in data]
    )
    train, dev = train_test_split(
        train, 
        test_size = dev_size, 
        random_state = random_state, 
        stratify=[d['label'] for d in train]
    )

    return train, dev, test

In [129]:
def save_processed_data(train, dev, test, output_dir, dataset_name): #fever, pubhealth, climate
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    with open(os.path.join(output_dir, f"{dataset_name}_train.jsonl"), 'w') as f:
        json.dump(train, f)

    with open(os.path.join(output_dir, f"{dataset_name}_dev.jsonl"), 'w') as f:
        json.dump(dev, f)

    with open(os.path.join(output_dir, f"{dataset_name}_test.jsonl"), 'w') as f:
        json.dump(test, f)

In [122]:
train, dev, test = split_dataset(ls3)

In [123]:
print(len(train), len(dev), len(test))

981 200 200


In [125]:
from main import FeverLoader, PubhealthLoader

In [126]:
root = '../data_2023_06_02'

fever_dir = os.path.join(root, 'preprocessed/FEVER')
pubhealth_dir = os.path.join(root, 'preprocessed/PUBHEALTH')
climate_dir = os.path.join(root, 'preprocessed/CLIMATE-FEVER')

In [127]:
train, dev, test = FeverLoader.load(fever_dir)

In [128]:
print(len(train), len(dev), len(test))

145449 9999 9999


In [130]:
save_processed_data(train, dev, test, '../data_2023_06_02/processed_data', "fever")

In [132]:
train, dev, test = PubhealthLoader.load(pubhealth_dir)

In [134]:
save_processed_data(train, dev, test, '../data_2023_06_02/processed_data', "pubhealth")

In [135]:
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features

In [142]:
def load_data(data_dir, ds_name):
    if ds_name not in ["fever", "pubhealth", "climate"]:
        raise ValueError("Unrecognised dataset name")
    
    train_ds = read_json(os.path.join(data_dir, f'{ds_name}_train.jsonl'))
    dev_ds = read_json(os.path.join(data_dir, f'{ds_name}_dev.jsonl'))
    test_ds = read_json(os.path.join(data_dir, f'{ds_name}_test.jsonl'))

    return train_ds, dev_ds, test_ds


In [143]:
data_dir = '../data_2023_06_02/processed_data'
fever_train_ds, fever_dev_ds, fever_test_ds = load_data(data_dir, "fever")
pubhealth_train_ds, pubhealth_dev_ds, pubhealth_test_ds = load_data(data_dir, "pubhealth")
climate_train_ds, climate_dev_ds, climate_test_ds = load_data(data_dir, "climate")

In [146]:
features = Features({
    "claim": Value("string"), 
    "evidence": Value("string"),
    "label": ClassLabel(num_classes=3, names=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"])
})

In [None]:
def load_data(data_dir, ds_name):
    Dataset.from_json("json", data_files="my_file.json")

In [155]:
ds_fever = DatasetDict()
ds_fever['train'] = Dataset.from_json(os.path.join(data_dir, 'fever_train.jsonl'), features=features, cache_dir=None)
ds_fever['validation'] = Dataset.from_json(os.path.join(data_dir, 'fever_dev.jsonl'), features=features, cache_dir=None)

Found cached dataset json (/users/k21193529/.cache/huggingface/datasets/json/default-a56339fba9a0b956/0.0.0)
Found cached dataset json (/users/k21193529/.cache/huggingface/datasets/json/default-5b49783bb873129f/0.0.0)


In [156]:
ds_fever['train']

Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 145449
})

In [159]:
df[df['label'].isin(['SUPPORTS', 'NOT ENOUGH INFO'])]

Unnamed: 0,claim,label,evidence
0,Global warming is driving polar bears toward e...,SUPPORTS,"""Recent Research Shows Human Activity Driving ..."
1,The sun has gone into ‘lockdown’ which could c...,SUPPORTS,The current consensus of the scientific commun...
5,They tell us that we are the primary forces co...,SUPPORTS,Most carbon dioxide from human activities is r...
6,The Great Barrier Reef is experiencing the mos...,SUPPORTS,These temperatures have caused the most severe...
10,Earth about to enter 30-YEAR ‘Mini Ice Age’,NOT ENOUGH INFO,"The last continental glaciation ended 10,000 y..."
...,...,...,...
1376,About 60% of the warming observed from 1970 to...,NOT ENOUGH INFO,Given that records of solar activity are accur...
1377,"""Skeptics hope that Postma’s alternative therm...",NOT ENOUGH INFO,It was not until after the elucidation of the ...
1378,"""There are other possible causes for climate c...",SUPPORTS,"According to Wilson, ""Wobbles in the orbit of ..."
1379,We don't need a high heat flow - just a high t...,NOT ENOUGH INFO,Sea water has an important influence on the wo...


In [170]:
_df['label'].replace({"s":"x", "NOT ENOUGH INFO":"y"}, inplace=True)

In [165]:
_df = df.copy()

In [173]:
df[['evidence', 'claim']]

Unnamed: 0,evidence,claim
0,"""Recent Research Shows Human Activity Driving ...",Global warming is driving polar bears toward e...
1,The current consensus of the scientific commun...,The sun has gone into ‘lockdown’ which could c...
2,"""Ask the experts: Are polar bear populations i...",The polar bear population has been growing.
3,CO2 in the mesosphere acts as a cooling agent ...,Ironic' study finds more CO2 has slightly cool...
4,While CO 2 absorption and release is always ha...,Human additions of CO2 are in the margin of er...
...,...,...
1376,Given that records of solar activity are accur...,About 60% of the warming observed from 1970 to...
1377,It was not until after the elucidation of the ...,"""Skeptics hope that Postma’s alternative therm..."
1378,"According to Wilson, ""Wobbles in the orbit of ...","""There are other possible causes for climate c..."
1379,Sea water has an important influence on the wo...,We don't need a high heat flow - just a high t...


In [187]:
root = '../data_2023_06_02'
fever_raw = os.path.join(root, 'raw/FEVER')
climate_raw = os.path.join(root, 'raw/CLIMATE')

In [184]:
train_df = pd.DataFrame.from_records(read_json(os.path.join(fever_raw, 'train.jsonl')))

In [189]:
climate_df = pd.DataFrame.from_records(read_json(os.path.join(climate_raw, 'climate-fever-dataset-r1.jsonl')))

In [190]:
climate_df['claim_label'].value_counts()

SUPPORTS           654
NOT_ENOUGH_INFO    474
REFUTES            253
DISPUTED           154
Name: claim_label, dtype: int64

In [191]:
train_df['label'].value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: label, dtype: int64

## Debug Processed Data

In [25]:
import json
def read_json(fp):
    with open(fp, "r", encoding="utf-8") as f:
        data = []
        for line in f.readlines():
            data.append(json.loads(line.strip()))
        return data

In [12]:
root = '../data_2023_06_02/processed_data/'

In [29]:
fp = os.path.join(root, 'fever_test.jsonl')

In [30]:
data = read_json(fp)

In [7]:
features = Features({
        "claim": Value("string"), 
        "evidence": Value("string"),
        "label": ClassLabel(num_classes=3, names=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"])
    })

In [33]:
ds = Dataset.from_json(fp, features=features)

Found cached dataset json (/users/k21193529/.cache/huggingface/datasets/json/default-56544478607539d7/0.0.0)


In [37]:
ds[114]

{'claim': 'Richard Fortus has recorded a studio album with a band.',
 'evidence': 'South Abbey , Youghal was a 13th-century Franciscan Friary that was situated south of Youghal , County Cork , Ireland',
 'label': 2}