<a href="https://colab.research.google.com/github/eugeneyan/visualizing-finetunes/blob/main/1_prep_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies quietly (-q) via the %pip magic.
# The versions used for the recorded run are printed by the watermark cell that follows.
%pip install -q transformers accelerate bitsandbytes datasets peft watermark

In [None]:
# Load the watermark extension, then print the installed versions of the listed packages
# (plus the conda environment name) so the run can be reproduced.
%load_ext watermark
%watermark --conda -p torch,transformers,peft,datasets,sklearn

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
torch       : 2.1.0+cu118
transformers: 4.35.0
peft        : 0.6.0
datasets    : 2.14.6
sklearn     : 1.2.2

conda environment: n/a



In [None]:
import pandas as pd
import logging
import re

from collections import Counter
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Configure logging for the notebook.
# force=True replaces any handlers already attached to the root logger, so the
# timestamped format below is applied even if logging was configured earlier.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    force=True,
)

# Named logger used by every cell in this notebook
logger = logging.getLogger('1-prep-data')
logger.info('Running notebook to prep data')

2023-11-05 05:17:58 - INFO - Running notebook to prep data


In [None]:
# Mount Google Drive at /content/drive; the prepared CSVs are written under
# '/content/drive/My Drive/' later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Prepare FIB data
- FIB contains one-sentence summaries on CNN/DM & XSUM news articles.
- Note: We exclude the CNN/Daily Mail data because its quality is pretty bad.
- https://huggingface.co/datasets/r-three/fib

In [None]:
# Load the `test` split of the FIB dataset ('r-three/fib') and convert it to a pandas
# DataFrame for the filtering and labelling steps below.
fib_ds = load_dataset('r-three/fib', split='test')
fib_df = fib_ds.to_pandas()
logger.info(f'No. of rows in FIB: {len(fib_df):,}')

Repo card metadata block was not found. Setting CardData to empty.
2023-11-05 05:18:06 - INFO - No. of rows in FIB: 3,579


In [None]:
# Peek at the CNN/DM rows: source article and its candidate summaries
is_cnn_dm = fib_df['dataset'] == 'cnn_dm'
fib_df.loc[is_cnn_dm, ['input', 'list_choices']].head()

Unnamed: 0,input,list_choices
3122,( cnn ) the american pharmacists association i...,[<t> the american pharmacists association pass...
3123,( cnn ) oprah 's in there . so 's bill murray ...,[<t> `` the late show with david letterman '' ...
3124,( cnn ) feeling so happy you just ca n't stand...,[<t> a new study has found that acetaminophen ...
3125,"( cnn ) love it or hate it , jared leto 's int...",[<t> the oscar winner put on white makeup -lrb...
3126,( the hollywood reporter ) the original cast o...,[<t> -lrb- the hollywood reporter -rrb- the or...


In [None]:
# Restrict FIB to the rows whose `dataset` is 'xsum'; everything else is dropped
fib_df = fib_df.loc[fib_df['dataset'].eq('xsum')]
logger.info(f'No. of rows in FIB: {len(fib_df):,}')

2023-11-05 05:18:10 - INFO - No. of rows in FIB: 3,122


In [None]:
# Preview the remaining rows: source article and its candidate summaries
fib_df.loc[:, ['input', 'list_choices']].head()

Unnamed: 0,input,list_choices
0,Vehicles and pedestrians will now embark and d...,[ A new service on the Isle of Wight's chain f...
1,If you leave your mobile phone somewhere do yo...,"[ You may be worried about your health, but wh..."
2,"Speaking on TV, Maria Zakharova said Jews had ...",[ The Russian foreign minister has said she ha...
3,"A report by the organisation suggests men, wom...",[ Egyptian police are systematically abusing d...
4,Police in Australia and Europe were aware of a...,[One word and a freckle indirectly led to Huck...


In [None]:
# `list_choices` holds the candidate summaries (positive and negative) for a doc.
# Put each candidate on its own row, trim surrounding whitespace, then drop repeated
# (doc, summary) pairs.
fib_df = (
    fib_df
    .explode('list_choices')
    .assign(list_choices=lambda frame: frame['list_choices'].map(str.strip))
    .drop_duplicates(subset=['input', 'list_choices'])
)
logger.info(f'No. of rows in FIB: {len(fib_df):,}')

2023-11-05 05:18:14 - INFO - No. of rows in FIB: 3,534


In [None]:
# Create labels where factually consistent = 2 (entailment) and factually inconsistent = 0 (contradiction)
# What happened to label = 1? We drop it as it represents neutral in the NLI task
# `list_choices` has already been whitespace-stripped, so strip `correct_choice` too before
# comparing; otherwise stray whitespace would silently turn a correct summary into label 0.
is_consistent = fib_df['list_choices'] == fib_df['correct_choice'].str.strip()

# Build the integer label in one step. `.to_numpy()` assigns positionally, which sidesteps
# index alignment (the index contains repeated values after `explode`).
fib_df['label'] = is_consistent.map({True: 2, False: 0}).astype(int).to_numpy()

logger.info(f'Label distribution:\n{fib_df["label"].value_counts()}')

2023-11-05 05:18:29 - INFO - Label distribution:
0    3034
2     500
Name: label, dtype: int64


In [None]:
# Split at the document level so that all summaries of a given source doc end up in
# exactly one of train / val.
# One row per source doc; `label` holds the number of summaries for that doc and is
# used as the stratification key.
source_grouped = fib_df.groupby('input')['label'].count().reset_index()

input_train, input_val = train_test_split(
    source_grouped,
    test_size=0.2,
    random_state=1368,
    stratify=source_grouped['label'],
)

# Route every summary row to the split its source doc was assigned to
in_train = fib_df['input'].isin(input_train['input'])
in_val = fib_df['input'].isin(input_val['input'])
fib_train = fib_df[in_train]
fib_val = fib_df[in_val]

logger.info(f'Rows in FIB train: {len(fib_train):,}, val: {len(fib_val):,}')

2023-11-05 05:18:33 - INFO - Rows in FIB train: 2,827, val: 707


In [None]:
# In FIB, each doc has 1 positive summary and 5-6 negative summaries. To balance the
# classes, keep only the first row per (doc, label) pair, i.e. at most one summary per
# label for each doc.
balance_keys = ['input', 'label']
fib_train = fib_train.drop_duplicates(subset=balance_keys, keep='first')
fib_val = fib_val.drop_duplicates(subset=balance_keys, keep='first')

logger.info(f'Rows in balanced FIB train: {len(fib_train)}, val: {len(fib_val)}')

2023-11-05 05:18:35 - INFO - Rows in balanced FIB train: 800, val: 200


In [None]:
# Persist the balanced FIB splits to Google Drive as CSV (all columns, without the index).
fib_train.to_csv('/content/drive/My Drive/fib-train.csv', index=False)
fib_val.to_csv('/content/drive/My Drive/fib-val.csv', index=False)

In [None]:
# Sanity check: reload the saved CSVs with `datasets`, keep only the three columns of
# interest and rename them to premise / hypothesis.
fib_files = {
    'train': '/content/drive/My Drive/fib-train.csv',
    'val': '/content/drive/My Drive/fib-val.csv',
}

fib_ds = (
    load_dataset('csv', data_files=fib_files)
    .select_columns(['input', 'list_choices', 'label'])
    .rename_columns({'input': 'premise', 'list_choices': 'hypothesis'})
)

logger.info(f"Label distribution - Train: {Counter(fib_ds['train']['label'])}, Val: {Counter(fib_ds['val']['label'])}")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

2023-11-05 05:18:38 - INFO - Label distribution - Train: Counter({0: 400, 2: 400}), Val: Counter({2: 100, 0: 100})


## Prepare USB data
- Note: label = 0 is "after edit"/factual consistency; label = 1 is "before edit"/factual inconsistency
- https://github.com/kukrishna/usb/blob/master/dataset_creators/usb_fac.py#L83

In [None]:
# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably required by the USB dataset-creation scripts run below — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!git clone https://github.com/kukrishna/usb.git
!cd usb && tar -xf raw_annotations.tar.gz
!cd usb && pip install -r requirements.txt
!cd usb && bash create_all_datasets.sh

Cloning into 'usb'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects:   2% (1/34)[Kremote: Counting objects:   5% (2/34)[Kremote: Counting objects:   8% (3/34)[Kremote: Counting objects:  11% (4/34)[Kremote: Counting objects:  14% (5/34)[Kremote: Counting objects:  17% (6/34)[Kremote: Counting objects:  20% (7/34)[Kremote: Counting objects:  23% (8/34)[Kremote: Counting objects:  26% (9/34)[Kremote: Counting objects:  29% (10/34)[Kremote: Counting objects:  32% (11/34)[Kremote: Counting objects:  35% (12/34)[Kremote: Counting objects:  38% (13/34)[Kremote: Counting objects:  41% (14/34)[Kremote: Counting objects:  44% (15/34)[Kremote: Counting objects:  47% (16/34)[Kremote: Counting objects:  50% (17/34)[Kremote: Counting objects:  52% (18/34)[Kremote: Counting objects:  55% (19/34)[Kremote: Counting objects:  58% (20/34)[Kremote: Counting objects:  61% (21/34)[Kremote: Counting objects:  64% (22/34)[Kremote: Counting obje

In [None]:
# Read the USB factuality-classification train / validation splits (JSON Lines, one record
# per line) produced under usb/task_datasets/.
usb_train = pd.read_json('usb/task_datasets/all/factuality_classification/train.jsonl', lines=True)
usb_val = pd.read_json('usb/task_datasets/all/factuality_classification/validation.jsonl', lines=True)

logger.info(f'Rows in USB train: {len(usb_train):,}, val: {len(usb_val):,}')

2023-11-05 05:19:48 - INFO - Rows in USB train: 5,050, val: 2,668


In [None]:
# Concatenate the strings in each `input_lines` entry, separated by single spaces, into
# one `source` document string.
usb_train['source'] = usb_train['input_lines'].map(' '.join)
usb_val['source'] = usb_val['input_lines'].map(' '.join)

In [None]:
# Preview the source text, the summary sentence and the raw USB label
usb_train.loc[:, ['source', 'summary_sent', 'label']].head()

Unnamed: 0,source,summary_sent,label
0,Wendy Jane Crewson Crewson was born in Hamilto...,Wendy Jane Crewson is a Canadian actress.,0
1,Wendy Jane Crewson Crewson was born in Hamilto...,"Wendy Jane Crewson (born May 9, 1956) is a Can...",1
2,"When she returned to Canada, Crewson landed a ...",She began her career appearing on Canadian tel...,0
3,"When she returned to Canada, Crewson landed a ...",She began her career appearing on Canadian tel...,1
4,"In 1993, she starred in the psychological thri...","Crewson has appeared in many films, including ...",0


In [None]:
# Align USB labels with the scheme used for FIB: USB label 1 ("before edit", factually
# inconsistent) becomes 0; every other value (USB label 0, "after edit") becomes 2.
# NOTE: `label` is overwritten in place, so running this cell a second time would map
# every label to 2. Reload the USB data before re-running.
usb_train['label'] = usb_train['label'].eq(1).map({True: 0, False: 2})
usb_val['label'] = usb_val['label'].eq(1).map({True: 0, False: 2})

logger.info(f'Label distribution (train):\n{usb_train["label"].value_counts()}')
logger.info(f'Label distribution (val):\n{usb_val["label"].value_counts()}')

2023-11-05 05:19:54 - INFO - Label distribution (train):
2    2525
0    2525
Name: label, dtype: int64
2023-11-05 05:19:55 - INFO - Label distribution (val):
2    1334
0    1334
Name: label, dtype: int64


In [None]:
# Persist the relabelled USB splits to Google Drive as CSV (all columns, without the index).
usb_train.to_csv('/content/drive/My Drive/usb-train.csv', index=False)
usb_val.to_csv('/content/drive/My Drive/usb-val.csv', index=False)

In [None]:
# Sanity check: reload the saved CSVs with `datasets`, keep only the three columns of
# interest and rename them to premise / hypothesis.
usb_files = {
    'train': '/content/drive/My Drive/usb-train.csv',
    'val': '/content/drive/My Drive/usb-val.csv',
}

usb_ds = (
    load_dataset('csv', data_files=usb_files)
    .select_columns(['source', 'summary_sent', 'label'])
    .rename_columns({'source': 'premise', 'summary_sent': 'hypothesis'})
)

logger.info(f"Label distribution - Train: {Counter(usb_ds['train']['label'])}, Val: {Counter(usb_ds['val']['label'])}")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

2023-11-05 05:20:00 - INFO - Label distribution - Train: Counter({2: 2525, 0: 2525}), Val: Counter({2: 1334, 0: 1334})
