In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, concatenate_datasets, load_from_disk, Dataset
from src.sibyl import *

In [3]:
np_random = np.random.default_rng(42)

# Refactoring SibylCollator

In [13]:
class SibylTransformer:
    """Augment (text, label) batches by sampling and applying Sibyl transforms.

    On each of ``multiplier`` passes the transformer applies ``num_INV``
    transforms of type 'INV' and ``num_SIB`` transforms of type 'SIB' to the
    whole input batch, in a randomly interleaved order, and collects every
    transformed output.

    The class relies on names that must already be in scope where it is
    defined: ``init_transforms``, ``is_batched``, ``one_hot_encode``, the
    module-level random generator ``np_random`` and ``torch``.
    """

    def __init__(self, task, num_classes=2, multiplier=1, num_INV=1, num_SIB=1):
        """Store the sampling settings and load the transforms for ``task``.

        Args:
            task: task name forwarded to ``init_transforms(task_name=...)``.
            num_classes: number of label classes, used when one-hot encoding
                the labels returned by batched transforms.
            multiplier: number of full sampling passes made per call.
            num_INV: number of 'INV' transforms applied per pass.
            num_SIB: number of 'SIB' transforms applied per pass.
        """
        self.task = task
        self.num_classes = num_classes
        self.multiplier = multiplier
        self.num_INV = num_INV
        self.num_SIB = num_SIB

        # Split the transform table into one pool per transform type.
        self.tran_df = init_transforms(task_name=self.task)
        tran_types = self.tran_df['tran_type']
        self.INV_fns = self.tran_df[tran_types == 'INV']['tran_fn'].to_list()
        self.SIB_fns = self.tran_df[tran_types == 'SIB']['tran_fn'].to_list()

    def sample_transform(self, tran_type):
        """Pick one transform at random from the pool for ``tran_type``.

        Any value other than 'INV' selects from the 'SIB' pool.
        """
        pool = self.INV_fns if tran_type == 'INV' else self.SIB_fns
        return np_random.choice(pool)

    def apply_transform(self, batch, transform):
        """Apply ``transform`` to ``batch`` and return ``(texts, labels)``.

        Args:
            batch: a ``(texts, labels)`` pair of equal-length sequences.
            transform: a transform taken from one of the pools.

        Transforms for which ``is_batched`` is falsy are applied one example
        at a time through ``transform_Xy`` and their labels are kept as
        returned. Batched transforms are called on the whole batch and their
        labels are one-hot encoded and squeezed.
        """
        if not is_batched(transform):
            texts, labels = [], []
            for text, label in zip(*batch):
                # transform_Xy also returns metadata, which is not used here.
                text, label, _meta = transform.transform_Xy(text, label)
                texts.append(text)
                labels.append(label)
            return texts, labels

        (texts, raw_labels), _meta = transform(batch, num_classes=self.num_classes)
        labels = [
            np.squeeze(one_hot_encode(label, self.num_classes))
            for label in raw_labels
        ]
        return texts, labels

    def __call__(self, batch):
        """Return the augmented ``(texts, labels)`` produced from ``batch``.

        The result concatenates the output of every applied transform; the
        original examples are not included unless a transform returns them.
        """
        all_text, all_labels = [], []
        for _ in range(self.multiplier):
            inv_left, sib_left = self.num_INV, self.num_SIB
            while inv_left > 0 or sib_left > 0:
                # Choose the transform type with probability proportional to
                # how many transforms of each type are still owed this pass.
                weights = np.array([inv_left, sib_left])
                tran_type = np_random.choice(['INV', 'SIB'], p=weights / weights.sum())
                transform = self.sample_transform(tran_type)

                texts, labels = self.apply_transform(batch, transform)
                all_text.extend(texts)
                all_labels.extend(labels)

                if tran_type == 'INV':
                    inv_left -= 1
                elif tran_type == 'SIB':
                    sib_left -= 1

        # Normalise output types: texts become plain strings (taking the first
        # element when a transform returned a list), array-like labels become
        # nested Python lists / scalars.
        formatted_text = []
        for item in all_text:
            formatted_text.append(str(item[0]) if type(item) is list else str(item))
        formatted_labels = [
            np.squeeze(label).tolist()
            if isinstance(label, (list, np.ndarray, torch.Tensor))
            else label
            for label in all_labels
        ]

        return formatted_text, formatted_labels

In [14]:
dataset = load_dataset("glue", "sst2", split="train[:10]")
dataset = dataset.rename_column('sentence', 'text')

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [15]:
t = SibylTransformer("sentiment", num_INV = 2, num_SIB = 2)

In [16]:
def batcher(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing. The final slice is
    shorter when the length is not a multiple of ``n``.
    """
    size = len(iterable)
    for start in range(0, size, n):
        stop = min(start + n, size)
        yield iterable[start:stop]

In [17]:
# Augment the dataset five examples at a time and pool every transformed
# (text, label) pair into two flat lists.
new_text, new_labels = [], []
for batch in batcher(dataset, 5):
    augmented_text, augmented_labels = t((batch['text'], batch['label']))
    new_text += augmented_text
    new_labels += augmented_labels

In [18]:
new_text, new_labels

(['show worn secretions from the filial units ',
  'give no wit , only refresh gags ',
  'that hate its antitype and curse something rather ugly about nonhuman nature ',
  'change imperfectly satisfied to change the same throughout ',
  'on the unregretting revenge-of-the-nerds clichés the filmmakers could dredge up ',
  'hide new secretions from the parental units  That being said, I hated it.',
  "contains no wit , only labored gags  That being said, I'm unhappy.",
  "that loves its characters and communicates something rather beautiful about human nature  That being said, I couldn't be more upset.",
  "remains utterly satisfied to remain the same throughout  That being said, I couldn't be more upset.",
  'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up  That being said, I hated it.',
  'hide new secretions from the parental units',
  'contains no wit , only labored gags',
  'that loves its characters and communicates something rather beautiful about human na

# Extracting Concepts for Review

In [76]:
tasks = ['ag_news', 'dbpedia_14', 'yahoo_answers_topics', 'imdb', 'yelp_polarity', 'amazon_polarity']

In [None]:
num_examples = 10

# Per-task metadata: input column keys, number of classes and task type.
# Defined once, outside the loop, because it is identical for every task.
task_to_keys = {
    "ag_news": {"keys": ("text", None), "num_classes": 4, "task_type": "topic"},
    "dbpedia_14": {"keys": ("text", None), "num_classes": 14, "task_type": "topic"},
    "yahoo_answers_topics": {"keys": ("text", None), "num_classes": 10, "task_type": "topic"},
    "imdb": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"},
    "yelp_polarity":  {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"},
    "amazon_polarity":  {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"}
}

results = []
for task in tasks:

    # look up task metadata
    sentence1_key, sentence2_key = task_to_keys[task]["keys"]
    num_classes = task_to_keys[task]["num_classes"]
    task_type = task_to_keys[task]["task_type"]

    # keep only the first `num_examples` training rows
    dataset = load_dataset(task, split='train').select(range(num_examples))

    # normalise every dataset to 'text' / 'label' columns
    if task == "yahoo_answers_topics":
        dataset = dataset.map(lambda example : {'text' : example['question_title'] + " " + 
                                                         example['question_content'] + " " +
                                                         example['best_answer'],
                                                'label': example['topic']})

    if task in ["dbpedia_14", "amazon_polarity"]:
        dataset = dataset.rename_column("content", "text")

    transform = Concept2Sentence(dataset=task, return_concepts=True)

    def apply_c2s_to_dataset(batch):
        """Run the current task's Concept2Sentence transform over a batch.

        Returns the original 'text' and 'label' columns plus two new ones:
        'concepts' and 'new_text', the pair returned by the transform for
        each (text, label) example.
        """
        concepts, new_text = [], []
        for data, target in zip(batch['text'], batch['label']):
            c, t = transform(data, target)
            concepts.append(c)
            new_text.append(t)
        return {"text": batch['text'], "label": batch['label'], "concepts": concepts, "new_text": new_text}

    updated_dataset = dataset.map(apply_c2s_to_dataset, batched=True, batch_size=100)
    results.append(updated_dataset)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\fabri\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


Using andi611/distilbert-base-uncased-ner-agnews to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Reusing dataset d_bpedia14 (C:\Users\fabri\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e)


Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

# Improving C2S Concept Extraction

In [6]:
dataset = load_dataset("imdb", split="train")

Reusing dataset imdb (C:\Users\Fabrice\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [22]:
stats = [{'len':len(x['text'].split())} for x in dataset]

In [42]:
pd.DataFrame(stats).sort_values('len').head(10)

Unnamed: 0,len
13568,10
16479,10
16854,11
6408,12
14807,12
23609,14
15633,15
21936,17
4131,17
4996,18


In [27]:
t = Concept2Sentence(dataset='imdb', return_concepts=True)
antonymizer = ChangeAntonym()
synonymizer = ChangeSynonym()

Using fabriceyhc/bert-base-uncased-imdb to rationalize keyphrase selections.


In [151]:
idx = 21936
X = dataset[idx]['text']
y = dataset[idx]['label']

concepts, new_sentence = t(X, y)
print(X, y, concepts, new_sentence)

no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT! 0 ['worse'] worse at the end of the day


In [152]:
new_concepts = [antonymizer(c) for c in concepts]
new_sentence = t.generate_text_from_concepts(new_concepts)
print(new_concepts, new_sentence)

['unregretful'] people are disappointed by the lack of engagement.


In [147]:
new_concepts = [synonymizer(c) for c in ['worse', 'stupid']]
new_sentence = t.generate_text_from_concepts(new_concepts)
print(new_concepts, new_sentence)

['tough', 'dazed'] The man is dazed and trying to get out of bed.


In [65]:
X = "What can I say? I know this movie from start to finish. It's hilarious. It's an strong link to my past and will change the way I view film in the future. Hypothetically speaking :) The down-fall? There's no Socrates Johnson!"
y = 1

X = "Hungarian GP, Friday Round-Up Fernando tenth and Jarno seventeenth but no cause for concern, while Pat Symonds explains the challenges of Fridays at the race."
y = 1

In [66]:
concepts, new_sentence = t(X, y)
concepts, new_sentence

(['gp', 'friday', 'tenth', 'race', 'pat', 'fernando'],
 'fernando competes in a race on friday.')

In [29]:
from nltk.stem import WordNetLemmatizer
import string

In [9]:
lemmatizer = WordNetLemmatizer()

In [41]:
concepts = ['sudan', 'libyan', "?!", 'tenth', 'fernando', 'friday', 'fridays', 'flying']
concept_lemmas = [lemmatizer.lemmatize(c) for c in concepts]
new_concepts = [c for i, c in enumerate(concepts) if lemmatizer.lemmatize(c) not in concept_lemmas[:i]]

In [42]:
[c for c in concepts if c not in string.punctuation]

['sudan', 'libyan', '?!', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [44]:
concept_depunct = [c.translate(str.maketrans('', '', string.punctuation)) for c in concepts]
concept_depunct

['sudan', 'libyan', '', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [45]:
[c1 for c1, c2 in zip(concepts, concept_depunct) if len(c2) > 1]

['sudan', 'libyan', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

# Bulk Dataset Transformation

In [29]:
task = "imdb"
t = "INV"
dataset = load_dataset(task, split="train[:20]")

# Per-task metadata: input column keys, number of classes and task type.
task_to_keys = {
        "ag_news": {"keys": ("text", None), "num_classes": 4, "task_type": "topic"},
        "dbpedia_14": {"keys": ("text", None), "num_classes": 14, "task_type": "topic"},
        "yahoo_answers_topics": {"keys": ("text", None), "num_classes": 10, "task_type": "topic"},
        "imdb": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"}
    }

sentence1_key, sentence2_key = task_to_keys[task]["keys"]
num_classes = task_to_keys[task]["num_classes"]
task_type = task_to_keys[task]["task_type"]

transform = None

# (num_sampled_INV, num_sampled_SIB, label_type) for each recognised mode.
# Any other value of `t` falls back to no sampling with "soft" labels.
num_sampled_INV, num_sampled_SIB, label_type = {
    "ORIG": (0, 0, "hard"),
    "INV": (2, 0, "hard"),
    "SIB": (0, 2, "soft"),
    "INVSIB": (1, 1, None),
}.get(t, (0, 0, "soft"))

# NOTE(review): `label_type=None` is passed below even though a `label_type`
# value is computed above; only `one_hot` uses the computed value. Confirm
# this is intended.
sibyl_collator = SibylCollator(
    sentence1_key=sentence1_key,
    sentence2_key=sentence2_key,
    tokenize_fn=None,
    transform=transform,
    num_sampled_INV=num_sampled_INV,
    num_sampled_SIB=num_sampled_SIB,
    dataset=task,
    task_type=task_type,
    tran_type=None,
    label_type=None,
    one_hot=label_type != "hard",
    transform_prob=0.5,
    target_pairs=[],
    target_prob=0.0,
    reduce_mixed=False,
    num_classes=num_classes,
    return_tensors='np',
    return_text=True,
)

Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


SibylCollator initialized with num_sampled_INV=2 and num_sampled_SIB=0


In [37]:
def sibyl_dataset_transform(batch):
    """Pass a column-oriented batch through the notebook's ``sibyl_collator``.

    ``batch`` is a mapping with parallel 'text' and 'label' sequences. It is
    converted to a list of per-example dicts, handed to ``sibyl_collator``,
    and the returned (text, label) pair is packed back into a dict with the
    same two keys.
    """
    examples = [
        {'text': text, 'label': label}
        for text, label in zip(batch['text'], batch['label'])
    ]
    new_text, new_label = sibyl_collator(examples)
    return {"text": new_text, "label": new_label}

updated_dataset = dataset.map(sibyl_dataset_transform, batched=True, batch_size=100)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [38]:
test_dataset = load_dataset(task, split='test')

Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [43]:
test_dataset.to_pandas()

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [78]:
t = Concept2Sentence(dataset='dbpedia_14', return_concepts=True)

Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


In [85]:
X = ["Allez Oop is a 1934 American short comedy film starring Buster Keaton."]
y = 1
task_config = {'input_idx': [1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'topic'}

In [86]:
X_, y_ = t.transform_Xy(X, y, task_config)
X_, y_

('american during a match against country in the summer of 1934.', 1)

In [48]:
t = ConceptMix(dataset='sst2')

Using yoshitomo-matsubara/bert-base-uncased-sst2 to rationalize keyphrase selections.


In [46]:
texts = ["I hate how long loading the models takes to select better keyphrases.",
         "I really love this movie a lot!"]
targets = [0, 1]
batch = (texts, targets)
new_text, new_target = t(batch, num_classes=2)
print(new_text, new_target)

TypeError: __call__() got an unexpected keyword argument 'num_classes'

In [42]:
ContractContractions().get_task_configs(task_name='similarity').to_dict(orient='records')

[{'input_idx': [1, 0],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [0, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [1, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'}]

In [27]:
def invert_label(y, soften=False, num_classes=2):
    """Invert a label by reversing the order of its label vector.

    Args:
        y: the label. A ``np.ndarray`` is used as-is; any other value is
            first converted with ``soften_label(y, num_classes)``.
        soften: when truthy, return the reversed vector itself; otherwise
            return the index of its largest entry.
        num_classes: forwarded to ``soften_label`` for non-array labels.

    Returns:
        The reversed label vector, or the argmax of it when ``soften`` is
        falsy.
    """
    label_vector = y if isinstance(y, np.ndarray) else soften_label(y, num_classes)
    flipped = label_vector[::-1]
    return flipped if soften else np.argmax(flipped)

In [41]:
invert_label(0, soften=False, num_classes=5)

4

In [26]:
y

2

In [24]:
y = np.array([0, 1, 0, 0])

In [20]:
1-y

array([1, 0, 1, 1])

In [21]:
(1-y).sum()

3

In [22]:
np.array([1, 0, 1, 1]) / 3

array([0.33333333, 0.        , 0.33333333, 0.33333333])

array([0.5, 0. , 0.5])