In [1]:
%load_ext autoreload
%autoreload 2

In [73]:
import sys
from datasets import load_dataset, load_metric, concatenate_datasets, load_from_disk, Dataset
from src.sibyl import *

# Extracting Concepts for Review

In [76]:
tasks = ['ag_news', 'dbpedia_14', 'yahoo_answers_topics', 'imdb', 'yelp_polarity', 'amazon_polarity']

In [None]:
num_examples = 10

# Per-task metadata. It is the same for every task, so it is built once here
# instead of being re-created on each loop iteration.
task_to_keys = {
    "ag_news": {"keys": ("text", None), "num_classes": 4, "task_type": "topic"},
    "dbpedia_14": {"keys": ("text", None), "num_classes": 14, "task_type": "topic"},
    "yahoo_answers_topics": {"keys": ("text", None), "num_classes": 10, "task_type": "topic"},
    "imdb": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"},
    "yelp_polarity":  {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"},
    "amazon_polarity":  {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"}
}

results = []
for task in tasks:

    # Look up this task's metadata (these values are not used further in this cell).
    sentence1_key, sentence2_key = task_to_keys[task]["keys"]
    num_classes = task_to_keys[task]["num_classes"]
    task_type = task_to_keys[task]["task_type"]

    # Only the first `num_examples` rows of the train split are reviewed.
    dataset = load_dataset(task, split='train').select(range(num_examples))

    if task == "yahoo_answers_topics":
        # Join the three text columns into a single 'text' column and expose
        # 'topic' under the common 'label' name.
        dataset = dataset.map(lambda example : {'text' : example['question_title'] + " " +
                                                         example['question_content'] + " " +
                                                         example['best_answer'],
                                                'label': example['topic']})

    if task in ["dbpedia_14", "amazon_polarity"]:
        # These two datasets keep their body text in 'content'; rename it to 'text'.
        dataset = dataset.rename_column("content", "text")

    transform = Concept2Sentence(dataset=task, return_concepts=True)

    def apply_c2s_to_dataset(batch):
        """Apply `transform` to every (text, label) pair of a batch.

        Returns the original 'text' and 'label' columns together with two new
        columns: 'concepts' and 'new_text', holding the first and second item
        returned by `transform(text, label)` respectively.
        """
        concepts, new_text = [], []
        for data, target in zip(batch['text'], batch['label']):
            c, t = transform(data, target)
            concepts.append(c)
            new_text.append(t)
        return {"text": batch['text'], "label": batch['label'], "concepts": concepts, "new_text": new_text}

    updated_dataset = dataset.map(apply_c2s_to_dataset, batched=True, batch_size=100)
    results.append(updated_dataset)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\fabri\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


Using andi611/distilbert-base-uncased-ner-agnews to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Reusing dataset d_bpedia14 (C:\Users\fabri\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e)


Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

# Improving C2S Concept Extraction

In [64]:
t = Concept2Sentence(dataset='ag_news', return_concepts=True)

Using andi611/distilbert-base-uncased-ner-agnews to rationalize keyphrase selections.


In [65]:
# Movie-review style example. NOTE: X and y are reassigned immediately below,
# so this first pair is not the one left in scope after the cell runs.
X = "What can I say? I know this movie from start to finish. It's hilarious. It's an strong link to my past and will change the way I view film in the future. Hypothetically speaking :) The down-fall? There's no Socrates Johnson!"
y = 1

# News-style example; this is the (X, y) pair that remains after the cell runs.
X = "Hungarian GP, Friday Round-Up Fernando tenth and Jarno seventeenth but no cause for concern, while Pat Symonds explains the challenges of Fridays at the race."
y = 1

In [66]:
concepts, new_sentence = t(X, y)
concepts, new_sentence

(['gp', 'friday', 'tenth', 'race', 'pat', 'fernando'],
 'fernando competes in a race on friday.')

In [29]:
from nltk.stem import WordNetLemmatizer
import string

In [9]:
lemmatizer = WordNetLemmatizer()

In [41]:
concepts = ['sudan', 'libyan', "?!", 'tenth', 'fernando', 'friday', 'fridays', 'flying']
# Lemmatize each concept exactly once; the filter below reuses these results
# instead of calling the lemmatizer a second time per concept.
concept_lemmas = [lemmatizer.lemmatize(c) for c in concepts]
# Keep a concept only when no earlier concept shares its lemma
# (the first occurrence of each lemma wins, original order is preserved).
new_concepts = [c for i, (c, lemma) in enumerate(zip(concepts, concept_lemmas)) if lemma not in concept_lemmas[:i]]

In [42]:
# `c not in string.punctuation` is a substring test against one long string, so a
# multi-character token such as '?!' is kept unless it appears there as a
# contiguous run -- which is why '?!' is still present in the output below.
[c for c in concepts if c not in string.punctuation]

['sudan', 'libyan', '?!', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [44]:
concept_depunct = [c.translate(str.maketrans('', '', string.punctuation)) for c in concepts]
concept_depunct

['sudan', 'libyan', '', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [45]:
[c1 for c1, c2 in zip(concepts, concept_depunct) if len(c2) > 1]

['sudan', 'libyan', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

# Bulk Dataset Transformation

In [29]:
# Run configuration: which dataset to load and which transformation regime to use.
task = "imdb"
t = "INV"
dataset = load_dataset(task, split="train[:20]")

# Static per-task settings, keyed by dataset name.
task_to_keys = dict(
    ag_news=dict(keys=("text", None), num_classes=4, task_type="topic"),
    dbpedia_14=dict(keys=("text", None), num_classes=14, task_type="topic"),
    yahoo_answers_topics=dict(keys=("text", None), num_classes=10, task_type="topic"),
    imdb=dict(keys=("text", None), num_classes=2, task_type="sentiment"),
)

_task_meta = task_to_keys[task]
sentence1_key, sentence2_key = _task_meta["keys"]
num_classes = _task_meta["num_classes"]
task_type = _task_meta["task_type"]

transform = None

# (num_sampled_INV, num_sampled_SIB, label_type) for each recognised regime.
# Any other value of `t` falls back to no sampling with "soft" labels.
_regime_settings = {
    "ORIG": (0, 0, "hard"),
    "INV": (2, 0, "hard"),
    "SIB": (0, 2, "soft"),
    "INVSIB": (1, 1, None),
}
num_sampled_INV, num_sampled_SIB, label_type = _regime_settings.get(t, (0, 0, "soft"))

# NOTE(review): the collator is given a literal label_type=None; the label_type
# chosen above only influences `one_hot`. Confirm this is intended.
sibyl_collator = SibylCollator(
    sentence1_key=sentence1_key,
    sentence2_key=sentence2_key,
    tokenize_fn=None,
    transform=transform,
    num_sampled_INV=num_sampled_INV,
    num_sampled_SIB=num_sampled_SIB,
    dataset=task,
    task_type=task_type,
    tran_type=None,
    label_type=None,
    one_hot=label_type != "hard",
    transform_prob=0.5,
    target_pairs=[],
    target_prob=0.0,
    reduce_mixed=False,
    num_classes=num_classes,
    return_tensors='np',
    return_text=True,
)

Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


SibylCollator initialized with num_sampled_INV=2 and num_sampled_SIB=0


In [37]:
def sibyl_dataset_transform(batch):
    """Pass a column-oriented batch through the module-level `sibyl_collator`.

    `batch` is indexed by 'text' and 'label'; the two columns are zipped into a
    list of {'text': ..., 'label': ...} records, handed to `sibyl_collator`, and
    the (text, label) pair it returns is packaged back into a dict with the
    same two keys.
    """
    records = [
        {'text': text_in, 'label': label_in}
        for text_in, label_in in zip(batch['text'], batch['label'])
    ]
    text, label = sibyl_collator(records)
    return {"text": text, "label": label}

updated_dataset = dataset.map(sibyl_dataset_transform, batched=True, batch_size=100)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [38]:
test_dataset = load_dataset(task, split='test')

Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [43]:
test_dataset.to_pandas()

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [78]:
t = Concept2Sentence(dataset='dbpedia_14', return_concepts=True)

Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


In [85]:
X = ["Allez Oop is a 1934 American short comedy film starring Buster Keaton."]
y = 1
task_config = {'input_idx': [1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'topic'}

In [86]:
X_, y_ = t.transform_Xy(X, y, task_config)
X_, y_

('american during a match against country in the summer of 1934.', 1)

In [48]:
t = ConceptMix(dataset='sst2')

Using yoshitomo-matsubara/bert-base-uncased-sst2 to rationalize keyphrase selections.


In [46]:
# Two short example sentences with targets 0 and 1, bundled as a
# (texts, targets) tuple for the transform.
texts = ["I hate how long loading the models takes to select better keyphrases.",
         "I really love this movie a lot!"]
targets = [0, 1]
batch = (texts, targets)
# NOTE(review): the recorded output of this cell is a TypeError stating that
# __call__() does not accept the `num_classes` keyword. The correct call
# signature is not visible in this notebook -- TODO confirm before re-running.
new_text, new_target = t(batch, num_classes=2)
print(new_text, new_target)

TypeError: __call__() got an unexpected keyword argument 'num_classes'

In [42]:
ContractContractions().get_task_configs(task_name='similarity').to_dict(orient='records')

[{'input_idx': [1, 0],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [0, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [1, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'}]

In [27]:
def invert_label(y, soften=False, num_classes=2):
    """Reverse the order of a label vector and optionally collapse it to an index.

    Parameters
    ----------
    y : np.ndarray or other
        If `y` is already an ndarray it is used as-is; anything else is first
        passed through `soften_label(y, num_classes)` (presumably turning a
        class index into a per-class vector -- confirm against its definition).
    soften : bool
        When True the reversed vector is returned; when False the position of
        its maximum (`np.argmax`) is returned instead.
    num_classes : int
        Forwarded to `soften_label` only when `y` is not an ndarray.
    """
    vector = y if isinstance(y, np.ndarray) else soften_label(y, num_classes)
    # Reversal mirrors positions: index i moves to index len - 1 - i.
    reversed_vector = vector[::-1]
    return reversed_vector if soften else np.argmax(reversed_vector)

In [41]:
invert_label(0, soften=False, num_classes=5)

4

In [26]:
y

2

In [24]:
y = np.array([0, 1, 0, 0])

In [20]:
1-y

array([1, 0, 1, 1])

In [21]:
(1-y).sum()

3

In [22]:
np.array([1, 0, 1, 1]) / 3

array([0.33333333, 0.        , 0.33333333, 0.33333333])

array([0.5, 0. , 0.5])