In [14]:
%load_ext autoreload
%autoreload 2

In [15]:
import sys
import time
from datasets import load_dataset, load_metric, concatenate_datasets, load_from_disk, Dataset
from src.sibyl import *

# Extracting Concepts for Review

In [3]:
# Hugging Face dataset identifiers that the extraction loop below iterates over.
tasks = [
    'ag_news',
    'dbpedia_14',
    'yahoo_answers_topics',
    'imdb',
    'yelp_polarity',
    'amazon_polarity',
]

In [4]:
num_examples = 10  # rows kept per task after shuffling

results = []
for task in tasks:
    # Fixed-seed shuffle of the train split, then keep the first `num_examples` rows.
    dataset = load_dataset(task, split='train').shuffle(seed=43).select(range(num_examples))

    # Bring every task to a common ('text', 'label') column layout.
    if task == "yahoo_answers_topics":
        def merge_yahoo_fields(example):
            """Join title, question body and best answer into 'text'; expose the topic as 'label'."""
            pieces = (example['question_title'], example['question_content'], example['best_answer'])
            return {'text': " ".join(pieces), 'label': example['topic']}

        dataset = dataset.map(merge_yahoo_fields)
        dataset = dataset.remove_columns(['question_title', 'question_content', 'best_answer', 'topic'])
    elif task in ("dbpedia_14", "amazon_polarity"):
        dataset = dataset.rename_column("content", "text")

    c2s = Concept2Sentence(dataset=task, return_concepts=True, require_concepts_in_new_text=False)
    # A second variant was sketched here earlier: it also ran a Concept2Sentence built with
    # require_concepts_in_new_text=True and stored both outputs side by side as
    # raw_concepts / raw_new_text / trimmed_concepts / trimmed_new_text.

    def apply_c2s_to_dataset(batch):
        """Run `c2s` on each (text, label) pair of the batch and collect both of its outputs."""
        outputs = [c2s(data, target) for data, target in zip(batch['text'], batch['label'])]
        concepts = [concept_list for concept_list, _ in outputs]
        new_text = [sentence for _, sentence in outputs]
        return {"text": batch['text'], "label": batch['label'], "concepts": concepts, "new_text": new_text}

    updated_dataset = dataset.map(apply_c2s_to_dataset, batched=True, batch_size=1)
    results.append(updated_dataset)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\fabri\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


Using fabriceyhc/bert-base-uncased-ag_news to rationalize keyphrase selections.




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Reusing dataset d_bpedia14 (C:\Users\fabri\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e)


Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Reusing dataset yahoo_answers_topics (C:\Users\fabri\.cache\huggingface\datasets\yahoo_answers_topics\yahoo_answers_topics\1.0.0\b2712a72fde278f1d6e96cc4f485fd89ed2f79ecb231441e13645b53da021902)


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Using fabriceyhc/bert-base-uncased-yahoo_answers_topics to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


Using fabriceyhc/bert-base-uncased-imdb to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Reusing dataset yelp_polarity (C:\Users\fabri\.cache\huggingface\datasets\yelp_polarity\plain_text\1.0.0\a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


Using fabriceyhc/bert-base-uncased-yelp_polarity to rationalize keyphrase selections.




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Reusing dataset amazon_polarity (C:\Users\fabri\.cache\huggingface\datasets\amazon_polarity\amazon_polarity\3.0.0\ac31acedf6cda6bc2aa50d448f48bbad69a3dd8efc607d2ff1a9e65c2476b4c1)


Using fabriceyhc/bert-base-uncased-amazon_polarity to rationalize keyphrase selections.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [5]:
# One DataFrame per task, each tagged with the name of the dataset it was built from.
dfs = [
    task_result.to_pandas().assign(dataset=task_name)
    for task_result, task_name in zip(results, tasks)
]

In [6]:
df = pd.concat(dfs, ignore_index=True)

In [7]:
df = df[['dataset', 'text', 'label', 'concepts', 'new_text']]
# df = df[['dataset', 'text', 'label', 'raw_concepts', 'raw_new_text', 'trimmed_concepts', 'trimmed_new_text']]

In [8]:
df.to_clipboard(excel=True)

In [9]:
df

Unnamed: 0,dataset,text,label,concepts,new_text
0,ag_news,African Union pledges to send more troops to D...,0,"[darfur, troops, union]",troops of armed force patrol the border with t...
1,ag_news,Rookie Leads Falcons Past Vikings 27-24 (AP) A...,1,"[rookie, falcons, quarterback, ap, receivers, ...",rookie throws a pass to a streaking quarterbac...
2,ag_news,Marsh amp; McLennan halts fees at center of p...,2,"[market, quot, mclennan, amp, marsh, insurance]",mclennan is an amphetamine amphetamine that is...
3,ag_news,Davenport remains on title streak Top-seeded L...,1,"[lindsay, victory, championship, davenport]",davenport celebrates his victory in the champi...
4,ag_news,Brazil Race Debrief: Fernandos sterling drive ...,1,"[race, season]",race at the end of the season.
5,ag_news,Halliburton says way clear for units to exit b...,2,"[company, bankruptcy]",venture funded company has filed for bankruptcy.
6,ag_news,Consumer Confidence Slips Again Consumer confi...,2,"[consumer, confidence]",how to increase the confidence of your consumers.
7,ag_news,Dodgers can clinch division title Friday BRIST...,1,"[division, dodgers, title, league, playoff]",cricket player in action during the league cha...
8,ag_news,Cricket: Tendulkar sets new best Sachin Tendul...,0,[bangladesh],bangladesh is one of the largest cities in the...
9,ag_news,IBM Said to Be Leaving the PC Business The com...,2,"[ibm, quot, business, pc]",biological species is a small business with a ...


# Improving C2S Concept Extraction

In [172]:
t = Concept2Sentence(dataset='amazon_polarity', return_concepts=True)

Using fabriceyhc/bert-base-uncased-amazon_polarity to rationalize keyphrase selections.




In [171]:
X = "What can I say? I know this movie from start to finish. It's hilarious. It's an strong link to my past and will change the way I view film in the future. Hypothetically speaking :) The down-fall? There's no Socrates Johnson!"
y = 1

X = "Hungarian GP, Friday Round-Up Fernando tenth and Jarno seventeenth but no cause for concern, while Pat Symonds explains the challenges of Fridays at the race."
y = 1

X = "I've had Cox internet/cable service at my home in Tempe since 2008. So far I feel like we're subsidizing a predatory monopoly by allowing Cox to stay in business preventing fair competition in the market.\n\nFor example:\n(a) Installation didn't work. They will only set up one jack for internet and will only use existing lines. So if you have cable jacks that are not hooked up to your current line, they are essentially useless. Cox won't activate a jack that is for example, in your living room, if it isn't already activated. \n\n(b) Service has interruptions and is not as fast as advertised. Because all the cable lines in Tempe are above ground, anytime a tree hits the power poles, you don't have internet for a few days.\n\n(c) Cable lines above ground also conduct lightning. When lightning hit near my neighbors house across the street nothing except my roommates computer was fried because I have every outlet on a surge protector. Where the ethernet cord was connected to my roommates computer, there was a visible burn and his laptop was destroyed.\n\n(d) Billing is not rational. If you decide to change the amount of service you subscribe to, they overbill you. You will spend several hours on the phone with customer service for your next three bills, because what they didn't overcharge you for the first months, they will overcharge you for in the following 3 months.\n\n(e) When there are billing problems they will turn off your account for a $14 difference even if it is not a month overdue  without giving you notice. Then they charge you a fee for the turn off. Then when you call to have it fixed, they tell you to call back when the billing department opens weekdays bankers hours.\n\n(f) Then they stop sending you a bill. I haven't received a bill for over 9 months. I just autopay from my checking account and have no idea what they are doing with my money. 
I've asked them to send it to me, but people on the phone never seem to be able to follow through.\n\n(g) To top it all off, I just received a threat of suit because Lions Gate alleges that I downloaded a movie, that I didn't download. Cox cable gave them my ID from my IP address so they could look at the transactions that occur from my IP address. When I responded that the download was false, I get an automatic reply telling me to encrypt my wireless and am directed to call their useless customer service line. -I never signed away my rights to privacy, and I don't download movies (have netflix, no need).\n\n(h) The local free TV stations can't be seen unless we buy the basic cable package. \n\nThe threat of suit  based on my alleged internet use is the last straw. Cox is a predatory business that uses it's monopoly on pre-existing cable lines to assert it's authority over our access to information on the internet, and what should be free television stations.  Cox cable didn't provide the infrastructure they are using; they shouldn't be allowed to benefit from the tax breaks we gave the phone and electricity companies to set up the network of  lines Cox now uses. They shouldn't be given this benefit, because they overcharge for their services, have predatory billing practices and fail to respect our individual privacy in our homes."
y = 0

X = "Siskel & Ebert were terrific on this show whether you agreed with them or not because of the genuine conflict their separate professional opinions generated. Roeper took this show down a notch or two because he wasn't really a film critic and because he substituted snide for opinionated. Now, when Ben Lyons comes on I feel like I'm watching 'Teen News' -- you know, that kids' news show, hosted by kids for kids? Manckiewitz is not much better. It's obvious they've encountered only a steady diet of mainstream films their entire lives. The idea that these two rank amateurs have anything of interest or consequence to say about motion pictures is ludicrous. If they are reviewing a non-formula film, they are completely lost. Show them something original and intelligent -- they just find it 'confusing'. Wait -- I think I get it ... ABC is owned by Disney ... Disney makes movies for kids. While Siskel, Ebert, and Roper promoted independent films and were only hit-or-miss with the big budget studio productions -- what a surprise: these two guys LOVE the big studio schlock and only manage to tolerate a few indies. Plus everyone knows the age group TV advertisers are aiming for. The blatant nepotism is the icing on the cake. In what alternate universe do these guys qualify as film critics?"
y = 0

X = "This is a great printer. I had trouble at first with the 4 x 6 photo tray - all the photos were jamming so I called HP customer care and was instantly able to talk to very nice person who gave me instructions to load the tray properly (the little oval slider should be even with the little line farthest away from the printer when using HP paper with the tear off tab) - NO MORE JAMS!! Cannot believe the quality of the photo printing - combined with my Canon Powershot G1 this amateur looks like she knows what she is doing."
y = 1

In [174]:
concepts, new_sentence = t(X, y)
concepts, new_sentence

(['load', 'properly'], 'A man is loading his load properly.')

In [136]:
t.extract_concepts(X, y)

['siskel']

In [162]:
t.generate_text_from_concepts(concepts)

'i like the idea of a headset.'

In [147]:
from nltk.stem import WordNetLemmatizer
import string

In [148]:
lemmatizer = WordNetLemmatizer()

In [41]:
concepts = ['sudan', 'libyan', "?!", 'tenth', 'fernando', 'friday', 'fridays', 'flying']
concept_lemmas = list(map(lemmatizer.lemmatize, concepts))
# Keep a concept only when no earlier concept in the list shares its lemma.
new_concepts = [
    concept
    for position, (concept, lemma) in enumerate(zip(concepts, concept_lemmas))
    if lemma not in concept_lemmas[:position]
]

In [42]:
# `string.punctuation` is one str, so `not in` is a substring test: an entry is dropped only
# if it occurs verbatim inside that string. A multi-character token such as '?!' is kept.
[concept for concept in concepts if concept not in string.punctuation]

['sudan', 'libyan', '?!', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [44]:
# Deletion table built once: every character of string.punctuation is removed by translate().
strip_punct = str.maketrans('', '', string.punctuation)
concept_depunct = [concept.translate(strip_punct) for concept in concepts]
concept_depunct

['sudan', 'libyan', '', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

In [45]:
[c1 for c1, c2 in zip(concepts, concept_depunct) if len(c2) > 1]

['sudan', 'libyan', 'tenth', 'fernando', 'friday', 'fridays', 'flying']

# Bulk Dataset Transformation

In [None]:
def sibyl_dataset_transform(batch):
    """Feed a column-oriented batch through the module-level `sibyl_collator`.

    Args:
        batch: mapping with parallel sequences under the 'text' and 'label' keys.

    Returns:
        dict with keys 'text' and 'label' holding the two values returned by
        `sibyl_collator`.
    """
    # Pivot the columns into one {'text', 'label'} record per example.
    records = [
        {'text': text_in, 'label': label_in}
        for text_in, label_in in zip(batch['text'], batch['label'])
    ]
    text, label = sibyl_collator(records)
    return {"text": text, "label": label}

In [79]:
task = "imdb"
t = "Concept2Sentence"
dataset = load_dataset(task, split="train[:20]")

# Per-task metadata: input column names, number of classes and task family.
task_to_keys = {
    "ag_news": {"keys": ("text", None), "num_classes": 4, "task_type": "topic"},
    "dbpedia_14": {"keys": ("text", None), "num_classes": 14, "task_type": "topic"},
    "yahoo_answers_topics": {"keys": ("text", None), "num_classes": 10, "task_type": "topic"},
    "imdb": {"keys": ("text", None), "num_classes": 2, "task_type": "sentiment"},
}

task_meta = task_to_keys[task]
sentence1_key, sentence2_key = task_meta["keys"]
num_classes = task_meta["num_classes"]
task_type = task_meta["task_type"]

transform = Concept2Sentence(dataset=task)

# (num_sampled_INV, num_sampled_SIB, label_type) for each sampling mode. Any other value of
# `t`, including the "Concept2Sentence" set above, samples nothing and keeps hard labels.
sampling_modes = {
    "INV": (2, 0, "hard"),
    "SIB": (0, 2, "soft"),
    "INVSIB": (1, 1, "soft"),
}
num_sampled_INV, num_sampled_SIB, label_type = sampling_modes.get(t, (0, 0, "hard"))

sibyl_collator = SibylCollator(
    sentence1_key=sentence1_key,
    sentence2_key=sentence2_key,
    tokenize_fn=None,
    transform=transform,
    num_sampled_INV=num_sampled_INV,
    num_sampled_SIB=num_sampled_SIB,
    dataset=task,
    task_type=task_type,
    tran_type=None,
    # NOTE(review): the local `label_type` only feeds `one_hot` below; confirm that passing
    # None here is intended.
    label_type=None,
    one_hot=label_type != "hard",
    transform_prob=1.0,
    target_pairs=[],
    target_prob=0.0,
    reduce_mixed=False,
    num_classes=num_classes,
    return_tensors='np',
    return_text=True,
    num_outputs=15,
    num_jobs=2,
)



Using fabriceyhc/bert-base-uncased-imdb to rationalize keyphrase selections.




SibylCollator initialized with Concept2Sentence


In [None]:
# Time one full pass of the transform over `dataset` for each worker count.
for i in [1, 2, 4, 6, 8, 10]:
    sibyl_collator.num_jobs = i
    # perf_counter() is the monotonic, high-resolution clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # Turn the datasets cache off so every pass really recomputes the transform instead of
    # possibly reloading a result cached by an earlier pass or an earlier run of this cell.
    updated_dataset = dataset.map(
        sibyl_dataset_transform,
        batched=True,
        batch_size=5,
        load_from_cache_file=False,
    )
    elapsed = time.perf_counter() - start
    print('num_jobs:', sibyl_collator.num_jobs, 'time:', elapsed)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

In [38]:
test_dataset = load_dataset(task, split='test')

Reusing dataset imdb (C:\Users\fabri\.cache\huggingface\datasets\imdb\plain_text\1.0.0\e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [43]:
test_dataset.to_pandas()

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [78]:
t = Concept2Sentence(dataset='dbpedia_14', return_concepts=True)

Using fabriceyhc/bert-base-uncased-dbpedia_14 to rationalize keyphrase selections.


In [85]:
X = ["Allez Oop is a 1934 American short comedy film starring Buster Keaton."]
y = 1
task_config = {'input_idx': [1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'topic'}

In [86]:
X_, y_ = t.transform_Xy(X, y, task_config)
X_, y_

('american during a match against country in the summer of 1934.', 1)

In [209]:
t = ConceptMix(dataset='sst2', generation_type='joint')

In [211]:
texts = ["I hate how long loading the models takes to select better keyphrases.",
         "I really love this movie a lot!"]
targets = [0, 1]
batch = (texts, targets)
new_text, new_target = t(batch, num_classes=2)
print(new_text, new_target)

['how to get rid of hate for a long time.', 'i really hate these keyphrases.'] [[1.0, 0.0], [0.6666666865348816, 0.3333333432674408]]


In [42]:
ContractContractions().get_task_configs(task_name='similarity').to_dict(orient='records')

[{'input_idx': [1, 0],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [0, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'},
 {'input_idx': [1, 1],
  'tran_type': 'INV',
  'label_type': 'hard',
  'task_name': 'similarity'}]

In [27]:
def invert_label(y, soften=False, num_classes=2):
    """Reverse the class order of a label vector.

    Args:
        y: label to invert. A numpy array is used as-is; anything else is first
            converted with `soften_label(y, num_classes)` (defined outside this cell;
            presumably it returns a length-`num_classes` vector, TODO confirm).
        soften: when True return the reversed vector itself; when False return the
            index of its largest entry.
        num_classes: only forwarded to `soften_label` for non-array input.

    Returns:
        The reversed array if `soften` is true, otherwise `np.argmax` of it.

    NOTE(review): reversal sends index k to len(y) - 1 - k. With two classes that swaps
    them; with more classes it only mirrors the index (the middle class of an odd-sized
    vector maps onto itself). Confirm this is the intended multiclass behaviour.
    """
    distribution = y if isinstance(y, np.ndarray) else soften_label(y, num_classes)
    mirrored = distribution[::-1]
    return mirrored if soften else np.argmax(mirrored)

In [41]:
invert_label(0, soften=False, num_classes=5)

4

In [26]:
y

2

In [24]:
y = np.array([0, 1, 0, 0])

In [20]:
1-y

array([1, 0, 1, 1])

In [21]:
(1-y).sum()

3

In [22]:
np.array([1, 0, 1, 1]) / 3

array([0.33333333, 0.        , 0.33333333, 0.33333333])

array([0.5, 0. , 0.5])