In [1]:
from sys import executable as python

# !{python} -m pip install -q -U pip seqeval plotly plotly_utils spacy matplotlib spacy datasets
# !{python} -m spacy download en_core_web_sm
# !git clone https://github.com/huggingface/transformers.git
#!{python} -m pip install -e transformers

In [6]:
!{python} -m pip freeze seqeval plotly plotly_utils spacy matplotlib spacy datasets

argon2-cffi==20.1.0
async-generator==1.10
attrs==20.3.0
backcall==0.2.0
bleach==3.2.2
blis==0.7.4
catalogue==2.0.1
certifi==2020.12.5
cffi==1.14.4
chardet==4.0.0
click==7.1.2
contextvars==2.4
cycler==0.10.0
cymem==2.0.5
dataclasses==0.8
datasets==1.2.1
decorator==4.4.2
defusedxml==0.6.0
dill==0.3.3
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
entrypoints==0.3
filelock==3.0.12
idna==2.10
immutables==0.14
importlib-metadata==3.4.0
ipykernel==5.4.3
ipython==7.16.1
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2==2.11.2
joblib==1.0.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.11
jupyter-console==6.2.0
jupyter-core==4.7.0
jupyter-http-over-ws==0.0.8
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
kiwisolver==1.3.1
MarkupSafe==1.1.1
matplotlib==3.3.4
mistune==0.8.4
multiprocess==0.70.11.1
murmurhash==1.0.5
nbclient==0.5.1
nbcon

In [4]:
!{python} -m pip freeze > requirements.txt

In [5]:
from transformers import pipeline
from seqeval.metrics import f1_score, precision_score, recall_score,\
                            performance_measure
import csv, json, pickle, spacy, requests, logging
import pandas as pd
from urllib.request import urlopen
from sys import stdout
import plotly.express as px
from matplotlib import pyplot as plt
import numpy as np

logging.disable(logging.WARNING)

from tqdm import tqdm as tq
def tqdm(iter, **kwargs):
    return tq(list(iter), kwargs, position=0, leave=True, file=stdout)

DATA_DIR = 'https://raw.githubusercontent.com/IntelLabs/nlp-architect/libert/nlp_architect/models/libert/data/'
CSV_DATA_DIR = DATA_DIR + 'csv/spacy/domains_all/'
JSON_DATA_DIR = DATA_DIR + 'Dai2019/semeval14/'

def load_dataset(csv_url, json_url, multi_token=False):
    json_rows = requests.get(JSON_DATA_DIR + json_url).text.splitlines()

    row_break = "\r\n_,_,_,_,_,_,_\r\n_,_,_,_,_,_,_\r\n"
    csv_text = requests.get(CSV_DATA_DIR + csv_url).text
    csv_text = csv_text[csv_text.index('\n') + 1:] # skip csv header
    csv_sents = csv_text.split(row_break)[:len(json_rows)] # clip train data
    ds, aspects = [], []

    for json_row, csv_sent in zip(json_rows, csv_sents):
        json_obj = json.loads(json_row)
        text = json_obj['text']
        tokens, labels, aspects = [], [], []
        for token, label, *_ in csv.reader(csv_sent.splitlines()):
            if label == 'I-ASP':
                aspects = None
                break
            tokens.append(token)
            if label == 'B-ASP':
                aspects.append(token)
                labels.append(label)
            else:
                labels.append('O')
        if aspects == None and not multi_token:
            continue
        if multi_token and 'terms' in json_obj:
            aspects = [t['term'] for t in json_obj['terms']]
        ds.append((text, tokens, labels, aspects))
    return ds

# res_ds = load_dataset('restaurants.csv', "restaurants/restaurants_train_sents.json")
# lap_ds = load_dataset('laptops.csv', "laptops/laptops_train_sents.json")
# domain_ds = {'res': res_ds, 'lap': lap_ds}

# print(f'RESTAURANTS (size={len(res_ds)}):\n')
# for ex in res_ds[:5]:
#     print(ex[0], ex[3])

# print(f'\n\nLAPTOPS (size={len(lap_ds)}):\n')
# for ex in lap_ds[:5]:
#     print(ex[0], ex[3])

# train_size = 100
# res_train, res_test = res_ds[:train_size], res_ds[train_size:]
# lap_train, lap_test = lap_ds[:train_size], lap_ds[train_size:]

# models_dict = {}

In [6]:
res_ds_multi = load_dataset('restaurants.csv', "restaurants/restaurants_train_sents.json", multi_token=True)
lap_ds_multi = load_dataset('laptops.csv', "laptops/laptops_train_sents.json", multi_token=True)

print(f'RESTAURANTS (size={len(res_ds_multi)}):\n')
for ex in res_ds_multi:
    if ex[3] and ' ' in ex[3][0]:
        print(ex[0], ex[3])

print(f'\n\nLAPTOPS (size={len(lap_ds_multi)}):\n')
for ex in lap_ds_multi:
    if ex[3] and ' ' in ex[3][0]:
        print(ex[0], ex[3])

RESTAURANTS (size=3044):

Our agreed favorite is the orrechiete with sausage and chicken (usually the waiters are kind enough to split the dish in half so you get to sample both meats). ['orrechiete with sausage and chicken', 'waiters', 'meats', 'dish']
All the money went into the interior decoration, none of it went to the chefs. ['interior decoration', 'chefs']
I asked for seltzer with lime, no ice. ['seltzer with lime']
this little place has a cute interior decor and affordable city prices. ['interior deco', 'prices']
The fried rice is amazing here. ['fried rice']
it's a perfect place to have a amanzing indian food. ['indian food']
At the end you're left with a mild broth with noodles that you can slurp out of a cup. ['broth with noodles']
The wine list is excellent. ['wine list']
Ive been to many Thai restaurants in Manhattan before, and Toons is by far the best Thai food Ive had (except for my mom's of course). ['Thai food']
They wouldnt even let me finish my glass of wine before 

In [7]:
res_ds = load_dataset('restaurants.csv', "restaurants/restaurants_train_sents.json")
lap_ds = load_dataset('laptops.csv', "laptops/laptops_train_sents.json")
domain_ds = {'res': res_ds, 'lap': lap_ds, 'res_multi': res_ds_multi, 'lap_multi': lap_ds_multi}
domain_names = {'res': 'Restaurants', 'lap': 'Laptops', 'res_multi': 'Restaurants (w/ multi-token aspects)',
               'lap_multi': 'Laptops (w/ multi-token aspects)'}

print(f'RESTAURANTS (size={len(res_ds)}):\n')
for ex in res_ds[:5]:
    print(ex[0], ex[3])

print(f'\n\nLAPTOPS (size={len(lap_ds)}):\n')
for ex in lap_ds[:5]:
    print(ex[0], ex[3])

train_size = 100
res_train, res_test = res_ds[:train_size], res_ds[train_size:]
lap_train, lap_test = lap_ds[:train_size], lap_ds[train_size:]

models_dict = {}

RESTAURANTS (size=2358):

But the staff was so horrible to us. ['staff']
To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora. ['food']
The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not. ['food', 'kitchen', 'menu']
Where Gabriela personaly greets you and recommends you what to eat. []
For those that go once and don't enjoy it, all I can say is that they just don't get it. []


LAPTOPS (size=2361):

I bought a HP Pavilion DV4-1222nr laptop and have had so many problems with the computer. []
I investigated netbooks and saw the Toshiba NB305-N410BL. []
The other day I had a presentation to do for a seminar at a large conference in town- lots of people, little time to prep and have to set up a computer to a projector, etc. []
it is of high quality, has a killer GUI, is extremely stable, is hi

In [8]:
nlp = spacy.load("en_core_web_sm", \
             disable=["parser", "ner", "entity_linker", "textcat",
                      "entity_ruler", "sentencizer",
                      "merge_noun_chunks", "merge_entities",
                      "merge_subtokens"])
    
def lemma(nlp, token):
    return nlp(token)[0].lemma_.lower()
    
def aspect_dist(domain, percentiles=(0.9, 0.925, 0.95, 0.975), min_freq=10, lemmas=False):
    aspects_list = [lemma(nlp, a) if lemmas else a for ex in domain_ds[domain] for a in ex[3]]
    aspects_df = pd.DataFrame({'aspect': pd.Series(aspects_list).values})
    
#     px.histogram(aspects_df, log_y=True, width=1080, height=400)\
#         .update_xaxes(categoryorder="total descending")\
#         .update_layout(
#             title_text=f"Aspect Distribution - {domain_names[domain]}", title_x=.45,
#             margin=dict(l=10, r=1, t=40, b=20),
#             paper_bgcolor="LightSteelBlue")\
#         .show()

#     px.histogram(aspects_df.groupby('aspect').filter(lambda x : (x['aspect'].count()>=10).any()),
#                  log_y=True, width=1080, height=400)\
#         .update_xaxes(categoryorder="total descending")\
#         .update_layout(
#             title_text=f"Aspect Distribution - {domain_names[domain]}, frequency > 10", title_x=.45,
#             margin=dict(l=10, r=1, t=40, b=20),
#             paper_bgcolor="LightSteelBlue")\
#         .show()
    df_other = aspects_df.copy()
    series = pd.value_counts(aspects_df['aspect'])
    mask = (series/series.sum() * 100).lt(1)
    df_other['aspect'] = np.where(df_other['aspect'].isin(series[mask].index), 'Other', df_other['aspect'])
    df_count = df_other.value_counts().reset_index(name='count')
    px.pie(df_count, values='count', names='aspect')\
    .update_layout(
            title_text=f"Aspect Distribution - {domain_names[domain]} (Other < 2%)", title_x=.45,
            margin=dict(l=10, r=1, t=40, b=20),
            paper_bgcolor="LightSteelBlue")\
    .update_traces(textposition='inside').show()

    return aspects_df.groupby('aspect').size().describe(percentiles=percentiles)

In [37]:
aspect_dist("res_multi", lemmas=False)

count    1295.000000
mean        2.856371
std        12.657029
min         1.000000
50%         1.000000
90%         4.000000
92.5%       5.000000
95%         8.300000
97.5%      16.650000
max       358.000000
dtype: float64

In [38]:
aspect_dist("lap_multi", lemmas=False)

count    1048.000000
mean        2.264313
std         4.978635
min         1.000000
50%         1.000000
90%         3.300000
92.5%       4.475000
95%         7.000000
97.5%      12.825000
max        59.000000
dtype: float64

In [39]:
aspect_dist("res_multi", lemmas=True)

count    684.000000
mean       5.407895
std       20.052391
min        1.000000
50%        1.000000
90%       10.000000
92.5%     13.775000
95%       18.700000
97.5%     38.925000
max      393.000000
dtype: float64

In [40]:
aspect_dist("lap_multi", lemmas=True)

count    523.000000
mean       4.537285
std        9.475925
min        1.000000
50%        1.000000
90%       10.800000
92.5%     13.000000
95%       18.900000
97.5%     27.950000
max      108.000000
dtype: float64

In [338]:
aspect_dist("res", lemmas=False)

count    310.000000
mean       6.638710
std       23.745927
min        1.000000
50%        1.000000
90%       13.100000
92.5%     17.000000
95%       28.100000
97.5%     38.375000
max      329.000000
dtype: float64

In [325]:
aspect_dist("lap", lemmas=True)

count    230.000000
mean       4.560870
std        7.878069
min        1.000000
50%        2.000000
90%       10.100000
92.5%     13.825000
95%       19.100000
97.5%     29.650000
max       54.000000
dtype: float64

In [9]:
P1 = "So, the <mask> is the interesting aspect."

P2 = "So, the interesting aspect is <mask>."

P3 = "So, the <mask> are the interesting aspect."

P4 = "So, this is my opinion on <mask>."

P5 = "So, my review focuses on the <mask>."

In [10]:
def get_fm_pipeline(model, device=3):
    if model in models_dict:
        fm_model = models_dict[model]
    else:
        print(f"\nLoading {model} fill-mask pipeline...\n")
        stdout.flush()
        fm_model = pipeline('fill-mask', model=model, framework="pt", device=device)
        models_dict[model] = fm_model
    return fm_model

def run_example(text, tokens, model, pattern, top_k=10, thresh=-1, target=True, device=3):
    hparams = locals()
    hparams.pop('text')
    hparams.pop('tokens')

    delim = ' ' if text[-1] in ('.', '!', '?') else '. '
    
    fm_pipeline = get_fm_pipeline(model)
    pattern = pattern.replace('<mask>', f"{fm_pipeline.tokenizer.mask_token}")
    preds_meta = fm_pipeline(delim.join([text, pattern]), top_k=top_k,
                         target=tokens if target else None)
    preds, valid_preds, valid_idx = [], [], set()

    for pred in preds_meta:
        pred_token, score = pred['token_str'].lstrip(), pred['score']
        preds.append(pred_token)

        if score > thresh:
            try:
                idx = tokens.index(pred_token)
                valid_idx.add(idx)
                valid_preds.append((pred_token, f"{score:.3f}"))
            except ValueError:
                pass

    pred_bio = ['B-ASP' if i in valid_idx else 'O' for i in range(len(tokens))]
    return preds, valid_preds, pred_bio, preds_meta, hparams

def run_ds_examples(ds, model, **kwargs):
    print(f"Pattern: {kwargs['pattern']}\n")
    for i, (text, tokens, gold_bio, aspects) in tqdm(enumerate(ds)):
        preds, valid_preds, pred_bio, _, _ = run_example(model=model, text=text, tokens=tokens, **kwargs)
        print(i, text)
        print(tokens)
        print(f'gold: {aspects}\ngold_bio: {gold_bio}\nvalid_preds: {valid_preds}\npreds: {preds}\npred_bio: {pred_bio}\n')
    
def eval_domain(domain, limit=None, **kwargs):
    all_preds_bio, all_preds, all_preds_meta, all_gold_bio = [], [], [], []
    for text, tokens, gold_bio, aspects in domain_ds[domain][:limit]:
        preds, _, pred_bio, preds_meta, hparams = run_example(text=text, tokens=tokens, **kwargs)
        all_preds.append(preds)
        all_preds_bio.append(pred_bio)
        all_preds_meta.append(preds_meta)
        all_gold_bio.append(gold_bio)

    with open(f'{domain}.pkl', 'wb') as f:
        pickle.dump((all_preds, all_preds_meta), f)

    return {'metrics': metrics(all_gold_bio, all_preds_bio, domain, **kwargs), 'hparams': hparams}

def metrics(gold, preds, domain, verbose=False, **kwargs):
    F, P, R, conf = (f(gold, preds) for f in (f1_score, precision_score,\
                     recall_score, performance_measure))
    if verbose:
        print(f'{domain}')
        print(f'F1: {F:.3f}, P: {P:.3f}, R: {R:.3f}, {conf}')
    return {'F1': F, 'Precision': P, 'Recall': R}

def post_process(domain, limit=None, thresh=-1, **kwargs):
    with open(f'{domain}.pkl', 'rb') as f:
        _, all_preds_meta = pickle.load(f)

    nlp = spacy.load("en_core_web_sm", \
                     disable=["parser", "ner", "entity_linker", "textcat",
                              "entity_ruler", "sentencizer",
                              "merge_noun_chunks", "merge_entities",
                              "merge_subtokens"])
    
    # TODO: lemmatize sentences, not token lists

    all_preds_bio, all_gold_bio = [], []
    for (text, tokens, gold_bio, aspects), preds_meta in \
        zip(domain_ds[domain][:limit], all_preds_meta[:limit]):

        pred_lems, valid_preds, token_lems = [], [], []
        for t in tokens:
            toks = list(nlp(t))
            token_lems.append(toks[0].lemma_ if toks else [''])

        valid_idx = set()
        for pred in preds_meta:
            pred_token = pred['token_str'].lstrip() #'Ġ'
            score = pred['score']
            if score > thresh:
                pred_as_tokens = list(nlp(pred_token))
                if pred_as_tokens:
                    pred_lem = pred_as_tokens[0].lemma_
                    try:
                        valid_idx.add(token_lems.index(pred_lem))
                    except ValueError:
                        pass
        pred_bio = ['B-ASP' if i in valid_idx else 'O' for i in range(len(tokens))]
        all_preds_bio.append(pred_bio)
        all_gold_bio.append(gold_bio)
    return {'metrics': metrics(all_gold_bio, all_preds_bio, domain)}

In [13]:
for model in ('checkpoint-1000',):
    run_ds_examples(res_train, 
                    model=model,
                    pattern=P5,
                    top_k=10);

Pattern: So, my review focuses on the <mask>.

  0%|          | 0/100 [00:00<?, ?it/s]
Loading checkpoint-1000 fill-mask pipeline...

0 But the staff was so horrible to us.
['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.']
gold: ['staff']
gold_bio: ['O', 'O', 'B-ASP', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: [('staff', '0.007')]
preds: ['food', 'prices', 'price', 'service', 'staff', 'place', 'sushi', 'drinks', 'location', 'menu']
pred_bio: ['O', 'O', 'B-ASP', 'O', 'O', 'O', 'O', 'O', 'O']

  1%|          | 1/100 [00:05<08:22,  5.07s/it]1 To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.
['To', 'be', 'completely', 'fair', ',', 'the', 'only', 'redeeming', 'factor', 'was', 'the', 'food', ',', 'which', 'was', 'above', 'average', ',', 'but', 'could', "n't", 'make', 'up', 'for', 'all', 'the', 'other', 'deficiencies', 'of', 'Teodora', '.']
gold: ['food']
gold_bio: ['O', 'O',

 13%|█▎        | 13/100 [00:05<03:37,  2.50s/it]13 He has visited Thailand and is quite expert on the cuisine.
['He', 'has', 'visited', 'Thailand', 'and', 'is', 'quite', 'expert', 'on', 'the', 'cuisine', '.']
gold: ['cuisine']
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ASP', 'O']
valid_preds: []
preds: ['food', 'price', 'prices', 'service', 'place', 'sushi', 'dishes', 'staff', 'menu', 'fish']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

14 I would definitely recommend Mary's and am making it one of my regular neighborhood haunts.
['I', 'would', 'definitely', 'recommend', 'Mary', "'s", 'and', 'am', 'making', 'it', 'one', 'of', 'my', 'regular', 'neighborhood', 'haunts', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'menu', 'service', 'drinks', 'staff', 'sushi', 'place', 'decor']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

35 We usually go to the Chart House to celebrate a birthday or anniversary.
['We', 'usually', 'go', 'to', 'the', 'Chart', 'House', 'to', 'celebrate', 'a', 'birthday', 'or', 'anniversary', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'staff', 'menu', 'place', 'drinks', 'sushi', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

36 You'll adore it.
['You', "'ll", 'adore', 'it', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'menu', 'place', 'staff', 'drinks', 'sushi', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O']

 37%|███▋      | 37/100 [00:05<00:38,  1.63it/s]37 Two words: Free wine.
['Two', 'words', ':', 'Free', 'wine', '.']
gold: ['wine']
gold_bio: ['O', 'O', 'O', 'O', 'B-ASP', 'O']
valid_preds: [('wine', '0.007')]
preds: ['food', 'price', 'prices', 'service', 'drin

54 Not what I wanted to hear.
['Not', 'what', 'I', 'wanted', 'to', 'hear', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'sushi', 'drinks', 'place', 'menu', 'staff', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O']

 55%|█████▌    | 55/100 [00:06<00:09,  4.50it/s]55 Pricey, but worth a try, at least once.
['Pricey', ',', 'but', 'worth', 'a', 'try', ',', 'at', 'least', 'once', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'price', 'prices', 'service', 'sushi', 'cost', 'menu', 'drinks', 'staff', 'place']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

56 I've been there a few times, and I usually end up sitting next to some pretty odd people.
['I', "'ve", 'been', 'there', 'a', 'few', 'times', ',', 'and', 'I', 'usually', 'end', 'up', 'sitting', 'next', 'to', 'some', 'pretty', 'odd', 'people', '.']
gold: []
gold_bio: ['O',

71 Unfortunately, the food is outstanding, but everything else about this restaurant is the pits.
['Unfortunately', ',', 'the', 'food', 'is', 'outstanding', ',', 'but', 'everything', 'else', 'about', 'this', 'restaurant', 'is', 'the', 'pits', '.']
gold: ['food']
gold_bio: ['O', 'O', 'O', 'B-ASP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: [('food', '0.588')]
preds: ['food', 'prices', 'price', 'service', 'place', 'menu', 'location', 'staff', 'drinks', 'sushi']
pred_bio: ['O', 'O', 'O', 'B-ASP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

72 Deff recimmend Bukhara they also have opened two new locations guess ill try them out.
['Deff', 'recimmend', 'Bukhara', 'they', 'also', 'have', 'opened', 'two', 'new', 'locations', 'guess', 'ill', 'try', 'them', 'out', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'menu', 'su

90 I've never been treated so rudely.
['I', "'ve", 'never', 'been', 'treated', 'so', 'rudely', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'sushi', 'staff', 'place', 'menu', 'drinks', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

 91%|█████████ | 91/100 [00:06<00:00, 23.77it/s]91 I'll be back soon.
['I', "'ll", 'be', 'back', 'soon', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'sushi', 'staff', 'menu', 'drinks', 'place', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O']

92 Went inside and I started to get excited.
['Went', 'inside', 'and', 'I', 'started', 'to', 'get', 'excited', '.']
gold: []
gold_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
valid_preds: []
preds: ['food', 'prices', 'price', 'service', 'sushi', 'menu', 'place', 'drinks', 'staff', 'location']
pred_bio: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [10]:
def eval_all(**kwargs):    
    eval_res, post_res = {}, {}
    hparams = None
    for domain in tqdm(('res', 'lap')):
        for res, func in tqdm(zip((eval_res, post_res), (eval_domain, post_process))):
            func_res = func(domain, **kwargs)
            hparams = func_res.get('hparams', hparams)
            res[domain] = {'hparams': hparams, 'metrics': func_res['metrics']}
    return {'eval_res': eval_res, 'post_res': post_res, 'hparams': hparams}

def test_hparam(hparam, values, **kwargs):
    kwargs_dict = dict(kwargs)
    eval_res, post_res = [], []
    for v in tqdm(values):
        kwargs_dict[hparam] = v
        res = eval_all(**kwargs_dict)
        eval_res.append(res['eval_res'])
        post_res.append(res['post_res'])
    test_res = eval_res, post_res, hparam, values
    plot_all(*test_res)
    final_hparams = res['hparams']
    final_hparams.pop(hparam)
    print(final_hparams)

def plot_per_domain(res_dicts, hparam, values, title):
    fig, axs = plt.subplots(1, 2, figsize=(20, 6), sharey=True)
    fig.suptitle(title, fontsize=20)

    for i, domain in enumerate(['res', 'lap']):
        data = [d[domain]['metrics'] for d in res_dicts]
        df = pd.DataFrame(data, index=pd.Index(values, name=hparam))
        axs[i].set_yticks(np.linspace(.1, .9, num=33))
        axs[i].yaxis.set_tick_params(labelbottom=True)
        sns.lineplot(data=df, ax=axs[i]).set_title(domain_names[domain])

def plot_all(eval_res, post_res, hparam, values):
    data = []
    for res_dicts in eval_res, post_res:
        for i, domain in enumerate(['res', 'lap']):
            for res_dict, value in zip(res_dicts, values):
                for metric, score in res_dict[domain]['metrics'].items():
                    data.append({
                                hparam: value,
                                'domain': domain_names[domain],
                                'Metric': metric,
                                'score': score,
                                'Lemmatized': res_dicts == post_res})
                    
    fig = px.line(data, x=hparam, y='score', facet_col='domain', 
            line_dash='Lemmatized', color='Metric', line_shape='spline', hover_data={
                'Lemmatized': False,
                hparam: False,
                'domain': False,
                'Metric': True,
                'score': ":.3f"}).update_layout(title_text=f"Effect of '{hparam}' Value", title_x=0.5, hoverlabel=dict(
                    font_size=12,
                    font_family="Rockwell"),
                    font=dict(family="Courier New, monospace", size=18))\
            .update_traces(mode="markers+lines", hovertemplate="%{customdata[2]}=%{y:.3f}<extra></extra>")\
            .update_xaxes(showgrid=False, showspikes=True)\
            .show("notebook")

In [11]:
# test_hparam(hparam='model', values=('roberta-base', 'google/electra-small-generator', 'xlm-roberta-base'),
#             pattern=P5)

In [12]:
# test_hparam(hparam='model', values=('roberta-base', 'google/electra-small-generator', 'xlm-roberta-base'),
#             pattern=P1)

In [13]:
# test_hparam(hparam='top_k', values=(8, 13), model='roberta-base', pattern=P5, limit=100)

In [14]:
# test_hparam(hparam='pattern', values=(P1, P5), model='roberta-base', limit=10)

In [15]:
# test_thresh = test_hparam('thresh', (-.01, 0., 0.01, 0.02, 0.03, 0.04), pattern=P1, model="roberta-base")

## Masked Pattern Language Modeling

In [16]:
def apply_pattern(P1):
    def apply(text):
        delim = ' ' if text[-1] in ('.', '!', '?') else '. '
        return delim.join([text, P1])
    return apply

def mlm_splits(ds, pattern):
    P = apply_pattern(pattern)
    split_point = int(len(ds) * 0.7)
    mlm_train = ds[:split_point]
    mlm_test = ds[split_point:]
    with open('train.txt', 'w') as f:
        f.writelines(P(x) + '\n' for x, *_ in mlm_train)
    with open('test.txt', 'w') as f:
        f.writelines(P(x) + '\n' for x, *_ in mlm_test)    

In [17]:
# mlm_splits(res_ds, P5)

Get HF's MLM Script

In [18]:
# !wget -q https://raw.githubusercontent.com/huggingface/transformers/master/examples/language-modeling/run_mlm.py

In [3]:
!{python} run_mlm.py --help
# --validation_file test.txt --no_eval
# --help #--seed=42 --num_train_epochs --validation_split_percentage 

usage: run_mlm.py [-h] [--model_name_or_path MODEL_NAME_OR_PATH]
                  [--model_type MODEL_TYPE] [--config_name CONFIG_NAME]
                  [--tokenizer_name TOKENIZER_NAME] [--cache_dir CACHE_DIR]
                  [--no_use_fast_tokenizer]
                  [--use_fast_tokenizer [USE_FAST_TOKENIZER]]
                  [--model_revision MODEL_REVISION]
                  [--use_auth_token [USE_AUTH_TOKEN]]
                  [--dataset_name DATASET_NAME]
                  [--dataset_config_name DATASET_CONFIG_NAME]
                  [--train_file TRAIN_FILE]
                  [--validation_file VALIDATION_FILE]
                  [--overwrite_cache [OVERWRITE_CACHE]]
                  [--validation_split_percentage VALIDATION_SPLIT_PERCENTAGE]
                  [--max_seq_length MAX_SEQ_LENGTH]
                  [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
                  [--mlm_probability MLM_PROBABILITY]
                  [--line_by_line [LI

In [6]:
# !export CUDA_VISIBLE_DEVICES=1
from sys import executable as python
!{python} run_mlm.py --seed=42 --num_train_epochs=2 --learning_rate=5e-04 --line_by_line \
    --output_dir=. --train_file=train.txt --validation_file=test.txt --per_device_train_batch_size=1 \
    --model_type=roberta --model_name_or_path=roberta-base --do_train --do_eval \
    --overwrite_output_dir --overwrite_cache --evaluation_strategy=epoch

02/09/2021 16:01:01 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir=., overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=EvaluationStrategy.EPOCH, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0005, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Feb09_16-01-01_aipg-nlp01, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=., disable_tqdm=False, remove_unused_columns=True, 

## QA Method Feasability

In [21]:
# Q1 = 'Which aspect is being reviewed?'
# run_qa(res_ds[:20], Q1)

# def run_qa(ds, question):

#     print(f'Question: {question}')
#     for i, (text, tokens, gold_bio, aspects) in tqdm(enumerate(ds)):
#         QA_input = {
#             'question': question,
#             'context': text
#         }
#         print()
#         print(i, text)
#         print(f"gold: {aspects}")
#         print('\n'.join([str((t['answer'], t['score'])) for t in qa_roberta(QA_input, topk=4)]))
#         print()

# qa_model = f'deepset/roberta-base-squad2'
# qa_roberta = pipeline('question-answering', model=qa_model, framework="pt", device=0)

## DEBERTA - Not Working

In [23]:
# from transformers import AutoModelForMaskedLM, AutoTokenizer
# import torch
# model_name = 'microsoft/deberta-base'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # model = AutoModelForMaskedLM.from_pretrained(model_name)

# # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-base')
# # ids = tokenizer.tokenize("I have a new GPU!")
# # print([tokenizer.gpt2_tokenizer.decode([i]) for i in ids])
# # print(tokenizer.encode("I have a new GPU!", return_tensors="pt"))
# # print(tokenizer.tokenize("I have a new GPU!"))

# def mlm(text, pattern, tokenizer, model):
#     text += ' ' + pattern
#     print(f"Text: {text}")
#     tokenized = tokenizer.tokenize(text)
#     input = torch.tensor([[int(i) if i != '[MASK]' else tokenizer.mask_token_id for i in tokenized]])

# #     input = tokenizer.encode(text, return_tensors="pt")
#     print(f"input: {input}")
#     print(f"tokenizer.mask_token_id {tokenizer.mask_token_id}")
#     print(tokenizer.mask_token_id in input)
#     what, mask_token_index = torch.where(input == tokenizer.mask_token_id)
#     print(what, mask_token_index)
#     print(f"mask_token_index: {mask_token_index}")
#     token_logits = model(input).logits
#     print(f"token_logits: {token_logits}")
#     mask_token_logits = token_logits[0, mask_token_index, :]
#     print(f"mask_token_logits: {mask_token_logits}")
#     top_k = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
#     print(f"top_k: {top_k}")

#     for token in top_k:
#         print(tokenizer.gpt2_tokenizer.decode([token]))

# mlm("First sentence.", "[MASK] pattern.", tokenizer, model)

In [None]:
# model_name = 'microsoft/deberta-base'
# tokenizer = AutoTokenizer.from_pretrained(model_name)

res = pipeline('fill-mask', model=model_name, tokenizer=tokenizer, framework="pt", device=3)\
    ("But the staff was so horrible to us. So, my review focuses on the  [MASK].")

gp2_tokenizer = tokenizer.gpt2_tokenizer

[gp2_tokenizer.decode([d['token']]) for d in res]