In [30]:
import os

# Later cells read files through paths relative to the working directory
# (e.g. 'data/processed/...'), so step up one level when the kernel was
# started from a directory whose name ends with 'notebooks'.
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
    
from IPython.core.interactiveshell import InteractiveShell
# NOTE(review): per the IPython docs, 'all' makes the shell display the value of
# every top-level expression in a cell rather than only the last one - confirm
# this is still wanted, since it affects the output of every cell below.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
from transformers import AutoTokenizer

# Load the TinyBERT tokenizer and register '[COL]' and '[VAL]' as additional
# special tokens; the training texts loaded further down contain these markers.
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', 
                                          additional_special_tokens=('[COL]', '[VAL]'))

# Sanity check: show how a made-up compound word is split into '##' word pieces.
tokenizer('Some test text with a complicatedbombastic word').tokens()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['[CLS]',
 'some',
 'test',
 'text',
 'with',
 'a',
 'complicated',
 '##bo',
 '##mba',
 '##stic',
 'word',
 '[SEP]']

In [4]:
import pandas as pd

# Load the pre-training split of the amazon_google contrastive data; the path is
# relative to the working directory set in the first cell.
train_set = pd.read_csv('data/processed/contrastive/amazon_google/pretrain-train.csv')
train_set.head()

Unnamed: 0,text,cluster_id,source
0,[COL] title [VAL] sony vegas movie studio + dv...,350,#1
1,[COL] title [VAL] adobe photoshop elements 4.0...,1030,#1
2,[COL] title [VAL] mcafee virusscan plus 3 user...,239,#1
3,[COL] title [VAL] microsoft windows vista home...,506,#1
4,[COL] title [VAL] punch ! professional home de...,444,#1


In [5]:
# Store, for every row, the list of word-piece tokens the tokenizer produces for
# its text (the list includes the markers the tokenizer adds, such as '[CLS]').
train_set['text_tokenized'] = train_set['text'].apply(lambda x: tokenizer(x).tokens())
train_set.head()

Unnamed: 0,text,cluster_id,source,text_tokenized
0,[COL] title [VAL] sony vegas movie studio + dv...,350,#1,"[[CLS], [COL], title, [VAL], sony, vegas, movi..."
1,[COL] title [VAL] adobe photoshop elements 4.0...,1030,#1,"[[CLS], [COL], title, [VAL], adobe, photos, ##..."
2,[COL] title [VAL] mcafee virusscan plus 3 user...,239,#1,"[[CLS], [COL], title, [VAL], mca, ##fe, ##e, v..."
3,[COL] title [VAL] microsoft windows vista home...,506,#1,"[[CLS], [COL], title, [VAL], microsoft, window..."
4,[COL] title [VAL] punch ! professional home de...,444,#1,"[[CLS], [COL], title, [VAL], punch, !, profess..."


In [57]:
from typing import List

def unite_tokens(tokens: List[str], index: int) -> List[tuple]:
    """Merge the word piece at ``index`` with the pieces directly to its left.

    Starting from ``tokens[index]``, the function walks leftwards for as long as
    the piece just merged is a continuation piece (starts with ``'##'``). Each
    step prepends one more token and records the string built so far, so a word
    split into three pieces yields both the two-piece and the three-piece merge.

    Args:
        tokens: Word-piece tokens of one text, in order.
        index: Position of the right-most piece of the merges to build.

    Returns:
        A list of ``(merged_token, seed)`` tuples ordered from the shortest merge
        to the longest, where ``seed`` is the tuple of original pieces that were
        joined. The list is empty when ``tokens[index]`` is not a continuation
        piece or has no token to its left.
    """
    merged = tokens[index]
    pieces = [tokens[index]]
    result = []
    i = index - 1
    # ``i >= 0`` (not ``i > 0``) lets the token at position 0 take part in a
    # merge, so token lists without a leading '[CLS]' are handled as well. For
    # tokenizer output nothing changes: the piece after '[CLS]' never starts
    # with '##', so the walk stops before reaching position 0.
    while i >= 0 and tokens[i + 1].startswith('##'):
        # Drop the leading '##' of what has been merged so far before gluing the
        # next token in front of it.
        merged = tokens[i] + merged[2:]
        pieces.insert(0, tokens[i])
        result.append((merged, tuple(pieces)))
        i -= 1

    return result

def extract_possible_concatenations(tokens: List[str]) -> List[str]:
    """Collect merge candidates for every continuation piece in ``tokens``.

    Each token after the first that starts with ``'##'`` is handed to
    ``unite_tokens`` and the lists it returns are concatenated in token order.
    The items of the returned list are therefore whatever ``unite_tokens``
    produces for those positions.
    """
    candidates = []
    # Position 0 is skipped: a piece there has nothing to its left to merge with.
    continuation_positions = (
        position
        for position in range(1, len(tokens))
        if tokens[position].startswith('##')
    )
    for position in continuation_positions:
        candidates.extend(unite_tokens(tokens, position))

    return candidates
        
# For every row, collect the candidate merged tokens found in its tokenized text
# (an empty list when the text contains no '##' continuation pieces).
train_set['potential_new_tokens'] = train_set['text_tokenized'].apply(extract_possible_concatenations)
train_set.head()

Unnamed: 0,text,cluster_id,source,text_tokenized,potential_new_tokens
0,[COL] title [VAL] sony vegas movie studio + dv...,350,#1,"[[CLS], [COL], title, [VAL], sony, vegas, movi...",[]
1,[COL] title [VAL] adobe photoshop elements 4.0...,1030,#1,"[[CLS], [COL], title, [VAL], adobe, photos, ##...","[(photoshop, (photos, ##hop)), (29180, (291, #..."
2,[COL] title [VAL] mcafee virusscan plus 3 user...,239,#1,"[[CLS], [COL], title, [VAL], mca, ##fe, ##e, v...","[(mcafe, (mca, ##fe)), (##fee, (##fe, ##e)), (..."
3,[COL] title [VAL] microsoft windows vista home...,506,#1,"[[CLS], [COL], title, [VAL], microsoft, window...",[]
4,[COL] title [VAL] punch ! professional home de...,444,#1,"[[CLS], [COL], title, [VAL], punch, !, profess...",[]


In [79]:
# Flatten the per-row candidate lists into one candidate per row, discard rows
# that had no candidates, and count how many times each candidate occurs.
tokens_df = (
    train_set['potential_new_tokens']
    .rename('option')
    .to_frame()
    .explode('option')
    .dropna()
    .groupby('option')
    .option.count()
    .rename('count')
    .reset_index()
)
tokens_df.head()

Unnamed: 0,option,count
0,"(##000, (##00, ##0))",9
1,"(##00034, (##00, ##0, ##34))",1
2,"(##0006, (##00, ##0, ##6))",2
3,"(##0006w, (##00, ##0, ##6, ##w))",1
4,"(##0006wwc, (##00, ##0, ##6, ##w, ##wc))",1


In [80]:
# Number of distinct merge candidates (one row per candidate after the groupby).
len(tokens_df)

4289

In [81]:
# Sanity check: the groupby above leaves one row per option, so this should
# match len(tokens_df).
tokens_df['option'].nunique()

4289

In [82]:
# Minimum occurrence count a candidate needs in order to be kept: one occurrence
# per 200 training rows (0.5% of the row count, rounded down).
present_at_least = len(train_set) // 200

In [83]:
# Order candidates from most to least frequent and keep only those that reach
# the ``present_at_least`` occurrence threshold.
tokens_df = (
    tokens_df
    .sort_values('count', ascending=False)
    .loc[lambda frame: frame['count'] >= present_at_least]
)
# Ask for every surviving row (pandas still abbreviates long frames on display).
tokens_df.head(len(tokens_df))

Unnamed: 0,option,count
2994,"(cs3, (cs, ##3))",149
3706,"(photoshop, (photos, ##hop))",48
3785,"(quickbooks, (quick, ##books))",47
4012,"(syman, (sy, ##man))",46
1496,"(##mantec, (##man, ##tec))",46
...,...,...
3736,"(printmaster, (print, ##master))",13
2758,"(avanquest, (ava, ##n, ##quest))",13
3200,"(firewall, (fire, ##wall))",13
1555,"(##nquest, (##n, ##quest))",13


In [84]:
# Split every (merged_token, seed) option into two columns.
# ``assign`` returns a new frame instead of writing columns into the filtered
# slice produced by the previous cell, and ``map`` also copes with an empty
# frame: ``zip(*tokens_df['option'])`` raises ValueError when no candidate
# survives the frequency filter because there is nothing to unpack.
tokens_df = tokens_df.assign(
    token=tokens_df['option'].map(lambda option: option[0]),
    seed=tokens_df['option'].map(lambda option: option[1]),
)
tokens_df.head(50)

Unnamed: 0,option,count,token,seed
2994,"(cs3, (cs, ##3))",149,cs3,"(cs, ##3)"
3706,"(photoshop, (photos, ##hop))",48,photoshop,"(photos, ##hop)"
3785,"(quickbooks, (quick, ##books))",47,quickbooks,"(quick, ##books)"
4012,"(syman, (sy, ##man))",46,syman,"(sy, ##man)"
1496,"(##mantec, (##man, ##tec))",46,##mantec,"(##man, ##tec)"
4013,"(symantec, (sy, ##man, ##tec))",46,symantec,"(sy, ##man, ##tec)"
2976,"(corel, (core, ##l))",44,corel,"(core, ##l)"
3330,"(intuit, (int, ##uit))",42,intuit,"(int, ##uit)"
3123,"(emedia, (em, ##ed, ##ia))",40,emedia,"(em, ##ed, ##ia)"
3122,"(emed, (em, ##ed))",40,emed,"(em, ##ed)"


In [85]:
all_seeds = tokens_df['seed'].tolist()

# Group the seeds by the number of word pieces they contain.
clustered_seeds = {}
for seed in all_seeds:
    clustered_seeds.setdefault(len(seed), []).append(seed)

# A seed is "useless" when it is a partial merge of a longer seed that is also
# kept, i.e. when it equals a (k+1)-piece seed with its first or its last piece
# removed. Comparing tuples (instead of testing set inclusion) respects piece
# order and repetition, so e.g. ('##0', '##0') is no longer matched by every
# longer seed that merely contains '##0'.
useless_seeds = []
# Iterate the sizes in sorted order and skip (rather than stop at) sizes that
# have no next-larger group: the dict's insertion order follows the frequency
# ranking, so the first key is not guaranteed to be the smallest size.
for k in sorted(clustered_seeds):
    longer_seeds = clustered_seeds.get(k + 1)
    if not longer_seeds:
        continue

    # All k-piece prefixes and suffixes of the (k+1)-piece seeds; a set lookup
    # replaces the pairwise comparison and cannot report a seed twice.
    covered = {s[:-1] for s in longer_seeds} | {s[1:] for s in longer_seeds}
    useless_seeds.extend(s for s in clustered_seeds[k] if s in covered)

print(useless_seeds)

# Drop the partial merges; only the longest merges remain as new-token candidates.
tokens_df = tokens_df[~tokens_df['seed'].isin(useless_seeds)]
tokens_df.head(50)

[('sy', '##man'), ('##man', '##tec'), ('em', '##ed'), ('##ed', '##ia'), ('##py', '##r'), ('as', '##py'), ('##bel', '##ius'), ('si', '##bel'), ('web', '##ro'), ('##ro', '##ot'), ('ac', '##ro'), ('##ro', '##bat'), ('##tar', '##t'), ('jumps', '##tar'), ('##der', '##bund'), ('##dee', '##p'), ('river', '##dee'), ('bro', '##der'), ('mca', '##fe'), ('##fe', '##e'), ('retro', '##sp'), ('##sp', '##ot'), ('micro', '##sp'), ('##sp', '##ect'), ('##er', '##ve'), ('##put', '##race'), ('arcs', '##er'), ('com', '##put'), ('##co', '##on'), ('ty', '##co'), ('##n', '##quest'), ('ava', '##n')]


Unnamed: 0,option,count,token,seed
2994,"(cs3, (cs, ##3))",149,cs3,"(cs, ##3)"
3706,"(photoshop, (photos, ##hop))",48,photoshop,"(photos, ##hop)"
3785,"(quickbooks, (quick, ##books))",47,quickbooks,"(quick, ##books)"
4013,"(symantec, (sy, ##man, ##tec))",46,symantec,"(sy, ##man, ##tec)"
2976,"(corel, (core, ##l))",44,corel,"(core, ##l)"
3330,"(intuit, (int, ##uit))",42,intuit,"(int, ##uit)"
3123,"(emedia, (em, ##ed, ##ia))",40,emedia,"(em, ##ed, ##ia)"
3282,"(hoyle, (ho, ##yle))",39,hoyle,"(ho, ##yle)"
3641,"(onone, (ono, ##ne))",37,onone,"(ono, ##ne)"
3699,"(peachtree, (peach, ##tree))",34,peachtree,"(peach, ##tree)"


In [91]:
vocab = tokenizer.get_vocab()

def translate_tokens_to_ids(tokens: List[str]) -> List[int]:
    """Look up the vocabulary id of every token in ``tokens``, preserving order.

    Reads the notebook-level ``vocab`` mapping; a token that is missing from it
    raises ``KeyError``.
    """
    token_ids = []
    for token in tokens:
        token_ids.append(vocab[token])
    return token_ids

# Numeric form of each seed: the ids of the word pieces the new token is built from.
tokens_df['seed_numeric'] = tokens_df['seed'].apply(translate_tokens_to_ids)
tokens_df.head(50)

Unnamed: 0,option,count,token,seed,seed_numeric
2994,"(cs3, (cs, ##3))",149,cs3,"(cs, ##3)","[20116, 2509]"
3706,"(photoshop, (photos, ##hop))",48,photoshop,"(photos, ##hop)","[7760, 18471]"
3785,"(quickbooks, (quick, ##books))",47,quickbooks,"(quick, ##books)","[4248, 17470]"
4013,"(symantec, (sy, ##man, ##tec))",46,symantec,"(sy, ##man, ##tec)","[25353, 2386, 26557]"
2976,"(corel, (core, ##l))",44,corel,"(core, ##l)","[4563, 2140]"
3330,"(intuit, (int, ##uit))",42,intuit,"(int, ##uit)","[20014, 14663]"
3123,"(emedia, (em, ##ed, ##ia))",40,emedia,"(em, ##ed, ##ia)","[7861, 2098, 2401]"
3282,"(hoyle, (ho, ##yle))",39,hoyle,"(ho, ##yle)","[7570, 12844]"
3641,"(onone, (ono, ##ne))",37,onone,"(ono, ##ne)","[21058, 2638]"
3699,"(peachtree, (peach, ##tree))",34,peachtree,"(peach, ##tree)","[18237, 13334]"


In [92]:
# Register the selected merged tokens with the tokenizer; the displayed value is
# the call's return value.
# NOTE(review): the tokens are added with special_tokens=True, and the list
# includes at least one continuation-style entry ('##soft' in the mapping shown
# below) - confirm both are intended.
# NOTE(review): the model inspected later in this notebook reports a 30522-row
# word-embedding table, so the ids of the added tokens (e.g. 30571) fall outside
# it - TODO confirm the embeddings are resized before these ids are used.
tokenizer.add_tokens(tokens_df['token'].tolist(), special_tokens=True)

51

In [94]:
# Spot check: look up the id the tokenizer now reports for the token 'csdc'.
tokenizer.get_vocab()['csdc']

30571

In [96]:
# Mapping from each new token to the ids of the word pieces it was merged from.
# NOTE(review): presumably meant for deriving the new tokens' embeddings from
# their pieces - confirm against the code that consumes it.
dict(zip(tokens_df['token'].tolist(), tokens_df['seed_numeric'].tolist()))

{'cs3': [20116, 2509],
 'photoshop': [7760, 18471],
 'quickbooks': [4248, 17470],
 'symantec': [25353, 2386, 26557],
 'corel': [4563, 2140],
 'intuit': [20014, 14663],
 'emedia': [7861, 2098, 2401],
 'hoyle': [7570, 12844],
 'onone': [21058, 2638],
 'peachtree': [18237, 13334],
 'aspyr': [2004, 7685, 2099],
 'sibelius': [9033, 8671, 4173],
 'webroot': [4773, 3217, 4140],
 'acrobat': [9353, 3217, 14479],
 'sonicwall': [12728, 9628],
 'steinberg': [14233, 4059],
 'fogware': [9666, 8059],
 'freeverse': [2489, 16070],
 'jumpstart': [14523, 7559, 2102],
 'quicken': [4248, 2368],
 'cakewalk': [9850, 17122],
 'abacus': [19557, 7874],
 '##soft': [6499, 6199],
 'antivirus': [3424, 23350],
 'upsell': [11139, 5349],
 '1u': [1015, 2226],
 'riverdeep': [2314, 26095, 2361],
 'broderbund': [22953, 4063, 27265],
 'vpn': [21210, 2078],
 'allume': [2035, 17897],
 'essentials': [6827, 2015],
 'exec': [4654, 8586],
 'mcafee': [22432, 7959, 2063],
 'laptops': [12191, 2015],
 'sweeper': [11740, 2121],
 'mav

# Fiddling with the transformer

In [86]:
from transformers import AutoModel

# Load the encoder from the same checkpoint name that the tokenizer was loaded from.
transformer = AutoModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Inspect the word-embedding table so its size can be compared with the
# tokenizer's vocabulary size.
transformer.embeddings.word_embeddings

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'fit_denses.2.bias', 'fit_denses.3.weight', 'fit_denses.0.weight', 'cls.predictions.transform.LayerNorm.bias', 'fit_denses.1.bias', 'fit_denses.3.bias', 'fit_denses.4.weight', 'cls.predictions.decoder.weight', 'fit_denses.0.bias', 'cls.seq_relationship.weight', 'fit_denses.2.weight', 'fit_denses.1.weight', 'cls.predictions.transform.LayerNorm.weight', 'fit_denses.4.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing

Embedding(30522, 312, padding_idx=0)

In [88]:
# Number of entries in the tokenizer's vocabulary, for comparison with the number
# of rows of the embedding table shown above.
len(tokenizer.get_vocab())

30524

In [90]:
# Check how a literal '##' typed into raw text is tokenized, i.e. whether a
# continuation piece can be requested by writing it out in the input string.
tokenizer('cs ##3').tokens()

['[CLS]', 'cs', '#', '#', '3', '[SEP]']