# Character BERT

In this notebook we will try and use the character level BERT model to create embeddings of garble.

In [1]:
"""Basic example: getting word embeddings from CharacterBERT"""
from transformers import BertTokenizer
from modeling.character_bert import CharacterBertModel
from utils.character_cnn import CharacterIndexer

In [2]:
import pickle

In [3]:

# Example text
x = "house"
#

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
x = tokenizer.basic_tokenizer.tokenize(x)

# Add [CLS] and [SEP]
x = ['[CLS]', *x, '[SEP]']

In [5]:
# Convert token sequence into character indices
indexer = CharacterIndexer()
batch = [x]  # This is a batch with a single token sequence x
batch_ids = indexer.as_padded_tensor(batch)

In [6]:
# Load some pre-trained CharacterBERT
model = CharacterBertModel.from_pretrained(
    './pretrained-models/general_character_bert/')

In [7]:
def return_embedding(word, tokenizer):
    x = tokenizer.basic_tokenizer.tokenize(word)
    x = ['[CLS]', *x, '[SEP]']
    # Convert token sequence into character indices
    indexer = CharacterIndexer()
    batch = [x]  # This is a batch with a single token sequence x
    batch_ids = indexer.as_padded_tensor(batch)
    # Feed batch to CharacterBERT & get the embeddings
    embeddings_for_batch, _ = model(batch_ids)
    embeddings_for_x = embeddings_for_batch[0]
#     print('These are the embeddings produces by CharacterBERT (last transformer layer)')
#     for token, embedding in zip(x, embeddings_for_x):
#         print(token, embedding)
    return embeddings_for_x[1].detach().numpy()

In [8]:
return_embedding("house", tokenizer)

array([ 3.39125067e-01,  1.88909040e-03,  2.66949832e-01,  2.14349240e-01,
        2.29175404e-01, -2.42003016e-02, -1.12646602e-01,  3.13164413e-01,
       -6.71179742e-02, -5.26673377e-01, -4.83276933e-01,  5.62324934e-02,
       -1.98581278e-01,  7.04406649e-02, -2.80656159e-01,  1.46029249e-01,
        1.41989607e-02, -2.88872302e-01,  5.54671168e-01,  9.05460298e-01,
       -3.27440143e-01,  2.31643990e-01, -1.65029317e-01, -2.25597955e-02,
        3.63494664e-01,  1.12873101e+00, -2.02578176e-02, -2.07099557e-01,
       -2.77566373e-01,  2.63823479e-01, -4.01734263e-01, -1.53891578e-01,
        2.50293285e-01,  4.73870516e-01, -6.32535994e-01, -8.84413004e-01,
        1.86959371e-01, -2.80881673e-01,  8.27190056e-02,  2.47876197e-01,
        3.49640965e-01,  2.37101857e-02,  3.42199147e-01,  2.20760673e-01,
        2.46204942e-01, -3.36798936e-01,  5.03440537e-02, -1.00920987e+00,
        1.20637253e-01,  1.79463238e-01, -3.10113966e-01,  2.26338819e-01,
       -7.95056745e-02,  

In [9]:
nonsense = []

In [10]:
import pandas as pd

In [11]:
df = pd.read_csv("nonsense 20k.csv")

In [13]:
df

Unnamed: 0,stoccalipsies,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,brigatorts,,,,,,
1,ambooked,,,,,,
2,squitisfer,,,,,,
3,eninated,,,,,,
4,bodiationsint,,,,,,
...,...,...,...,...,...,...,...
19994,shoemation,,,,,,
19995,hallerends,,,,,,
19996,freersisess,,,,,,
19997,stificatues,,,,,,


In [14]:
nonsense = list(df['stoccalipsies'])

In [15]:
nonsense.append('stoccalipsies')

In [17]:
nonsense_embeddings = {}

In [18]:
for word in nonsense:
    nonsense_embeddings[word] = return_embedding(word, tokenizer)

In [19]:
with open('20kpseudo_bert_embeddings.pickle', 'wb') as handle:
    pickle.dump(nonsense_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
jabber = []

In [15]:
for line in open("../jabberwocky.txt"):
    if line not in jabber:
        jabber.append(line.replace("\n", ""))

In [28]:
new_words_2019 = []

In [29]:
for line in open("New words 2019.txt"):
    if line not in new_words_2019:
        new_words_2019.append(line.replace("\n", "").lower())

In [30]:
new_words_2020 = []

In [31]:
for line in open("New words 2020.txt"):
    if line not in new_words_2020:
        new_words_2020.append(line.replace("\n", "").lower())

In [32]:
slang = []

In [33]:
for line in open("Yeet.txt"):
    if line not in slang:
        slang.append(line.replace("\n", "").lower())

In [34]:
new_2020 = {}
new_2019 = {}
slang_embeddings = {}

In [35]:
for word in new_words_2019:
    new_2019[word] = return_embedding(word, tokenizer)

In [36]:
for word in new_words_2020:
    new_2020[word] = return_embedding(word, tokenizer)

In [37]:
for word in slang:
    slang_embeddings[word] = return_embedding(word, tokenizer)

In [38]:
with open('new_2019.pickle', 'wb') as handle:
    pickle.dump(new_2019, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
with open('new_2020.pickle', 'wb') as handle:
    pickle.dump(new_2020, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
with open('slang.pickle', 'wb') as handle:
    pickle.dump(slang_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
slang_embeddings

{'yeet': array([ 2.25380883e-01, -9.09769982e-02, -1.63654283e-01, -2.33951494e-01,
         3.15579101e-02,  3.50468457e-02,  1.01842947e-01, -5.01115620e-01,
         9.88269150e-02, -2.19805017e-01, -1.98769376e-01,  2.19392806e-01,
         2.54866928e-01, -1.03978135e-01,  9.23262015e-02,  2.47636646e-01,
         2.31105015e-01,  7.00955838e-03,  6.57568276e-01,  7.54153490e-01,
        -1.45498037e+00, -3.76410899e-03, -6.29198179e-02, -3.16538036e-01,
         2.64918983e-01,  1.54704463e+00,  1.62261203e-01, -3.04008245e-01,
        -6.55122936e-01,  1.16167262e-01, -2.63047032e-02,  4.59515333e-01,
         3.63238752e-02,  4.54097390e-01, -5.06146789e-01, -6.19484603e-01,
         2.27934346e-01,  3.56175870e-01,  1.01975903e-01,  2.90051818e-01,
         3.94336790e-01, -5.39758086e-01, -1.14975013e-01,  1.47008479e-01,
         1.39692232e-01,  1.20452836e-01, -1.88440368e-01, -8.96726847e-01,
         2.00414628e-01,  2.15440214e-01, -8.91922265e-02,  1.75825611e-01,
    

In [42]:
new_2019

{'abugida': array([ 1.38617501e-01,  4.27729309e-01,  4.67256963e-01, -2.29469389e-01,
        -9.01991278e-02, -1.05375201e-02, -1.47387058e-01, -2.09779799e-01,
        -1.12019397e-01, -1.12593979e-01,  1.20408863e-01,  5.73284745e-01,
         2.93474853e-01,  3.41120690e-01, -1.31905973e-01,  4.13572729e-01,
         1.96328312e-01, -2.08523586e-01,  1.00594378e+00,  4.16008919e-01,
        -1.39620543e+00, -5.58263175e-02,  1.32284686e-02,  8.50670114e-02,
         5.01378894e-01,  1.38704121e+00,  1.68905288e-01, -3.28564286e-01,
        -4.21478927e-01,  8.65309164e-02, -1.51542112e-01,  3.39602202e-01,
         1.50219113e-01,  2.01043114e-01, -3.50623459e-01, -7.08465636e-01,
         3.53247941e-01,  2.86275566e-01,  4.15639579e-01,  5.14913499e-01,
         2.58073926e-01, -1.89694211e-01, -1.52807422e-02,  1.52884692e-01,
         2.21820399e-02,  3.46381307e-01, -1.50257573e-01, -5.54820240e-01,
         9.13410559e-02,  5.58749199e-01, -2.74480492e-01, -7.88571388e-02,
 

In [None]:
new_2020

In [24]:
import spacy

In [26]:
nlp = spacy.load("en_core_web_sm")

In [32]:
words = []
for text in jabber:
    doc = nlp(text)
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_punct and not w.like_num:
            # we add the lematized version of the word
            words.append(w.lemma_.strip().lower())



In [33]:
words

['twas',
 'brillig',
 'and',
 'the',
 'slithy',
 'tove',
 '',
 'do',
 'gyre',
 'and',
 'gimble',
 'in',
 'the',
 'wabe',
 'all',
 'mimsy',
 'be',
 'the',
 'borogove',
 '',
 'and',
 'the',
 'mome',
 'rath',
 'outgrabe',
 'beware',
 'the',
 'jabberwock',
 'my',
 'son',
 '',
 'the',
 'jaw',
 'that',
 'bite',
 'the',
 'claws',
 'that',
 'catch',
 'beware',
 'the',
 'jubjub',
 'bird',
 'and',
 'shun',
 '',
 'the',
 'frumious',
 'bandersnatch',
 'he',
 'take',
 'his',
 'vorpal',
 'sword',
 'in',
 'hand',
 '',
 'long',
 'time',
 'the',
 'manxome',
 'foe',
 'he',
 'seek',
 'so',
 'rest',
 'he',
 'by',
 'the',
 'tumtum',
 'tree',
 '',
 'and',
 'stand',
 'awhile',
 'in',
 'thought',
 'and',
 'as',
 'in',
 'uffish',
 'think',
 'he',
 'stand',
 '',
 'the',
 'jabberwock',
 'with',
 'eye',
 'of',
 'flame',
 'come',
 'whiffle',
 'through',
 'the',
 'tulgey',
 'wood',
 '',
 'and',
 'burble',
 'as',
 'it',
 'come',
 'and',
 'through',
 'and',
 'through',
 '',
 'the',
 'vorpal',
 'blade',
 'go',
 'snick

In [34]:
jabber_embeddings = {}

In [35]:
for word in words:
    jabber_embeddings[word] = return_embedding(word, tokenizer)

In [37]:
with open('jabber_bert_embeddings.pickle', 'wb') as handle:
    pickle.dump(jabber_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Word Lists

We'll now get our 40,000 most commonly used words and 40,000 garble words and create embeddings for them.

In [None]:
garble = []

In [None]:
for line in open("../../../data/random_words_all.txt"):
    if line not in garble:
        garble.append(line.replace("\n", ""))

In [None]:
garble[0:8]

In [None]:
english_words = []

In [None]:
import json

In [None]:
vectors_path = "/Users/bhargavvader/open_source/comp-syn/vectors_data/vectors.json"

In [None]:
with open(vectors_path) as f:
    vectors = json.load(f)

In [None]:
vectors[0]['query']

In [None]:
for vec in vectors:
    english_words.append(vec['query'])

In [None]:
english_words

In [None]:
garble_embeddings = {}

In [None]:
for garb in garble:
    garble_embeddings[garb] = return_embedding(garb, tokenizer)


In [None]:
with open('garble_embeddings.pickle', 'wb') as handle:
    pickle.dump(garble_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
word_bert_embeddings = {}

In [None]:
for word in english_words:
    word_bert_embeddings[word] = return_embedding(word, tokenizer)

In [None]:
with open('word_bert_embeddings.pickle', 'wb') as handle:
    pickle.dump(word_bert_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)