# 08 - Glovely Embedded

Since our own word2vec embedding didn't work out so nicely, we'll have a look at Glove!

In [5]:
import tools.processing as pre
import numpy as np

path = "data/embeddings/glove.840B.300d.txt"

def get_glove(path_to_glove,word2index_map):
    embedding_weights = {}
    count_all_words = 0
    
    with open(path_to_glove,'r') as f:
        for line in f:
            vals = line.split(' ')
            word = str(vals[0])
            if word in word2index_map:                
                count_all_words += 1                                 
                coefs = np.asarray(vals[1:],dtype='float32')
                coefs /= np.linalg.norm(coefs)
                embedding_weights[word] = coefs
            if count_all_words== len(word2index_map) -1:
                break
    return embedding_weights

In [2]:
text = pre.get_text("data/ref_text3.txt")
vocab = pre.Vocabulary(text)

words_alpha = pre.get_text("data/words_alpha.txt")
alpha_vocab = pre.Vocabulary(words_alpha)


## Problem: repeated character toooollll

In [3]:
from nltk.corpus import wordnet
import re

def remove_repeated_characters(tokens):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word) or old_word in alpha_vocab._dict:
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
            
    correct_tokens = [replace(word) for word in tokens]
    return correct_tokens

repeated_removed = remove_repeated_characters(text.split(" "))
repeated_removed = " ".join( repeated_removed )

In [4]:
print(remove_repeated_characters( ["tooooolll"] ))
print(remove_repeated_characters( ["schhhhooooll"] ))

['tool']
['schol']


**Unfortunately, we also have bad corrections: schol should have been school instead**

We can try to recorrect it using our initial text dictionary

In [7]:
import tools.spell_correction as spell

recorrected = spell.recorrect_text(repeated_removed, text)

In [None]:
# pre.write_text("data/removed_repeated.txt", recorrected)

### Use Glove Embedding to look up our vocabulary and translate it into the feature space

In [5]:
text = pre.get_text("data/removed_repeated.txt")
final_vocabulary = pre.Vocabulary(text)

embedding_weights = get_glove(path, final_vocabulary._dict)

In [6]:
words_recognized = len((embedding_weights.keys())) 
words_total      = len(final_vocabulary._dict.keys())
print( f"recognized words: {words_recognized} / {words_total} ")

recognized words: 8011 / 8518 


In [7]:
my_vocab       = set(final_vocabulary._dict.keys())
accepted_vocab = set(embedding_weights.keys())

unk = list( my_vocab - accepted_vocab )
unk.sort()

print( unk [:5])
print( f"#unknowns: {len(unk)}")

['\n', '2fifth', 'adolecsent', 'adrendaline', 'afeni']
#unknowns: 507


### Drop the "\n" since it is irrelevant. In our final input we will replace all linebreaks with ";"

In [8]:
unk = unk[1:]

### We now run our spell correction on the words that have not been recognized

In [14]:
import tools.spell_correction as spell

recorrected = spell.recorrect_text(" ".join(unk), words_alpha).split(" ")

recorrected_weights = get_glove(path, recorrected)

recorrected_vocab = set(recorrected)
reaccepted_vocab  = set(recorrected_weights.keys())

acceptance_list = [ (word in reaccepted_vocab) for word in recorrected ]

residual_unk = list( recorrected_vocab - reaccepted_vocab )
residual_unk.sort()

print( residual_unk [:5])
print( f"#unknowns: {len(residual_unk)} / {len(recorrected)}")

['afenil', 'againagain', 'agnathic', 'agrah', 'ahahahohoh']
#unknowns: 300 / 506


### Out of the previous 506 unknowns, we are now left with 300! We managed to correct 206 words, but let's see if the corrections make sense

In [32]:
import texttable as tt
tab = tt.Texttable()

headings = ['Unknown','Corrected', 'Accepted?']
tab.header(headings)

unit_costs = [40.0, 50.0, 12.0]

comparison = list(zip (unk, recorrected, acceptance_list))

for row in comparison:
    tab.add_row(row)

s = tab.draw()
print (s)

+-----------------------+-----------------------+-----------+
|        Unknown        |       Corrected       | Accepted? |
| 2fifth                | fifth                 | 1         |
+-----------------------+-----------------------+-----------+
| adolecsent            | adolescent            | 1         |
+-----------------------+-----------------------+-----------+
| adrendaline           | adrenaline            | 1         |
+-----------------------+-----------------------+-----------+
| afeni                 | afenil                | 0         |
+-----------------------+-----------------------+-----------+
| againagain            | againagain            | 0         |
+-----------------------+-----------------------+-----------+
| ageiathic             | agnathic              | 0         |
+-----------------------+-----------------------+-----------+
| ahahahohoh            | ahahahohoh            | 0         |
+-----------------------+-----------------------+-----------+
| aighta

In [44]:
valid_embedding_words = {}
correction_map = {}

unknown_default = np.array(300*[0])

for unknown, corrected, accepted in comparison:
    if accepted:
        valid_embedding_words[corrected] = recorrected_weights[corrected]
        correction_map [unknown] = corrected
    # else:
    #     valid_embedding_words[unknown] = unknown_default 

### Now we replace the wrong words in our original text with our corrections, so that more words can be embedded by glove

In [40]:
corrected_text = text

for key, value in correction_map.items():
    corrected_text = corrected_text.replace(key, value)

'scantron ; aw ; surely ; shining ; something ; 88 ; trined ; right ; electric ; coasts ; coast ; tog'

In [42]:
pre.write_text("data/final_2_pac_rakim_kid_cudi.txt", corrected_text)

### From here on, all words that are not contained inside the embedding are mapped to some default value.
We could use the mean for this or some random value or we could completely omit these values

In [6]:
import tools.processing as pre

final_text = pre.get_text("data/final_2_pac_rakim_kid_cudi.txt")
final_vocabulary = pre.Vocabulary(final_text)

embedding_weights = get_glove(path, final_vocabulary._dict)

In [7]:
print( f"recognized words: {len(embedding_weights.keys())} / {len(final_vocabulary._dict.keys())}")

recognized words: 8167 / 8471


In [8]:
weights = np.array(list(embedding_weights.values()))

In [9]:
# We have 8167 word vectors of length 300
weights.shape

(8167, 300)

**The mean word vector is:**

In [10]:
mean = np.mean(weights, axis=0)
mean

array([-2.46031582e-03, -3.15875886e-03, -2.89881392e-03, -7.44687999e-03,
        6.34450279e-03, -2.94484664e-04, -8.76065344e-04, -2.65731796e-04,
       -3.80283082e-03,  1.90850943e-01,  2.54441937e-03, -5.57928160e-03,
       -1.27946585e-03, -5.77619439e-03, -3.01242387e-03, -2.86886032e-04,
        1.79459178e-03,  5.95713630e-02,  3.95095791e-04,  1.53345382e-03,
        1.84276327e-03, -1.13724580e-03,  1.54758431e-03, -2.43751821e-03,
       -7.36646960e-03, -9.04039852e-03, -1.42165273e-03, -2.53893225e-03,
        2.21080077e-03, -9.79978684e-03, -4.56522591e-03,  2.03011185e-03,
       -1.47635275e-02,  4.61883936e-03,  1.91298767e-03, -1.76920241e-03,
        4.73718578e-03,  9.33369808e-03, -3.47765721e-03,  8.02848767e-03,
       -3.70021444e-03, -3.09497095e-03, -2.85498798e-03, -2.70495540e-03,
        4.48540412e-03,  2.27067946e-03,  1.72381569e-03, -3.25393421e-03,
       -1.36920984e-03,  3.38628376e-03, -1.89377461e-03,  8.39639083e-03,
       -5.52345673e-03, -

**Let's check what word this would be using the cosine distance**

In [13]:
def get_closest_words(embedded_word, embedding_dict, limit=10):
    
    embedding_dict.items()
    
    weights = np.array(list(embedding_dict.values()))
    
    cosine_dists = np.dot( weights, embedded_word)
    ff = np.argsort(cosine_dists)[::-1][:limit]
    
    words = list(embedding_dict.keys())
    index2word_map = dict( (key, value) for (key, value) in enumerate(words) )
    
    words = []
    distance = []
    for f in ff:
        words.append(index2word_map[f])
        distance.append(cosine_dists[f])
    
    return list(zip(words, distance))

get_closest_words(mean, embedding_weights)

[('even', 0.25623482),
 ('nothing', 0.25214267),
 ('somehow', 0.24717022),
 ('imagine', 0.24696591),
 ('whatever', 0.24661142),
 ('they', 0.24656454),
 ('hardly', 0.24625331),
 ('rather', 0.24608666),
 ('so', 0.2456822),
 ('enough', 0.2448145)]

In [12]:
get_closest_words(embedding_weights[';'], embedding_weights)

[(';', 1.0000001),
 ('and', 0.50414),
 ('as', 0.4537127),
 ('which', 0.41770586),
 ('while', 0.41335005),
 ('shall', 0.4117339),
 ('or', 0.41107875),
 ('others', 0.40873516),
 ('except', 0.376888),
 ('but', 0.3727518)]

# We will export these embedding related functions into the package tools.embedding

## Let's put these embeddings into action and test them for our rap text