# Pretrained embeddings for our words

In [33]:
import tools.processing as pre
import numpy as np

# Location of the pretrained GloVe vectors file; passed to get_glove() as path_to_glove.
path = "data/embeddings/glove.840B.300d.txt"

def get_glove(path_to_glove, word2index_map):
    """Load unit-normalised embedding vectors for a chosen set of words.

    The file is read line by line; each line is split on single spaces, the
    first field is taken as the word and the remaining fields are parsed as
    float32 vector components.

    Parameters
    ----------
    path_to_glove : str
        Path to the embeddings text file.
    word2index_map : collection of str
        The words to look up. Only membership matters, so a dict keyed by
        word, a list (duplicates allowed) or a set all work.

    Returns
    -------
    dict
        Maps each requested word that occurs in the file to its vector,
        scaled to unit L2 norm. An all-zero vector is returned unchanged.
    """
    # A set gives O(1) membership tests and removes duplicates, so the
    # "everything found" check below is reliable even when a list is passed.
    wanted = set(word2index_map)
    embedding_weights = {}
    with open(path_to_glove, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop reading as soon as every requested word has a vector.
            if len(embedding_weights) >= len(wanted):
                break
            vals = line.rstrip('\n').split(' ')
            if len(vals) < 2:
                # No vector components on this line (e.g. a blank line).
                continue
            word = vals[0]
            if word not in wanted:
                continue
            coefs = np.asarray(vals[1:], dtype='float32')
            norm = np.linalg.norm(coefs)
            if norm > 0:
                # Skip the division for a zero vector; it would produce NaNs.
                coefs /= norm
            embedding_weights[word] = coefs
    return embedding_weights

In [34]:
# Load the reference text and build a vocabulary object from it.
text = pre.get_text("data/ref_text3.txt")
vocab = pre.Vocabulary(text)

# Same for data/words_alpha.txt; its vocabulary serves below as the set of known words.
# NOTE(review): presumably a plain list of English words — confirm.
words_alpha = pre.get_text("data/words_alpha.txt")
alpha_vocab = pre.Vocabulary(words_alpha)


In [35]:
# Vocabulary keys that do not appear in the words_alpha vocabulary.
unk = [key for key in vocab._dict.keys() if key not in alpha_vocab._dict]

In [36]:
# Display the vocabulary keys that were not found in alpha_vocab.
unk

['threesixty',
 'skanless',
 'jackmack',
 'varmits',
 'travie',
 'matress',
 'hussien',
 'lucious',
 'wowie',
 'evers',
 'hafta',
 'sekon',
 'anyday',
 'yoasiatics',
 'gameplan',
 'ceelo',
 'furier',
 'vette',
 'quizzin',
 'huxtables',
 'edamame',
 'trippy',
 'ohhohohohwhoa',
 'halfblack',
 'raaaaaaager',
 'heheheyah',
 'muhfuckers',
 'streetz',
 'booya',
 'darkskinned',
 'lizzane',
 'disrepect',
 'bonebonebonebone',
 'togination',
 'epps',
 'crusing',
 'raprelated',
 'dunn',
 'problemo',
 'wyndanch',
 'mackers',
 'threefourths',
 'mudering',
 'eightynine',
 'ahhhh',
 'posturepedic',
 'heffers',
 'wieghts',
 'hennessy',
 'timebomb',
 'christenin',
 'arguements',
 'lexxxus',
 'gomars',
 'peformin',
 'yuppers',
 'busta',
 'majorly',
 'steelo',
 'reknowned',
 'bumba',
 'macaveli',
 'eyez',
 'pinkle',
 'preoccuppied',
 'disrespecting',
 'ohhhhhh',
 'exray',
 'tingaling',
 'glocks',
 'puba',
 'montell',
 'splifted',
 'lowlows',
 'hawkin',
 'nefretiti',
 'amibitions',
 'comprimising',
 'uhye

## Problem: elongated words with repeated characters (e.g. "toooollll")

In [7]:
from nltk.corpus import wordnet
import re

def remove_repeated_characters(tokens):
    """Collapse elongated tokens (e.g. "tooooolll") towards a known word.

    For each token, one doubled character is removed at a time until the
    candidate is either recognised — WordNet has a synset for it, or it is a
    key of the module-level ``alpha_vocab._dict`` — or no doubled character
    is left to remove.

    Parameters
    ----------
    tokens : iterable of str
        The tokens to normalise.

    Returns
    -------
    list of str
        One result per input token, in the same order.
    """
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    # Per-call memo: a corpus repeats the same tokens many times, and each
    # WordNet lookup is comparatively expensive.
    resolved = {}

    def is_known(word):
        return bool(wordnet.synsets(word)) or word in alpha_vocab._dict

    def replace(old_word):
        if old_word in resolved:
            return resolved[old_word]
        candidate = old_word
        # Iterative rather than recursive so that very long elongated tokens
        # cannot exhaust the interpreter's recursion limit.
        while not is_known(candidate):
            shorter = repeat_pattern.sub(match_substitution, candidate)
            if shorter == candidate:
                # Nothing left to collapse; keep the unrecognised form.
                break
            candidate = shorter
        resolved[old_word] = candidate
        return candidate

    return [replace(word) for word in tokens]

# Collapse elongated tokens across the whole text, then join the tokens back
# into a single space-separated string.
repeated_removed = " ".join(remove_repeated_characters(text.split(" ")))

In [8]:
# Quick check on a single elongated token.
remove_repeated_characters( ["tooooolll"] )

['tool']

In [71]:
# Save the text with repeated characters collapsed to data/removed_repeated.txt.
pre.write_text("data/removed_repeated.txt", repeated_removed)

In [38]:
# Rebuild the vocabulary from the reference text and look up a GloVe vector
# for each of its words.
# NOTE(review): this reads data/ref_text3.txt again rather than the
# data/removed_repeated.txt file written earlier — confirm that is intended.
text = pre.get_text("data/ref_text3.txt")
final_vocabulary = pre.Vocabulary(text)

embedding_weights = get_glove(path, final_vocabulary._dict)

In [39]:
# Report how many vocabulary words received a GloVe vector.
print("number of recognized words: " + str(len(embedding_weights)))

number of recognized words: 8088


In [40]:
# Total vocabulary size, for comparison with the number of recognized words.
len(final_vocabulary._dict.keys())

8668

In [41]:
# Vocabulary keys for which no GloVe vector was loaded.
unk = [key for key in final_vocabulary._dict.keys() if key not in embedding_weights]

In [42]:
# Display the vocabulary keys that have no entry in embedding_weights.
unk

['',
 'skanless',
 'jackmack',
 'sekon',
 'yoasiatics',
 'furier',
 'quizzin',
 'huxtables',
 'ohhohohohwhoa',
 'halfblack',
 'raaaaaaager',
 'heheheyah',
 'muhfuckers',
 'lizzane',
 'bonebonebonebone',
 'togination',
 'raprelated',
 'wyndanch',
 'mackers',
 'threefourths',
 'eightynine',
 'christenin',
 'lexxxus',
 'gomars',
 'peformin',
 'macaveli',
 'pinkle',
 'preoccuppied',
 'tingaling',
 'splifted',
 'lowlows',
 'nefretiti',
 'amibitions',
 'uhyeah',
 'bassuntill',
 'twentyfourseven',
 'overthugging',
 'fearified',
 'gogetter',
 'blurried',
 "young'uns",
 'twitterin',
 'strugala',
 'assasinator',
 'bearthem',
 'llello',
 'papermates',
 'plainpat',
 'mophreme',
 'oahhhohh',
 'henessee',
 'hyperchrondriac',
 'ooohooooh',
 'oahhhahh',
 'wowwowwow',
 'sleeezy',
 'stresed',
 'mastercrash',
 'notthinki',
 'giseles',
 'oooooooooohhhh',
 'heirem',
 'rainbut',
 'paragraphologist',
 'toastses',
 'ahahahohoh',
 'rosecranz',
 'melodiesunmakable',
 'carcuses',
 "muh'fucking",
 'patternunecsca

In [43]:
import tools.spell_correction as spell

# Run the project spell corrector over the unrecognized words, passed as one
# space-separated string.
# NOTE(review): words_alpha is presumably the reference word list it corrects
# against — confirm in tools.spell_correction.
recorrected = spell.recorrect_text(" ".join(unk), words_alpha)

In [44]:
# Print each unrecognized word next to its spell-corrected form.
comparison = zip(unk, recorrected.split(" "))
for before, after in comparison:
    print(before + " \t " + after)

 	 o
skanless 	 skinless
jackmack 	 hackmack
sekon 	 seton
yoasiatics 	 yoasiatics
furier 	 furies
quizzin 	 quizzing
huxtables 	 hurtable
ohhohohohwhoa 	 ohhohohohwhoa
halfblack 	 halfback
raaaaaaager 	 raaaaaaager
heheheyah 	 heheheyah
muhfuckers 	 muhfuckers
lizzane 	 lizzie
bonebonebonebone 	 bonebonebonebone
togination 	 domination
raprelated 	 raprelated
wyndanch 	 wyndanch
mackers 	 backers
threefourths 	 threefourths
eightynine 	 eightynine
christenin 	 christening
lexxxus 	 lexxxus
gomars 	 gomart
peformin 	 formin
macaveli 	 maravedi
pinkle 	 pickle
preoccuppied 	 preoccupied
tingaling 	 tingling
splifted 	 splinted
lowlows 	 wowwows
nefretiti 	 nefretiti
amibitions 	 ambitions
uhyeah 	 whydah
bassuntill 	 bassuntill
twentyfourseven 	 twentyfourseven
overthugging 	 overthugging
fearified 	 metrified
gogetter 	 forgetter
blurried 	 blurred
young'uns 	 youngun
twitterin 	 twittering
strugala 	 strigal
assasinator 	 assassinator
bearthem 	 berther
llello 	 hello
papermates 	 pap

In [47]:
# Look up GloVe vectors for the spell-corrected words. get_glove only needs a
# collection of words, so the list of corrected tokens is passed directly.
corrected_weights = get_glove(path, recorrected.split(" "))

In [50]:
# Display the corrected words for which a vector was loaded.
(corrected_weights.keys())

dict_keys(['opined', 'sharped', 'reminisce', 'furies', 'asses', 'howe', 'thugging', 'spectacle', 'sleezy', 'copasetic', 'youngun', 'magnetize', 'hesitation', 'hawaiian', 'metrical', 'cleating', 'balmy', 'toppling', 'forty', 'assassinator', 'dazy', 'adrenaline', 'whisked', 'swills', 'nannies', 'yoyo', 'ninety', 'toaster', 'rot', 'midnight', 'beneficent', 'downset', 'negros', 'suckers', 'quizzing', 'preoccupied', 'formin', 'heartbreaks', 'penitentiaries', 'morpheme', 'subfigure', 'doodler', 'napoleon', 'ashot', 'cogitations', 'dineros', 'alala', 'eel', 'five', 'conveying', 'course', 'pachanga', 'imitators', 'supreme', 'realest', 'firster', 'splinted', 'brainstorming', 'backers', 'therese', 'trigger', 'nuchal', 'bewitch', 'leafed', 'sitch', 'biblical', 'explicit', 'elem', 'mammon', 'lyrics', 'disci', 'aliquot', 'offtrack', 'christening', 'lotions', 'coonhounds', 'tingling', ';', 'multimillionaire', 'ranges', 'jeopardizing', 'rickshaw', 'akia', 'cwo', 'udders', 'blurred', 'stresses', 'thre

In [51]:
# Number of corrected words that have a GloVe vector.
len(corrected_weights)

235

In [52]:
# Spell-corrected tokens that still have no GloVe vector.
unk = [key for key in recorrected.split(" ") if key not in corrected_weights]

In [56]:
# Collapse repeated characters in the remaining unknown tokens and print each
# token next to its collapsed form. repeated_removed is a list here.
repeated_removed = remove_repeated_characters(unk)
comparison = zip(unk, repeated_removed)
for before, after in comparison:
    print(before + " \t " + after)

hackmack 	 hackmack
yoasiatics 	 yoasiatics
hurtable 	 hurtable
ohhohohohwhoa 	 ohohohohwhoa
raaaaaaager 	 rager
heheheyah 	 heheheyah
muhfuckers 	 muhfuckers
bonebonebonebone 	 bonebonebonebone
raprelated 	 raprelated
wyndanch 	 wyndanch
threefourths 	 threfourths
eightynine 	 eightynine
lexxxus 	 lexus
gomart 	 gomart
maravedi 	 maravedi
wowwows 	 wowwows
nefretiti 	 nefretiti
whydah 	 whydah
bassuntill 	 basuntil
twentyfourseven 	 twentyfourseven
overthugging 	 overthuging
metrified 	 metrified
strigal 	 strigal
berther 	 berther
oahhhohh 	 oahoh
hyperchrondriac 	 hyperchrondriac
ooohooooh 	 ohoh
oahhhahh 	 oahah
wowwowwow 	 wowowow
mastercrash 	 mastercrash
notthinki 	 nothinki
cisele 	 cisele
oooooooooohhhh 	 ooh
paragraphologist 	 paragraphologist
ahahahohoh 	 ahahahohoh
rosecranz 	 rosecranz
melodiesunmakable 	 melodiesunmakable
arcuses 	 arcuses
muh'fucking 	 muh'fucking
patternunecscapable 	 paternunecscapable
drammach 	 drammach
dripdrop 	 dripdrop
hoochercoocher 	 hochercoch

In [57]:
# Second spell-correction pass, this time over the tokens with repeated
# characters collapsed.
recorrected = spell.recorrect_text(" ".join(repeated_removed), words_alpha)

In [59]:
# Look up GloVe vectors for the re-corrected tokens and count how many were found.
corrected_weights = get_glove(path, recorrected.split(" "))
len(corrected_weights)

40

In [63]:
# Number of space-separated tokens that went into the lookup, for comparison
# with the count of vectors found.
len(recorrected.split(" "))

340