In [1]:
import tools.processing as pre
# Read the pre-cleaned lyrics file through the project helper.
# NOTE(review): get_text presumably returns the whole file as one string
# (later cells call .replace on the result) - confirm in tools.processing.
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac.txt")

# Lower-case contraction / slang spelling -> expanded form.
# Consumed by expand_contractions(), which joins the keys into one regex
# alternation, so insertion order is significant; it is kept unchanged here.
# Each key appears exactly once (the original literal repeated the "i'd" ...
# "i've" rows and "let's" with identical values).
CONTRACTION_MAP = {
    # --- slang and dropped-letter spellings found in the lyrics ---
    "'em": "them", "y'know": "you know", "'hem": "them",
    "c'mon": "come on", "'caine": "cocaine",
    # NOTE(review): "mo'" usually stands for "more"; "my" looks suspicious - confirm.
    "mo'": "my", "cha'": "ya", "'cha": "ya",
    "whaddya": "what do ya", "nuttin": "nothing",
    "thru": "through",
    "shoulda": "should have",
    "lets": "let us", "let's": "let us",
    "'til": "until", "outta": "out of", "i'ma": "i am a", "tho'": "though", "til'": "until",
    "'im": "him", "'bout": "about", "'n": "and", "tha'": "the", "tu'": "to",
    # NOTE(review): "daheck?" looks like a placeholder for an unknown token - confirm.
    "uz'": "daheck?", "yo'": "your", "witcha": "with ya",
    "'tack": "attack", "'head": "ahead", "lil'": "little", "getcha": "get ya",
    "wit'chu": "with you", "get'cha": "get ya", "sweatcha": "sweat ya", "e'ry": "every",
    "what'cha": "what ya", "hitcha": "hit ya", "gov'na": "governor", "'fore": "before",
    "mill'": "million",
    # --- standard English contractions ---
    "ain't": "is not", "aren't": "are not", "can't": "can not",
    "can't've": "can not have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    # Fixed: the expansion previously read "he he will have".
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why'd": "why did",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
    "gon'": "going to",
    "an'": "and",
}

# Step 1: Cleaning the data

We need to correct typos such as "imdamcwitdenastymouf", which should read "i am the mc with the nasty mouth".

In [3]:
import re


# Turn every line break into a " ; " separator so line ends survive as a token,
# then squeeze each run of spaces down to a single space.
remove_lbreak = " ; ".join(text.split("\n"))
remove_space = re.compile(" +").sub(" ", remove_lbreak)

# Literal substring fixes, applied one after another in this exact order.
# The surrounding spaces are part of each pattern, so only free-standing
# tokens (or suffixes followed by a space) are touched.
for _old, _new in (
    ("'ll ", " will "),
    ("'ve ", " have "),
    (" u ", " you "),
    (" youre ", " you are "),
    (" n ", " and "),
    (" de ", " the "),
    (" gon ", " going to "),
):
    remove_space = remove_space.replace(_old, _new)

def expand_contractions(sentence, contraction_mapping):
    """Replace every contraction occurring in ``sentence`` with its expansion.

    Parameters
    ----------
    sentence : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction (e.g. ``"don't"``) to its expansion
        (e.g. ``"do not"``).

    Returns
    -------
    str
        ``sentence`` with each matched contraction substituted. Matching is
        case-insensitive; the expansion is inserted exactly as stored in the
        mapping (the original casing of the match is not restored).
    """
    # Empty keys would match at every position, so they are ignored; with no
    # usable key there is nothing to substitute.
    keys = [key for key in contraction_mapping if key]
    if not keys:
        return sentence

    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first or "can't" would shadow "can't've". The sort is
    # stable, so keys of equal length keep their mapping order.
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # The pattern ignores case, so the lookup has to as well: previously a
    # match such as "Don't" found no entry in a lower-case mapping.
    lowered = {key.lower(): value for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        if match in contraction_mapping:
            return contraction_mapping[match]
        # Fall back to the matched text itself rather than losing it.
        return lowered.get(match.lower(), match)

    return contractions_pattern.sub(expand_match, sentence)


# Expand every contraction listed in CONTRACTION_MAP first ...
expanded = expand_contractions(remove_space, CONTRACTION_MAP)

# ... then apply the remaining literal fixes, strictly in this order.
# ("outta" needs no entry here: it is a key of CONTRACTION_MAP.)
_suffix_and_slang_fixes = [
    ("'s ", " 's "),       # split a trailing 's off into its own token
    ("s'", " 's "),
    ("in'", "ing"),
    ("'ll", " will"),
    ("mutha", "mother"),
    ("en'", "ing"),
    ("brotha", "brother"),
    ("2pac", "tupac"),
]
for _needle, _replacement in _suffix_and_slang_fixes:
    expanded = expanded.replace(_needle, _replacement)

# Show the first 1000 characters as a sanity check.
print(expanded[:1000])

as real as it seems the american dream ; is not nothing but another calculated schemes ; to get us locked up shot up back in chains ; to deny us of the future rob our names ; kept my history of mystery but now i see ; the american dream was not meant for me ; cause lady liberty is a hypocrite she lied to me ; promised me freedom education equality ; never gave me nothing but slavery ; and now look at how dangerous you made me ; calling me a mad man cause i am strong and bold ; with this dump full of knowledge of the lies you told ; promise me emancipation indispute nation ; all you gave my people was our patience ; fathers of our country never cared for me ; they kept my answer shackled up in slavery ; and uncle sam never did a damn thing for me ; except lie about the facts in my history ; so now i am sitting here mad cause i am unemployed ; but the government 's glad cause they enjoyed ; when my people are down so they can screw us around ; time to change the government now panther po

In [18]:
# Build a Vocabulary from the cleaned text and dump its keys, one per line,
# to data/my_dict.txt.
# NOTE(review): Vocabulary._keys is presumably the list of distinct words -
# confirm in tools.processing.
_dict = pre.Vocabulary(expanded)
t = str.join("\n", _dict._keys)
pre.write_text("data/my_dict.txt", t)

# Step 2: Building a dictionary
We write the distinct words found in the cleaned text to the text file *data/my_dict.txt*.

We then build up the dictionary incrementally, reviewing the text chunk by chunk, to make sure we don't accept any typos.

In [9]:
# Interactive review: walk through the cleaned text in chunks of `step_size`
# words, show the words that are not yet in the vocabulary, let the user
# correct individual entries, and merge the accepted words into `_dict`.
text_dict = pre.get_text("data/my_dict.txt")
_dict = pre.Vocabulary(text_dict)

print("The text has been loaded. Build up the dictionary slowly by scanning through the text bit by bit")
startIdx = int( input("Enter index where you want to start") )

print(startIdx)

step_size = 200

all_words = expanded.split(" ")

# Seed `total` with the current vocabulary so the final print also works when
# the user stops on the first chunk or the start index is past the end.
total = sorted(_dict._keys)

while startIdx < len(all_words):
    vocab = all_words[startIdx: startIdx + step_size]
    # Words in this chunk that the vocabulary does not know yet.
    new_words = list(set(vocab) - set(_dict._keys))

    print(f"startIDX: {startIdx}")
    print(new_words)

    # "n" -> correct one word and ask again; "break" -> stop the session;
    # anything else accepts the chunk.
    user = input("accept? y/n").strip()
    while user == "n":
        repl = input("replace word: ").strip()
        sub  = input("substitute: ").strip()
        if repl in new_words:
            new_words[new_words.index(repl)] = sub
        else:
            # list.index would raise ValueError here and end the whole
            # session over a single mistyped word.
            print(f"'{repl}' is not in the list above - nothing replaced")
        print(new_words)
        user = input("accept? y/n").strip()

    if user == "break":
        break

    # Merge the accepted words and rebuild the vocabulary from the full list.
    total = sorted(new_words + _dict._keys)
    _dict = pre.Vocabulary(" ".join(total))

    startIdx += step_size

print(" ".join(total))

The text has been loaded. Build up the dictionary slowly by scanning through the text bit by bit


Enter index where you want to start 0


0
startIDX: 0
['fathers', 'when', 'down', 'about', 'are', 'uncle', 'answer', 'sam', 'sitting', 'screw', 'enjoyed', 'government', 'they', 'country', 'here', 'so', 'shackled', 'did', 'cared', 'glad', 'unemployed', 'lie', 'damn', 'thing', 'facts', 'can', 'patience', "'s", 'except', 'around']


accept? y/n y


startIDX: 200
['take', 'fame', 'rich', 'more', 'sell', 'resides', 'put', 'tried', 'think', 'just', 'let', 'forget', 'coming', 'toe', 'less', 'other', 'fight', 'nightmare', 'know', 'or', 'community', 'klu-klux-klan', 'outs', 'poor', 'besides', 'true', 'skin', 'had', 'free', 'could', 'fair', 'sneak', 'no', 'segregation', 'richer', 'time', 'be', 'unity', 'settle', 'win', 'power', 'statistic', 'go', 'survive', 'rest', 'yell', 'suppressed', 'ignorant', 'than', 'using', 'blacks', 'dead', 'intimidation', 'within', 'then', 'drew', 'bats', 'change', 'on', 'capitalistic', 'refuse', 'set', 'were', 'allowed', 'will', 'straight', 'wanted', 'being', 'suffered', 'keep', 'an', 'attack', 'hoping', 'impatient', 'panther', 'way', 'that', 'drugs', 'out', 'hold', 'mother']


accept? y/n y


startIDX: 400
['remember', 'name', 'laugh', 'arteries', 'seem', 'stole', 'eyes', 'open', 'clawed', 'there', 'immunity', 'civilized', 'some', 'strike', 'breath', 'like', 'has', 'hard', 'panter', 'america', 'we', 'been', 'your', 'passed', 'living', 'realize', 'dying', 'yet', 'charged', 'try', 'said', 'running', 'do', 'fighting', 'claim', 'what', 'death', 'have', 'boy', 'oh', 'asking', 'disease', 'them', 'through', 'stop', 'case']


accept? y/n y


startIDX: 600
['sleepin', 'tired', 'wasnt', 'dj', '2pac', 'corrections', 'happened', 'mic', 'whats', 'long', 'woke', 'grip', 'sick', 'gone', 'bizzy', 'finally', 'its', 'typist', 'feeling', 'brotha', 'wrong', 'send', 'hear']


accept? y/n n
replace word:  sleepin
substitute:  sleeping


['sleeping', 'tired', 'wasnt', 'dj', '2pac', 'corrections', 'happened', 'mic', 'whats', 'long', 'woke', 'grip', 'sick', 'gone', 'bizzy', 'finally', 'its', 'typist', 'feeling', 'brotha', 'wrong', 'send', 'hear']


accept? y/n n
replace word:  wasnt
substitute:  was


['sleeping', 'tired', 'was', 'dj', '2pac', 'corrections', 'happened', 'mic', 'whats', 'long', 'woke', 'grip', 'sick', 'gone', 'bizzy', 'finally', 'its', 'typist', 'feeling', 'brotha', 'wrong', 'send', 'hear']


accept? y/n n
replace word:  2pac
substitute:  tupac


['sleeping', 'tired', 'was', 'dj', 'tupac', 'corrections', 'happened', 'mic', 'whats', 'long', 'woke', 'grip', 'sick', 'gone', 'bizzy', 'finally', 'its', 'typist', 'feeling', 'brotha', 'wrong', 'send', 'hear']


accept? y/n n
replace word:  brotha
substitute:  brother


['sleeping', 'tired', 'was', 'dj', 'tupac', 'corrections', 'happened', 'mic', 'whats', 'long', 'woke', 'grip', 'sick', 'gone', 'bizzy', 'finally', 'its', 'typist', 'feeling', 'brother', 'wrong', 'send', 'hear']


accept? y/n y


startIDX: 800
['smoking', 'might', 'clue', 'tms', 'nimth', 'until', 'mouth', 'rock', 'myself', 'paniced', 'use', 'got', 'every', 'health', 'want', 'street', 'house', 'microphone', 'he', 'im', 'strength', 'ya', 'crew', 'battle', 'without', 'guaranteed', 'find', 'stumped', 'over', 'wurd', 'word', 'whipped', 'phone', 'busy', 'first', 'rockin', 'bad', 'picked', 'dizzy', 'musical', 'called', 'mics', 'cigarettes', 'if', 'hit', '6', 'trace', 'sure', 'unbelievable', 'beat', 'cops', 'comes', 'yo', 'before', 'onetwo', 'rhymes']


accept? y/n n
replace word:  paniced
substitute:  panicked


['smoking', 'might', 'clue', 'tms', 'nimth', 'until', 'mouth', 'rock', 'myself', 'panicked', 'use', 'got', 'every', 'health', 'want', 'street', 'house', 'microphone', 'he', 'im', 'strength', 'ya', 'crew', 'battle', 'without', 'guaranteed', 'find', 'stumped', 'over', 'wurd', 'word', 'whipped', 'phone', 'busy', 'first', 'rockin', 'bad', 'picked', 'dizzy', 'musical', 'called', 'mics', 'cigarettes', 'if', 'hit', '6', 'trace', 'sure', 'unbelievable', 'beat', 'cops', 'comes', 'yo', 'before', 'onetwo', 'rhymes']


accept? y/n n
replace word:  im
substitute:  i


['smoking', 'might', 'clue', 'tms', 'nimth', 'until', 'mouth', 'rock', 'myself', 'panicked', 'use', 'got', 'every', 'health', 'want', 'street', 'house', 'microphone', 'he', 'i', 'strength', 'ya', 'crew', 'battle', 'without', 'guaranteed', 'find', 'stumped', 'over', 'wurd', 'word', 'whipped', 'phone', 'busy', 'first', 'rockin', 'bad', 'picked', 'dizzy', 'musical', 'called', 'mics', 'cigarettes', 'if', 'hit', '6', 'trace', 'sure', 'unbelievable', 'beat', 'cops', 'comes', 'yo', 'before', 'onetwo', 'rhymes']


accept? y/n y


startIDX: 1000
['leather', 'head', 'signaled', 'hop', 'sucka', 'happy', 'would', 'drop', 'should', 'strictly', 'confident', 'forced', 'shivered', 'bust', 'fell', 'jacket', 'walked', 'begin', 'tasted', 'payin', 'pocket', 'grabbed', 'wit', 'loose', 'prove', 'lucky', 'why', 'hyped', 'hopin', 'dumb', 'play', 'dope', 'rhyme', 'streets', 'going', 'boss', 'perpetrator', 'because', 'hand', 'hell', 'ready', 'night', 'came', 'give', 'ha', 'where', 'defeat', 'from', 'lost', 'stage', 'started']


accept? y/n n
replace word:  sucka
substitute:  sucker


['leather', 'head', 'signaled', 'hop', 'sucker', 'happy', 'would', 'drop', 'should', 'strictly', 'confident', 'forced', 'shivered', 'bust', 'fell', 'jacket', 'walked', 'begin', 'tasted', 'payin', 'pocket', 'grabbed', 'wit', 'loose', 'prove', 'lucky', 'why', 'hyped', 'hopin', 'dumb', 'play', 'dope', 'rhyme', 'streets', 'going', 'boss', 'perpetrator', 'because', 'hand', 'hell', 'ready', 'night', 'came', 'give', 'ha', 'where', 'defeat', 'from', 'lost', 'stage', 'started']


accept? y/n n
replace word:  payin
substitute:  paying


['leather', 'head', 'signaled', 'hop', 'sucker', 'happy', 'would', 'drop', 'should', 'strictly', 'confident', 'forced', 'shivered', 'bust', 'fell', 'jacket', 'walked', 'begin', 'tasted', 'paying', 'pocket', 'grabbed', 'wit', 'loose', 'prove', 'lucky', 'why', 'hyped', 'hopin', 'dumb', 'play', 'dope', 'rhyme', 'streets', 'going', 'boss', 'perpetrator', 'because', 'hand', 'hell', 'ready', 'night', 'came', 'give', 'ha', 'where', 'defeat', 'from', 'lost', 'stage', 'started']


accept? y/n n
replace word:  hopin
substitute:  hoping


['leather', 'head', 'signaled', 'hop', 'sucker', 'happy', 'would', 'drop', 'should', 'strictly', 'confident', 'forced', 'shivered', 'bust', 'fell', 'jacket', 'walked', 'begin', 'tasted', 'paying', 'pocket', 'grabbed', 'wit', 'loose', 'prove', 'lucky', 'why', 'hyped', 'hoping', 'dumb', 'play', 'dope', 'rhyme', 'streets', 'going', 'boss', 'perpetrator', 'because', 'hand', 'hell', 'ready', 'night', 'came', 'give', 'ha', 'where', 'defeat', 'from', 'lost', 'stage', 'started']


accept? y/n y


startIDX: 1200
['reason', 'die', 'lives', 'tricks', 'build', 'end', 'must', 'cell', 'friend', 'backstab', 'thinking', 'suckers', 'dropped', 'communities', 'mission', 'mind', 'top', 'trick', 'past', 'soul', 'rocking', 'come', 'fake', 'grab', 'cream', 'pin', 'crop', 'wink', 'his', 'by', 'opposition', 'excel', 'sold', 'hole', 'move', 'sweated', 'brothers', 'only', 'cry', 'gold']


accept? y/n y


startIDX: 1400
['reach', 'proud', 'mountain', 'thought', 'black', 'toast', 'calm', 'folks', 'educated', 'too', 'who', 'source', 'teach', 'loving', 'hearing', 'aiyyo', 'preach', 'stay', 'hated', 'putting', 'scared', 'wondering', 'tossed', 'stopping', 'suffocate', 'caught', 'dilly', 'sweating', 'write', 'ay', 'many', 'boom', 'dropping', 'gets', 'napalm', 'force', 'always', 'bomb', 'constantly', 'nervous', 'dose', 'building', 'listen', 'fact', 'bite', 'buying', 'close', 'simple', 'hype', 'checking', 'flavor', 'ballers']


accept? y/n y


startIDX: 1600
['bass', 'god', 'girls', 'busting', 'truck', 'base', 'adversaries', 'mediocrity', 'speakers', 'neccessary', 'security', 'defeated', 'scary', 'stricly', 'breathe', 'deal', 'vote', 'pop', 'minute', 'means', 'steal', 'drops', 'present', 'jump', 'intro', 'guerilla', 'lonely', 'larger', 'any', 'waiting', 'damaged', 'beats', 'titan', 'makes', 'pumps', 'control', 'run', 'hands', 'kind', 'jealous', 'appealed', 'chance', 'leader', 'pay', 'conceited', 'goal', 'amateurs']


accept? y/n y


startIDX: 1800
['huh', 'kidding', 'fast', 'ray', 'stepping', 'fumble', 'fool', 'together', 'better', 'girl', 'care', 'bragging', 'tumble', 'swear', 'light', 'mine', 'born', 'still', 'weapon', 'well', 'tyson', 'gonna', 'since', 'good', 'playing', 'capable', 'battling', 'anywhere', 'slow', 'place', 'one', 'competition', 'her', 'knew', 'yourself']


accept? y/n y


startIDX: 2000
['troop', 'pant', 'fuck', 'caution', 'battles', 'scarred', 'puff', 'trap', 'deadly', 'cannot', 'veteran', 'holds', 'stopped', 'holding', 'piece', 'vocalist', 'ass', 'most', 'recoup', 'yeast', 'rising', 'anybody', 'rope', 'feel', 'rumble', 'beast', 'inside', 'eat', 'musician', 'lock', 'discouraged', 'others', 'climb', 'fall', 'wrath', 'burnt', 'vicious', 'competitor', 'predator', 'musically', 'forever', 'boldy', 'into', 'toss', 'barred', 'raw', 'success', 'sentry', 'consoloist', 'step', 'proceed', 'salad']


accept? y/n y


startIDX: 2200
['lesson', 'exact', 'silly', 'upset', 'ant', 'stomped', 'body', 'tracks', 'punk', 'game', 'proven', 'getting', 'things', 'choke', 'diss', 'letting', 'drano', 'cordless', 'hawalian', 'wheels', 'tying', 'making', 'worst', 'moving', 'volcano', 'pull', 'tight', 'grooving', 'paid', 'freddy', 'blowing', 'motherfucker']


accept? y/n y


startIDX: 2400
['played', 'chilling', 'loading', 'paragraph', 'burn', 'freaks', 'make', 'shoot', 'baseball', 'habit', 'daddio', 'useless', 'mac', 'tons', 'ooh', 'piranha', 'motherfucking', 'uzi', 'moves', 'wanna', 'funky', 'toothless', 'ear', 'discuss', 'ruthless', 'penetrating', 'while', 'jams', 'destroyed', 'bullet', 'radiation', 'toy', 'harmed', 'face', 'quick', 'lyrics', 'taste', 'armed', 'writing', 'bat', 'stand', 'def', 'nuclear', 'patio', 'homeboy']


accept? y/n n
replace word:  ooh
substitute:  oh


['played', 'chilling', 'loading', 'paragraph', 'burn', 'freaks', 'make', 'shoot', 'baseball', 'habit', 'daddio', 'useless', 'mac', 'tons', 'oh', 'piranha', 'motherfucking', 'uzi', 'moves', 'wanna', 'funky', 'toothless', 'ear', 'discuss', 'ruthless', 'penetrating', 'while', 'jams', 'destroyed', 'bullet', 'radiation', 'toy', 'harmed', 'face', 'quick', 'lyrics', 'taste', 'armed', 'writing', 'bat', 'stand', 'def', 'nuclear', 'patio', 'homeboy']


accept? y/n y


startIDX: 2600
['maxium', 'melody', 'woofers', 'thumping', 'rhymefumble', 'bird', 'overload', 'large', 'dilate', 'voice', 'mysterious', 'heard', 'humming', 'stomping', 'speaker', 'thunder', 'wiz', 'young', 'stretch', 'expand', 'hood', 'charge', 'explode', 'countless', 'needle', 'hiphop', 'ears', 'furious', 'record', 'mister', 'nuance', 'conisseur', 'poetry', 'prodigy', 'phd', 'serious', 'clever', 'stuck', 'after', 'drums']


accept? y/n n
replace word:  maxium
substitute:  maximum


['maximum', 'melody', 'woofers', 'thumping', 'rhymefumble', 'bird', 'overload', 'large', 'dilate', 'voice', 'mysterious', 'heard', 'humming', 'stomping', 'speaker', 'thunder', 'wiz', 'young', 'stretch', 'expand', 'hood', 'charge', 'explode', 'countless', 'needle', 'hiphop', 'ears', 'furious', 'record', 'mister', 'nuance', 'conisseur', 'poetry', 'prodigy', 'phd', 'serious', 'clever', 'stuck', 'after', 'drums']


accept? y/n y


startIDX: 2800
['roc', 'outro', 'check']


accept? y/n y


startIDX: 3000
['car', 'lagging', 'wonder', 'broke', 'backing', 'diz', 'center', 'talking', '2pac', 'team', 'illing', 'limbwalk', 'big', 'sagging', 'posse', 'park', 'west', 'outside', 'till', 'postal', 'boys', 'kick', 'lightweight', 'cruiser', 'understand', 'toughest', 'box', 'bringing', 'say', 'slack', 'e', 'old', 'life', 'winners', 'noise', 'levi', 'cheat', 'twenty', 'choose', 'rolled', 'stepped', 'pad', 'border', 'day', 'hoop', 'side', 'neck', 'ten', 'smacking', 'live', 'handle']


accept? y/n n
replace word:  2pac
substitute:  tupac


['car', 'lagging', 'wonder', 'broke', 'backing', 'diz', 'center', 'talking', 'tupac', 'team', 'illing', 'limbwalk', 'big', 'sagging', 'posse', 'park', 'west', 'outside', 'till', 'postal', 'boys', 'kick', 'lightweight', 'cruiser', 'understand', 'toughest', 'box', 'bringing', 'say', 'slack', 'e', 'old', 'life', 'winners', 'noise', 'levi', 'cheat', 'twenty', 'choose', 'rolled', 'stepped', 'pad', 'border', 'day', 'hoop', 'side', 'neck', 'ten', 'smacking', 'live', 'handle']


accept? y/n n
replace word:  illing
substitute:  killing


['car', 'lagging', 'wonder', 'broke', 'backing', 'diz', 'center', 'talking', 'tupac', 'team', 'killing', 'limbwalk', 'big', 'sagging', 'posse', 'park', 'west', 'outside', 'till', 'postal', 'boys', 'kick', 'lightweight', 'cruiser', 'understand', 'toughest', 'box', 'bringing', 'say', 'slack', 'e', 'old', 'life', 'winners', 'noise', 'levi', 'cheat', 'twenty', 'choose', 'rolled', 'stepped', 'pad', 'border', 'day', 'hoop', 'side', 'neck', 'ten', 'smacking', 'live', 'handle']


accept? y/n y


startIDX: 3200
['aggravating', 'three', 'crazy', 'outta', '2pac', 'shower', 'late', 'alone', 'monday', 'morning', 'shocks', 'bit', 'glue', 'brush', 'school', 'show', 'frustrations', 'salutations', 'little', 'matress', 'bed', 'stinking', 'teeth', 'disturbing', 'ring', 'speaking', 'leave', 'roct', 'michael', 'him', 'ahead', 'half', 'vacation', 'lay', 'tone']


accept? y/n y


startIDX: 3400
['really', 'enough', 'cute', 'rapping', 'hour', 'hits', 'turn', 'two', 'clapping', 'tip', 'clothes', 'introduction', 'limo', 'bodygurad', 'audience', 'jumped', 'air', 'sweatsuit', 'backstage', 'crowd', 'scene', 'colessium', 'everywhere', 'macking', 'moved', 'sweat', 'loud', 'white', 'fans', 'fresh', 'off', 'wait', 'start', 'spare', 'screaming', 'tear', 'trying', 'minutes', 'barely', 'party', 'rolling', 'new', 'finish']


accept? y/n n
replace word:  colessium
substitute:  colesseum


['really', 'enough', 'cute', 'rapping', 'hour', 'hits', 'turn', 'two', 'clapping', 'tip', 'clothes', 'introduction', 'limo', 'bodygurad', 'audience', 'jumped', 'air', 'sweatsuit', 'backstage', 'crowd', 'scene', 'colesseum', 'everywhere', 'macking', 'moved', 'sweat', 'loud', 'white', 'fans', 'fresh', 'off', 'wait', 'start', 'spare', 'screaming', 'tear', 'trying', 'minutes', 'barely', 'party', 'rolling', 'new', 'finish']


accept? y/n n
replace word:  colesseum
substitute:  colosseum


['really', 'enough', 'cute', 'rapping', 'hour', 'hits', 'turn', 'two', 'clapping', 'tip', 'clothes', 'introduction', 'limo', 'bodygurad', 'audience', 'jumped', 'air', 'sweatsuit', 'backstage', 'crowd', 'scene', 'colosseum', 'everywhere', 'macking', 'moved', 'sweat', 'loud', 'white', 'fans', 'fresh', 'off', 'wait', 'start', 'spare', 'screaming', 'tear', 'trying', 'minutes', 'barely', 'party', 'rolling', 'new', 'finish']


accept? y/n n
replace word:  macking
substitute:  smacking


['really', 'enough', 'cute', 'rapping', 'hour', 'hits', 'turn', 'two', 'clapping', 'tip', 'clothes', 'introduction', 'limo', 'bodygurad', 'audience', 'jumped', 'air', 'sweatsuit', 'backstage', 'crowd', 'scene', 'colosseum', 'everywhere', 'smacking', 'moved', 'sweat', 'loud', 'white', 'fans', 'fresh', 'off', 'wait', 'start', 'spare', 'screaming', 'tear', 'trying', 'minutes', 'barely', 'party', 'rolling', 'new', 'finish']


accept? y/n y


startIDX: 3600
['tipsy', 'floor', 'violence', 'cutie', 'signing', 'stumbled', 'sexual', 'hate', 'dance', 'looking', 'disco', 'autographs', 'pen', 'sipping', 'mean', 'totions', 'neighborhood', 'motion', 'age', 'positive', 'seventeen', 'silence', 'cooling', 'sit', 'cisco', 'girlies', 'door', 'whole', 'limousine', 'keeps', 'paper']


accept? y/n y


startIDX: 3800
['teenage', 'emcee', 'liar', 'call', 'favorite', 'redeem', 'error', 'rap', 'spin', 'begins', 'room', 'inflation', 'feels', 'high', 'genius', 'whack', 'seen', 'style', 'starts', 'terror', 'desire', 'soon', 'scare', 'pass', 'value', 'beside', 'forte', 'each', 'emcees', 'illtrip', 'awake', 'right', 'dissing', 'tonight', 'ama', 'station', 'admire', "wit'", 'hoods', 'playfully', 'stomach', 'kiss', 'recreation', 'type', 'ladies', 'hypetip', 'avoid', 'hoe']


accept? y/n n
replace word:  emcee
substitute:  emcee


['teenage', 'emcee', 'liar', 'call', 'favorite', 'redeem', 'error', 'rap', 'spin', 'begins', 'room', 'inflation', 'feels', 'high', 'genius', 'whack', 'seen', 'style', 'starts', 'terror', 'desire', 'soon', 'scare', 'pass', 'value', 'beside', 'forte', 'each', 'emcees', 'illtrip', 'awake', 'right', 'dissing', 'tonight', 'ama', 'station', 'admire', "wit'", 'hoods', 'playfully', 'stomach', 'kiss', 'recreation', 'type', 'ladies', 'hypetip', 'avoid', 'hoe']


accept? y/n y


startIDX: 4000
['much', 'words', 'situation', 'sleepin', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'fallin', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinkin', 'touch', 'trustin', 'offers', 'feelin', 'thoughts', 'posessed', 'undressin', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  sleepin
substitute:  sleeping


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'fallin', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinkin', 'touch', 'trustin', 'offers', 'feelin', 'thoughts', 'posessed', 'undressin', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  fallin
substitute:  falling


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinkin', 'touch', 'trustin', 'offers', 'feelin', 'thoughts', 'posessed', 'undressin', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  thinkin
substitute:  thinking


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinking', 'touch', 'trustin', 'offers', 'feelin', 'thoughts', 'posessed', 'undressin', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  trustin
substitute:  trusting


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinking', 'touch', 'trusting', 'offers', 'feelin', 'thoughts', 'posessed', 'undressin', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  undressin
substitute:  undressing


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinking', 'touch', 'trusting', 'offers', 'feelin', 'thoughts', 'posessed', 'undressing', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  feelin
substitute:  feeling


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinking', 'touch', 'trusting', 'offers', 'feeling', 'thoughts', 'posessed', 'undressing', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinkin', 'kinda', 'best', 'wake', 'gotten']


accept? y/n n
replace word:  sinkin
substitute:  sinking


['much', 'words', 'situation', 'sleeping', 'purity', 'closer', 'guilty', 'second', 'stronger', 'blind', 'contact', 'falling', 'filled', 'men', 'confused', 'does', 'obvious', 'longer', 'leaves', 'stare', 'thinking', 'touch', 'trusting', 'offers', 'feeling', 'thoughts', 'posessed', 'undressing', 'muttered', 'offer', 'heart', 'stutter', 'lot', 'drink', 'shake', 'passion', 'nowin', 'these', 'girlfriend', 'temptation', 'sweaty', 'eye', 'sinking', 'kinda', 'best', 'wake', 'gotten']


accept? y/n y


startIDX: 4200
['departed', 'need', 'prolonged', 'sayin', 'fire', 'caress', 'burnin', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seein', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'callin', 'broken', 'along', 'desires', 'surrender', 'gettin', 'baby', 'love', 'anything']


accept? y/n n
replace word:  sayin
substitute:  saying


['departed', 'need', 'prolonged', 'saying', 'fire', 'caress', 'burnin', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seein', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'callin', 'broken', 'along', 'desires', 'surrender', 'gettin', 'baby', 'love', 'anything']


accept? y/n n
replace word:  burnin
substitute:  burning


['departed', 'need', 'prolonged', 'saying', 'fire', 'caress', 'burning', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seein', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'callin', 'broken', 'along', 'desires', 'surrender', 'gettin', 'baby', 'love', 'anything']


accept? y/n n
replace word:  seein
substitute:  seeing


['departed', 'need', 'prolonged', 'saying', 'fire', 'caress', 'burning', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seeing', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'callin', 'broken', 'along', 'desires', 'surrender', 'gettin', 'baby', 'love', 'anything']


accept? y/n n
replace word:  gettin
substitute:  getting


['departed', 'need', 'prolonged', 'saying', 'fire', 'caress', 'burning', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seeing', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'callin', 'broken', 'along', 'desires', 'surrender', 'getting', 'baby', 'love', 'anything']


accept? y/n n
replace word:  callin
substitute:  calling


['departed', 'need', 'prolonged', 'saying', 'fire', 'caress', 'burning', 'angel', 'affair', 'wet', 'beach', 'pretend', 'treat', 'suffer', 't', 'walkin', 'casanova', 'tender', 'smart', 'seeing', 'rougher', 'bein', 'warm', 'sad', 'sweet', 'peach', 'buy', 'calling', 'broken', 'along', 'desires', 'surrender', 'getting', 'baby', 'love', 'anything']


accept? y/n y


startIDX: 4400
['next', 'casue', 'reality', 'seconds', 'fiend', 'conceal', 'world', 'fantasizin', 'help', 'passionately', 'matter', 'met', 'heavily', 'clockin', 'watched', 'both', 'bond', 'swears', 'totally', 'times', 'desperately', 'watchin', 'already', 'trapped', 'addiction', 'position', 'looks', 'reinforce', 'role', 'curve', 'tounge', 'tied']


accept? y/n n
replace word:  casue
substitute:  cause


['next', 'cause', 'reality', 'seconds', 'fiend', 'conceal', 'world', 'fantasizin', 'help', 'passionately', 'matter', 'met', 'heavily', 'clockin', 'watched', 'both', 'bond', 'swears', 'totally', 'times', 'desperately', 'watchin', 'already', 'trapped', 'addiction', 'position', 'looks', 'reinforce', 'role', 'curve', 'tounge', 'tied']


accept? y/n n
replace word:  fantasizin
substitute:  fantasizing


['next', 'cause', 'reality', 'seconds', 'fiend', 'conceal', 'world', 'fantasizing', 'help', 'passionately', 'matter', 'met', 'heavily', 'clockin', 'watched', 'both', 'bond', 'swears', 'totally', 'times', 'desperately', 'watchin', 'already', 'trapped', 'addiction', 'position', 'looks', 'reinforce', 'role', 'curve', 'tounge', 'tied']


accept? y/n n
replace word:  clockin
substitute:  clocking


['next', 'cause', 'reality', 'seconds', 'fiend', 'conceal', 'world', 'fantasizing', 'help', 'passionately', 'matter', 'met', 'heavily', 'clocking', 'watched', 'both', 'bond', 'swears', 'totally', 'times', 'desperately', 'watchin', 'already', 'trapped', 'addiction', 'position', 'looks', 'reinforce', 'role', 'curve', 'tounge', 'tied']


accept? y/n n
replace word:  watchin
substitute:  watching


['next', 'cause', 'reality', 'seconds', 'fiend', 'conceal', 'world', 'fantasizing', 'help', 'passionately', 'matter', 'met', 'heavily', 'clocking', 'watched', 'both', 'bond', 'swears', 'totally', 'times', 'desperately', 'watching', 'already', 'trapped', 'addiction', 'position', 'looks', 'reinforce', 'role', 'curve', 'tounge', 'tied']


accept? y/n y


startIDX: 4600
['nights', 'ecstacy', 'sayin', 'burnin', 'drives', 'sittin', 'rolls', 'somebody', 'though', 'yearin', 'cried', 'thinkin', 'cheek', 'chest', 'feelin', 'problems', 'reads', 'tell', 'even', 'hope', 'else', 'wana', 'hurt', 'rub']


accept? y/n n
replace word:  sittin
substitute:  sitting


['nights', 'ecstacy', 'sayin', 'burnin', 'drives', 'sitting', 'rolls', 'somebody', 'though', 'yearin', 'cried', 'thinkin', 'cheek', 'chest', 'feelin', 'problems', 'reads', 'tell', 'even', 'hope', 'else', 'wana', 'hurt', 'rub']


accept? y/n n
replace word:  yearin
substitute:  yearning


['nights', 'ecstacy', 'sayin', 'burnin', 'drives', 'sitting', 'rolls', 'somebody', 'though', 'yearning', 'cried', 'thinkin', 'cheek', 'chest', 'feelin', 'problems', 'reads', 'tell', 'even', 'hope', 'else', 'wana', 'hurt', 'rub']


accept? y/n n
replace word:  thinkin
substitute:  thinking


['nights', 'ecstacy', 'sayin', 'burnin', 'drives', 'sitting', 'rolls', 'somebody', 'though', 'yearning', 'cried', 'thinking', 'cheek', 'chest', 'feelin', 'problems', 'reads', 'tell', 'even', 'hope', 'else', 'wana', 'hurt', 'rub']


accept? y/n n
replace word:  feelin
substitute:  feeling


['nights', 'ecstacy', 'sayin', 'burnin', 'drives', 'sitting', 'rolls', 'somebody', 'though', 'yearning', 'cried', 'thinking', 'cheek', 'chest', 'feeling', 'problems', 'reads', 'tell', 'even', 'hope', 'else', 'wana', 'hurt', 'rub']


accept? y/n y


startIDX: 4800
['fallin', 'wife', 'become', 'art', 'sorry', 'feelin', 'honey', 'cruel', 'act', 'beautiful', 'church', 'work', 'cool', 'pray']


accept? y/n n
replace word:  fallin
substitute:  falling


['falling', 'wife', 'become', 'art', 'sorry', 'feelin', 'honey', 'cruel', 'act', 'beautiful', 'church', 'work', 'cool', 'pray']


accept? y/n n
replace word:  feelin
substitute:  feeling


['falling', 'wife', 'become', 'art', 'sorry', 'feeling', 'honey', 'cruel', 'act', 'beautiful', 'church', 'work', 'cool', 'pray']


accept? y/n y


startIDX: 5000
['projects', 'sleepin', 'trustin', 'sinkin', 'feelin', 'undressin', 'gettin', 'minnie', 'callin', 'thinkin']


accept? y/n n
replace word:  sleepin
substitute:  sleeping


['projects', 'sleeping', 'trustin', 'sinkin', 'feelin', 'undressin', 'gettin', 'minnie', 'callin', 'thinkin']


accept? y/n trustin


startIDX: 5200
['took', 'getter', 'expert', 'key', 'heartbreaker', 'ignition', 'none', 'freaky', 'legend', 'experience', 'romance', 'wallet', 'whatever', 'hoochercoocher', 'quite', 'parking', 'smile', 'wild', 'own', 'personality', 'once', 'moocher', 'block', 'boyfriend', 'enormous', 'believe', 'catching', 'wore', 'turned', 'reputation', 'shook', 'maybe', 'pox', 'diamond', 'chicken', 'shimmy', 'penny']


accept? y/n y


startIDX: 5400
['social', 'held', 'lovers', 'possessed', 'kid', 'poverty', 'poker', 'became', 'excitement', 'crack', 'addicted', 'gun', 'hoodlum', 'helping', 'misery', 'dissed', 'hardrap', 'chinatown', 'tape', 'climbing', 'relationship', 'memory', 'addict', 'static', 'smokey', 'gambler', 'peace', 'politicians', 'instantly', 'pipe', 'their', 'killed', 'ghetto', 'sounds', 'nightstand', 'harlem', 'undressed', 'suddenly', 'miccops', 'smoker', 'inditement', 'payment', 'heading', 'same', 'alcoholic', 'pistol', 'bought', 'basshead', 'everyone']


accept? y/n y


startIDX: 5600
['sadly', 'king', 'sat', 'needing', 'sweden', 'dozen', '1', 'counted', 'ate', 'courses', 'jocking', 'horses', 'wheel', 'meal', 'stable', 'stuff', 'fortune', 'bank', 'plush', 'thoroughbred', 'yeah', 'walk', 'billion', 'nickels', 'steel', 'race', 'herself', 'dimes', 'verse', 'dreams', 'home', 'platt', 'sleep']


accept? y/n y


startIDX: 5800
['sleepin', 'brotha', 'wasnt', 'self', 'paniced']


accept? y/n n
replace word:  sleepin
substitute:  sleeping


['sleeping', 'brotha', 'wasnt', 'self', 'paniced']


accept? y/n n
replace word:  brotha
substitute:  brother


['sleeping', 'brother', 'wasnt', 'self', 'paniced']


accept? y/n n
replace word:  wasnt
substitute:  was


['sleeping', 'brother', 'was', 'self', 'paniced']


accept? y/n n
replace word:  paniced
substitute:  panicked


['sleeping', 'brother', 'was', 'self', 'panicked']


accept? y/n y


startIDX: 6000
['sucka', 'hopin', 'im', 'payin']


accept? y/n n
replace word:  sucka
substitute:  sucker


['sucker', 'hopin', 'im', 'payin']


accept? y/n n
replace word:  hopin
substitute:  hoping


['sucker', 'hoping', 'im', 'payin']


accept? y/n n
replace word:  im
substitute:  i


['sucker', 'hoping', 'i', 'payin']


accept? y/n n
replace word:  payin
substitute:  paying


['sucker', 'hoping', 'i', 'paying']


accept? y/n y


startIDX: 6200
['retire', 'truth', 'hotter', 'ever', 'em', 'rarely', 'hot', 'sex', 'hobbies', 'whiff', "em'", 'becasue', 'last', 'cuz', 'wax', 'niggas', 'windows', 'haha', 'indo', 'rubber', 'loosin', 'candle', 'breaking', '2', 'keeping', 'misplaced', 'companion', 'impossible', 'notion', 'livin', 'shit', 'medication', 'understanding', 'babies', 'board']


accept? y/n n
replace word:  em
substitute:  them


['retire', 'truth', 'hotter', 'ever', 'them', 'rarely', 'hot', 'sex', 'hobbies', 'whiff', "em'", 'becasue', 'last', 'cuz', 'wax', 'niggas', 'windows', 'haha', 'indo', 'rubber', 'loosin', 'candle', 'breaking', '2', 'keeping', 'misplaced', 'companion', 'impossible', 'notion', 'livin', 'shit', 'medication', 'understanding', 'babies', 'board']


accept? y/n break


 
 
 's 1 2pac 6 ; \n a about act addict addicted addiction admire adversaries affair after age aggravating ahead air aiyyo alcoholic all allowed alone along already always am ama amateurs america american an and angel another answer ant any anybody anything anywhere appealed are armed around art arteries as asking ass at ate attack audience autographs avoid awake ay baby back backing backstab backstage bad ballers bank barely barred base baseball bass basshead bat bats battle battles battling be beach beast beat beats beautiful became because become bed been before begin begins bein being believe beside besides best better big billion bird bit bite bizzy black blacks blind block blowing body bodygurad bold boldy bomb bond boom border born boss both bought box boy boyfriend boys bragging breath breathe bringing broke broken brother brothers brush build building bullet burn burnin burning burnt bust busting busy but buy buying by calculated call called callin calling calm came can canno

**The final dictionary is stored in the file `data/final_dict.txt`.**

At this point we have skipped over a lot of the actual work: we had to go through many words manually to check whether they are real words or just typos.



# Step 3: Correcting Typos using our new dictionary

In [58]:
import re
import collections

def tokens(text):
    """
    Lower-case the text and return its runs of word characters, in order.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(lowered)

# Load the dictionary file as one string.
wordlist = pre.get_text('data/final_dict.txt')

# tokens() only keeps \w+ runs, so ";" would never appear in its result;
# it is appended by hand so that ";" counts as a known token.
WORDS = tokens(wordlist) + [";"]
# Occurrence count per dictionary token; correct() ranks candidates with it.
WORD_COUNTS = collections.Counter(WORDS)

# Show the 20 most frequent dictionary tokens.
WORD_COUNTS.most_common(20)

[('oh', 9),
 ('i', 5),
 ('fo', 5),
 ('to', 4),
 ('ay', 4),
 ('yeah', 4),
 ('uh', 4),
 ('ah', 4),
 ('o', 4),
 ('the', 3),
 ('nothing', 3),
 ('locked', 3),
 ('me', 3),
 ('sitting', 3),
 ('wit', 3),
 ('who', 3),
 ('making', 3),
 ('motherfucking', 3),
 ('old', 3),
 ('smacking', 3)]

In [59]:
def edits0(word):
    """
    Zero-edit candidates: a set whose only member is the word itself.
    """
    unchanged = set()
    unchanged.add(word)
    return unchanged

def edits1(word):
    """
    Build the set of strings exactly one edit away from `word`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    # Walk every cut position, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut, even at the very end.
        for letter in letters:
            results.add(head + letter + tail)
        if not tail:
            continue
        # Deletion of the character right after the cut.
        results.add(head + tail[1:])
        # Replacement of the character right after the cut.
        for letter in letters:
            results.add(head + letter + tail[1:])
        # Transposition needs two characters after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """
    Build the set of strings reachable by applying edits1 twice to `word`.
    """
    two_away = set()
    for one_away in edits1(word):
        two_away.update(edits1(one_away))
    return two_away

def known(words):
    """
    Keep only the candidates that are keys of WORD_COUNTS, as a set.
    """
    recognised = set()
    for candidate in words:
        if candidate in WORD_COUNTS:
            recognised.add(candidate)
    return recognised

def correct(word):
    """
    Return the best dictionary spelling for `word`.

    Candidates are searched in widening steps: the word itself, then every
    string one edit away, then every string two edits away. The first step
    that yields at least one dictionary word supplies the candidates. If
    nothing within two edits is known, `word` is returned unchanged.

    The candidate with the highest count in WORD_COUNTS wins. Ties are
    broken alphabetically: the candidates form a set, and picking "the first
    maximal element" of a set of strings can differ from one kernel session
    to the next, which made corrections non-reproducible.
    """
    for generate in (edits0, edits1, edits2):
        candidates = known(generate(word))
        if candidates:
            # Highest count first, then alphabetical order so ties are stable.
            return min(candidates, key=lambda w: (-WORD_COUNTS[w], w))
    return word

def correct_text(text):
    """
    Spell-correct every space-separated token of `text`.

    Newlines are padded with spaces first so that each one becomes a token
    of its own. The tokens are then corrected one by one and joined back
    with single spaces.

    Tokens that are empty (from consecutive spaces) or consist only of
    whitespace (the isolated newlines) are passed through untouched. They
    are a single edit away from every one-letter string, so correct() would
    otherwise turn them into a one-letter dictionary word whenever the
    dictionary contains one.
    """
    processed = text.replace('\n', ' \n ')
    corrected = [
        word if not word.strip() else correct(word)
        for word in processed.split(" ")
    ]

    return " ".join(corrected)
    
# Quick check of the corrector on a dropped-g spelling.
print (correct("goin"))

going


In [60]:
# Peek at the first 60 characters of `expanded` (defined in an earlier cell).
expanded[:60]

'as real as it seems the american dream ; is not nothing but '

In [61]:
# Run the dictionary-based spell correction over the whole expanded text.
corrected = correct_text(expanded)

In [62]:
# List every space-separated token of the corrected text that is purely numeric.
words = corrected.split(" ")
# filter() is lazy: list() below consumes it, so numbers_str is exhausted afterwards.
numbers_str = filter( lambda word: word.isnumeric(), words )
print(list(numbers_str))

['6', '6', '90', '12', '3', '4', '5', '6', '12', '2', '2', '2', '50', '357', '2', '3', '93', '10', '1', '1', '2', '3', '380', '187', '6', '25', '9', '9', '9', '10', '10', '45', '165', '25', '1', '2', '1', '1', '2', '1', '1', '2', '1', '1', '2', '275', '90', '165', '1995', '1', '1', '2', '2', '35', '45', '2', '911', '2', '25', '50', '60', '11', '18', '12', '9', '95', '1', '2', '9', '9', '2020', '500', '6', '12', '500', '9', '3', '50', '12', '923', '106', '16', '71', '3', '12', '6', '8231549', '8599', '50', '30', '90', '62', '90', '62', '12', '45', '45', '500', '44', '45', '95', '1', '1', '2', '45', '80', '40', '70', '12', '23', '45', '45', '9', '10', '10', '187', '10', '10', '35', '22', '45', '45', '60', '13', '1']


In [64]:
import inflect

def number_to_word(number_str):
    """
    Spell out a string of digits as words, choosing the reading by its length.

    * one or two characters: read as a whole number ("55" -> "fifty five");
      a value of zero is returned as "ou".
    * exactly four characters: read as two two-digit numbers
      ("1995" -> "nineteen ninety five").
    * any other length: read digit by digit ("911" -> "nine one one").

    Hyphens in the spelled-out numbers are replaced by spaces.
    """
    engine = inflect.engine()
    length = len(number_str)

    if length <= 2:
        spoken = engine.number_to_words(int(number_str))
        return "ou" if spoken == "zero" else spoken.replace("-", " ")

    if length == 4:
        # Split into leading and trailing pair, e.g. "19" and "95".
        halves = (number_str[:2], number_str[2:])
        spoken = " ".join(engine.number_to_words(int(half)) for half in halves)
        return spoken.replace("-", " ")

    # Every remaining length is spelled one digit at a time.
    return " ".join(engine.number_to_words(int(digit)) for digit in number_str)

# Show one two-digit reading and one digit-by-digit reading.
print( number_to_word("55"), number_to_word("911"))

fifty five nine one one


In [65]:
def get_number_mapping( text ):
    """
    Map every standalone number token in `text` to its spelled-out form.

    The text is split on single spaces. Each distinct token made up only of
    decimal digits becomes a key whose value is number_to_word(token).

    str.isdecimal() is used rather than str.isnumeric(): the latter also
    accepts characters such as fractions or superscript digits, which int()
    inside number_to_word cannot parse and would raise ValueError on.
    """
    numeric_tokens = { word for word in text.split(" ") if word.isdecimal() }
    return { number: number_to_word(number) for number in numeric_tokens }

# Build the digit-string -> words mapping for the corrected text and display it.
encoding = get_number_mapping(corrected)
encoding

{'5': 'five',
 '11': 'eleven',
 '2020': 'twenty twenty',
 '60': 'sixty',
 '30': 'thirty',
 '187': 'one eight seven',
 '911': 'nine one one',
 '50': 'fifty',
 '9': 'nine',
 '923': 'nine two three',
 '106': 'one zero six',
 '80': 'eighty',
 '45': 'forty five',
 '13': 'thirteen',
 '6': 'six',
 '12': 'twelve',
 '35': 'thirty five',
 '4': 'four',
 '165': 'one six five',
 '275': 'two seven five',
 '22': 'twenty two',
 '93': 'ninety three',
 '10': 'ten',
 '62': 'sixty two',
 '1': 'one',
 '70': 'seventy',
 '71': 'seventy one',
 '90': 'ninety',
 '3': 'three',
 '500': 'five zero zero',
 '23': 'twenty three',
 '95': 'ninety five',
 '8231549': 'eight two three one five four nine',
 '1995': 'nineteen ninety five',
 '40': 'forty',
 '16': 'sixteen',
 '2': 'two',
 '18': 'eighteen',
 '44': 'forty four',
 '357': 'three five seven',
 '380': 'three eight zero',
 '25': 'twenty five',
 '8599': 'eighty five ninety nine'}

In [66]:
def encode_numbers(text, encoding):
    """
    Replace every standalone number token in `text` with its spelled-out form.

    `text` is split on single spaces, the same way get_number_mapping finds
    its keys. A token is replaced only when the whole token is a key of
    `encoding`; every other token, and the spacing, is kept as it is.

    Matching whole tokens instead of substrings keeps digits that are part
    of a longer token intact (for example "2pac" with a mapping for "2") and
    makes the order in which the mapping is applied irrelevant.

    Returns the encoded text as a new string.
    """
    return " ".join( encoding.get(token, token) for token in text.split(" ") )

# Spell out the numbers of the corrected text using the mapping built above.
number_encoded = encode_numbers(corrected, encoding)
               
# Preview the first 500 characters of the number-encoded text.
print( number_encoded[:500])

as real as it seems the american dream ; is not nothing but another calculated schemes ; to get us locked up shot up back in chains ; to deny us of the future rob our names ; kept my history of mystery but now i see ; the american dream was not meant for me ; cause lady liberty is a hypocrite she lied to me ; promised me freedom education equality ; never gave me nothing but slavery ; and now look at how dangerous you made me ; calling me a mad man cause i am strong and bold ; with this dump ful


In [67]:
# Save the prepared lyrics, turning every ";" into a newline.
# NOTE(review): this writes `corrected`, not `number_encoded`, so the
# spelled-out numbers computed by encode_numbers() are not part of the saved
# file. Confirm this is intended.
pre.write_text("data/prepped/clean2_pac.txt", corrected.replace(";", "\n") )

In [68]:
import tools.architectures as nn


# Only the first 20000 elements of the prepped text are used for this run
# (presumably characters; depends on what pre.get_text returns).
text = pre.get_text("data/prepped/clean2_pac.txt")[:20000]

#print(corrected.replace(" lbreak ", "\n"))
# processed
# text = corrected.replace(" lbreak ", "\n")

# Vocabulary built from the truncated text only.
vocab = pre.Vocabulary(text)

# Model hyperparameters.
# NOTE(review): HIDDEN_LAYER_SIZE is defined but not passed to rnn.build()
# in this cell. Confirm whether SingleLayerRNN.build expects it.
NUM_LAYERS = 2
HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()
TIMESTEPS = 50

# Training-loop settings.
EPOCHS = 30
BATCH_SIZE = 256

# Second argument of embedding.build() below.
EMBEDDING_SIZE = 500


# Training samples and labels derived from the text with TIMESTEPS as window argument.
data, labels = vocab.making_embedded_one_hot(text, TIMESTEPS)

embedding = nn.LeanableEmbedding(name = "learnable-embedding")
embedding.build(VOCAB_SIZE, EMBEDDING_SIZE)

rnn = nn.SingleLayerRNN(name = "single-pac")
rnn.build(NUM_LAYERS, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0, embedding=embedding)

# Train the model; the captured output below shows loss, accuracy and a sampled text per epoch.
nn.train(rnn, data, labels, vocab, epochs=EPOCHS, batch_size=BATCH_SIZE, temperature=.6, embedding=True)

(999, 500)
(?, 50)


Epoch 1/30
Loss:    	 6.872012615203857
Accuracy:	 1.171875
as real as it seems the american dream 
 is not nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny us of the future rob our names 
 kept my history of
Seed:as real as it seems the american dream  
  is not nothing but another calculated schemes  
  to get us locked up shot up back in chains  
  to deny us of the future rob our names  
  kept my history of 
Result:as real as it seems the american dream  
  is not nothing but another calculated schemes  
  to get us locked up shot up back in chains  
  to deny us of the future rob our names  
  kept my history of deal intro lie should promise hits freddy confident bragging temptation affair pop tipsy many totally park speaking vocalist mister brother lucky rockin tms street filled intro offers humming lesson trying claim ears handle warm rhyme oh stricly moving hell large health lyrics upset thought crazy thumping c

In [None]:
import tools.processing as pre
import tools.architectures as nn

# NOTE(review): `text` is not assigned in this cell; it relies on the value
# left in the kernel by an earlier cell.
vocab = pre.Vocabulary(text)

# Model hyperparameters for the multi-layer variant.
NUM_LAYERS = 3
HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()
TIMESTEPS = 30

# Training-loop settings.
EPOCHS = 20
BATCH_SIZE = 256

# NOTE(review): unused in this cell, since the embedding lines below are commented out.
EMBEDDING_SIZE = 500


# Training samples and labels derived from the text with TIMESTEPS as window argument.
data, labels = vocab.making_one_hot(text, TIMESTEPS)

# embedding = nn.LeanableEmbedding(name = "learnable-embedding")
# embedding.build(VOCAB_SIZE, EMBEDDING_SIZE)

rnn = nn.MultiLayerRNN(name = "multi-pac")
rnn.build(NUM_LAYERS, HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)

# Train without a learnable embedding (no embedding argument is passed here).
nn.train(rnn, data, labels, vocab, epochs=EPOCHS, batch_size=BATCH_SIZE, temperature=.5)



Epoch 1/20
Loss:    	 7.360078811645508
Accuracy:	 0.04258490353822708
Seed:as real as it seems the american dream 
 is not nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny 
Result:as real as it seems the american dream 
 is not nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny is 
 
 
 i 
 
 
 
 the the me 
 the huh 
 a 
 to the a 
 get 
 the the the 
 
 
 
 a a i 
 know a 
 
 
 i 
 
 
 
 me and 
 
 
 
 
 
 
 the 
 i in the 
 
 
 a 
 
 
 they am 
 
 
 ya the she with 
 
 the the the 
 the 
 
 the 
 the 
 
 the 
 
 
 
 i 
 
 
 i 
 
 the 
 a 
 
 
 
 
 to 
 the the 
 
 
 
 
 
 
 
 
 a a a ya i 
 
 
 
 
 
 a and the a 
 that a 
 
 
 am 
 the a its 
 motion 
 i 
 
 
 
 
 
 the 
 the a 
 
 
 he 
 to i a me 
 
 
 i 
 i me the 
 a a 
 a 
 the 
 i 
 
 a 
 i the i 
 
 a 
 
 they to 
 
 do 
 to 
 
 i 
 is 
 
 
 i my the the a 
 i 
 
 
 
 it 
 i 
 
 
 
 
 
 the 
 to i 
 to the a 
 
 
 
 a the my the 
 
 the