In [13]:
import re, collections
from nltk.corpus import words as w


def words(text):
    return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1   
    return model

with open("big.txt", "r") as big:
    word_corpus = big.read()
for word in w.words():
    word_corpus += word
    

NWORDS = train(words(word_corpus))
with open("list_of_abuses.txt", "r") as abuse_list:
    abuses = abuse_list.read().split()
    for abuse in abuses:
        NWORDS[abuse] = 100

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
#     print(word)
    s = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in s if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in s for c in alphabet if b]
    inserts    = [a + c + b     for a, b in s for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words):
    try:
        return [int(w) for w in words] #to take care of purely numeric words
    except:
        return set(w for w in words if w.lower() in NWORDS)

def correct(word):
    if word[0] not in alphabet: 
        return word
    else:
        word = re.sub(r'(.)\1+', r'\1\1', word)
        candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
        return max(candidates, key=NWORDS.get)

In [14]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w!@#$%^&*]+)', # To group symbols together
    r'(?:[\w_]+)', # other words
    
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    tokens = tokens_re.findall(s)
    for i in range(len(tokens)):
        clean_token = correct(tokens[i])
        tokens[i] = clean_token
    return tokens
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))

['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


In [15]:
tweet2 = "Your a retard go post your head up your f*ck"
preprocess(tweet2)

['Your', 'a', 'retard', 'go', 'post', 'your', 'head', 'up', 'your', 'fuck']

In [9]:
tweet3 = "You are\xa0 a fukin moron. \xa0\xa0 You are just butthurt that you got rejected on WIkipedia.\n\n Call yoursilf scolar or whatever.\xa0 I am better than you,\xa0 just 14 and already a Wikipedia administrator.\xa0 You are just a stupid ashole."
preprocess(tweet3)

['You',
 'are',
 'a',
 'fuckin',
 'moron',
 '.',
 'You',
 'are',
 'just',
 'butthurt',
 'that',
 'you',
 'got',
 'rejected',
 'on',
 'WIkipedia',
 '.',
 'Call',
 'yourself',
 'solar',
 'or',
 'whatever',
 '.',
 'I',
 'am',
 'better',
 'than',
 'you',
 ',',
 'just',
 '14',
 'and',
 'already',
 'a',
 'Wikipedia',
 'administrator',
 '.',
 'You',
 'are',
 'just',
 'a',
 'stupid',
 'asshole',
 '.']

In [12]:
tweet4 = "heeellllooooo, I am soooooo haaaaaaappppppyyyyy 999999 "
preprocess(tweet4)

['hello', ',', 'I', 'am', 'so', 'happy', '999999']

In [24]:
t5 = "FANS ARE SOOOO BLIND DUMB. refs handed game to spurs??? THUNDER MADE two more free throws than the spurs (19 to 17) AND SPURS STILL WON! shut it haters"
preprocess(t5)

['FANS',
 'ARE',
 'SOOOO',
 'BLIND',
 'DUMB',
 '.',
 'left',
 'handed',
 'game',
 'to',
 'spurs',
 '?',
 '?',
 '?',
 'THUNDER',
 'MADE',
 'two',
 'more',
 'free',
 'throws',
 'than',
 'the',
 'spurs',
 '(',
 '19',
 'to',
 '17',
 ')',
 'AND',
 'SPURS',
 'STILL',
 'WON',
 '!',
 'shut',
 'it',
 'waters']

In [21]:
"mom" in w.words()

False