In [None]:
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

import sys
import regex as re

# Regex flags shared by hashtag() and tokenize(): MULTILINE makes ^ and $ match
# at every line boundary, DOTALL lets "." match newline characters too.
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    """Expand a matched ``#hashtag`` into a ``<hashtag>`` tag plus its words.

    Meant to be used as a replacement callback for ``re.sub``.

    Args:
        text: match object whose matched text starts with ``#``.

    Returns:
        ``"<hashtag> body <allcaps>"`` (body lower-cased) when the body is
        entirely upper-case; otherwise ``"<hashtag>"`` followed by the body
        split in front of every capital letter, joined by single spaces.
    """
    hashtag_body = text.group()[1:]
    if hashtag_body.isupper():
        # Lower-case the body here, mirroring allcaps(); otherwise the later
        # all-caps pass in tokenize() tags the same word a second time.
        return "<hashtag> {} <allcaps>".format(hashtag_body.lower())

    pieces = re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS)
    # A body that starts with a capital yields an empty first fragment, which
    # would otherwise show up as a doubled space after "<hashtag>".
    return " ".join(["<hashtag>"] + [piece for piece in pieces if piece])

def allcaps(text):
    """Lower-case a matched run of capitals and append the ``<allcaps>`` tag.

    Replacement callback for ``re.sub``: ``text`` is the match object and the
    returned string takes the place of the matched span.
    """
    matched = text.group()
    return "{} <allcaps>".format(matched.lower())


def tokenize(text):
    """Normalise a tweet into the token vocabulary used by the GloVe Twitter vectors.

    Applies an ordered series of regex substitutions (URLs, mentions,
    emoticons, hearts, numbers, hashtags, repeated punctuation, elongated
    words, all-caps words) and returns the result lower-cased. The order
    matters: for example URLs are replaced before slashes are spaced out.

    Args:
        text: raw tweet text.

    Returns:
        The processed, lower-cased string.
    """
    # Building blocks for emoticon patterns.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"
    face = eyes + nose      # eyes on the left, e.g. ":-"
    mirrored = nose + eyes  # eyes on the right, e.g. "-:"

    substitutions = [
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"/", " / "),
        (r"@\w+", "<user>"),
        (face + r"[)dD]+|[)dD]+" + mirrored, "<smile>"),
        (face + r"p+", "<lolface>"),
        (face + r"\(+|\)+" + mirrored, "<sadface>"),
        (face + r"[\/|l*]", "<neutralface>"),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        # The Ruby script tags nearly everything as <allcaps>
        # (pattern ([^a-z0-9()<>'`\-]){2,}); this port deliberately limits the
        # tag to runs of two or more capital letters.
        (r"([A-Z]){2,}", allcaps),
    ]

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text, flags=FLAGS)

    return text.lower()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# Smoke-test the preprocessing on one hand-written tweet.
text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
preproc_text = tokenize(text)
print(preproc_text)

# Tweet-aware tokenizer and stop-word list; both are reused by the file
# processing cell further down.
tknzr = TweetTokenizer()
nltk.download('stopwords')
stop = stopwords.words('english')
# Also drop the placeholder tags emitted by tokenize() and a few corpus-specific words.
stop += ['<hashtag>', '<url>', '<allcaps>', '<number>', '<user>', '<repeat>', '<elong>', 'websummit','http','https']

# Tokenize the processed sample and filter out stop words.
parts = tknzr.tokenize(preproc_text)
clean = [token for token in parts if token not in stop]
print(clean)

In [None]:
from nltk.metrics import edit_distance

class SpellingReplacer(object):
    """Replace a misspelled word with the dictionary's first suggestion.

    A word is only replaced when the first suggestion is within ``max_dist``
    edits of it; otherwise the word is returned unchanged.

    The module-level name ``enchant`` must be importable/imported before the
    first instance is created, because ``__init__`` looks it up at call time.
    """

    def __init__(self, dict_name='en_GB', max_dist=2):
        """Create a replacer.

        Args:
            dict_name: enchant dictionary tag to load.
            max_dist: largest edit distance at which a suggestion is accepted.
        """
        self.spell_dict = enchant.Dict(dict_name)
        # Honour the caller's threshold (previously always forced to 2).
        self.max_dist = max_dist

    def replace(self, word):
        """Return ``word`` itself or its accepted spelling correction.

        Args:
            word: the token to check.

        Returns:
            ``word`` when it is empty, spelled correctly, has no suggestions,
            or its first suggestion is further than ``max_dist`` edits away;
            otherwise the first suggestion.
        """
        if not word:
            # Nothing to check; also avoids handing an empty string to the dictionary.
            return word
        if self.spell_dict.check(word):
            return word

        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
def spell_check(word_list):
    """Spell-correct every word in ``word_list``.

    Args:
        word_list: iterable of word strings.

    Returns:
        A new list with each word passed through ``SpellingReplacer.replace``,
        in the original order. An empty input yields an empty list without
        creating a replacer.
    """
    words = list(word_list)
    if not words:
        return []
    # One replacer (and therefore one dictionary load) for the whole batch
    # instead of a fresh one per word.
    replacer = SpellingReplacer()
    return [replacer.replace(word) for word in words]

In [None]:
import nltk
from pathlib import Path
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import os
import gensim 
from gensim import corpora, similarities, models
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import enchant
import timeit
import pickle 


lemmatizer = WordNetLemmatizer()

# Word-only tokenizer: keeps runs of \w characters and drops everything else.
tokenizer_nopunct = RegexpTokenizer(r'\w+')

total_tokens = []  # one cleaned token list per input file
count = 0          # number of files handled so far

# Files are opened while the working directory is `userdir`.
os.chdir(userdir)
try:
    print("Starting to read files:")

    for filepath in filepaths:
        contents = Path(filepath).read_text()

        # NOTE(review): punctuation is stripped here, *before* tokenize() runs,
        # so '#', '@', ':' and '/' never reach it and its URL / hashtag /
        # mention / emoticon rules presumably cannot match -- confirm intended.
        tokens_nopunct = tokenizer_nopunct.tokenize(contents)
        concat_str = ' '.join(tokens_nopunct)
        preproc_text = tokenize(concat_str)

        parts = tknzr.tokenize(preproc_text)

        # Drop stop words, lemmatize, discard one-character leftovers, then spell-correct.
        clean1 = [lemmatizer.lemmatize(i.lower()) for i in parts if i not in stop]
        clean1 = [word for word in clean1 if len(word) > 1]
        clean1 = spell_check(clean1)

        total_tokens.append(clean1)

        # Progress message on the 1st, 31st, 61st, ... file.
        if count % 30 == 0:
            print(str(count + 1) + " files processed.")

        count = count + 1
finally:
    # Always step back out, so a failure part-way through does not leave the
    # kernel's working directory inside `userdir`.
    os.chdir('..')

# Persist the per-file token lists (pickle format despite the .txt suffix).
with open("preprocessed_user2.txt", "wb") as fp:   #Pickling
    pickle.dump(total_tokens, fp)