In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import csv
import spacy


# DATA PREPROCESSING

### Load Data

In [None]:
# Read the positive training tweets, one list entry per line (trailing "\n" kept).
# The context manager closes the file even if reading fails.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open("/home/xiaochenzheng/Desktop/cil-spring20-project-data_preprocessing/twitter-datasets/train_pos.txt") as f:
    tweets = f.readlines()

### To DataFrame

In [None]:
# One row per raw tweet; the untouched text lives in the 'origin' column.
train_pos = pd.DataFrame({'origin': tweets})

In [None]:
# Every tweet in this file gets label 0 (assumption: 0 encodes "positive").
train_pos = train_pos.assign(label=0)

### 1) Remove  &lt;user&gt;,  &lt;url&gt;, \\n

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression whose matches are deleted.

    Returns:
        The cleaned string (unchanged if nothing matches).
    """
    # Substitute with the pattern itself. Feeding each matched substring back into
    # re.sub would reinterpret it as a new regex, so matches containing
    # metacharacters (e.g. "(b)") or context-dependent patterns (e.g. r"\bcat\b")
    # would remove the wrong text.
    return re.sub(pattern, '', input_txt)

In [None]:
# Strip the <user> / <url> placeholders and the trailing newline, one after another,
# starting from the raw text and storing the result in the 'tidy' column.
strip_tokens = np.vectorize(remove_pattern)
cleaned = train_pos['origin']
for token in ("<user>", "<url>", "\n"):
    cleaned = strip_tokens(cleaned, token)
train_pos['tidy'] = cleaned

In [None]:
# Display the frame to inspect the 'tidy' column produced by step 1.
train_pos

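Note: if punctuation were removed at this point, contractions would be broken apart (e.g. `haven't` → `haven t`), so contractions are expanded (step 3) before punctuation is stripped (step 4).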

### 2) Replace Abbreviations + Spell Correction using Text File

In [None]:
# https://medium.com/nerd-stuff/python-script-to-turn-text-message-abbreviations-into-actual-phrases-d5db6f489222

# Parsed abbreviation tables, keyed by file path, so each file is read only once
# per kernel session (re-run this cell to pick up edits to the file).
_SLANG_CACHE = {}


def _load_slang(path):
    """Parse an ``ABBREVIATION=phrase`` file into a dict, caching the result per path."""
    if path not in _SLANG_CACHE:
        mapping = {}
        with open(path, "r") as slang_file:
            for row in csv.reader(slang_file, delimiter="="):
                # Blank or malformed lines have fewer than two fields; skip them
                # instead of failing on row[0] / row[1].
                if len(row) >= 2:
                    # Later lines override earlier ones for a repeated abbreviation.
                    mapping[row[0]] = row[1]
        _SLANG_CACHE[path] = mapping
    return _SLANG_CACHE[path]


def translator(user_string, slang_path="./slang.txt"):
    """Replace slang abbreviations in ``user_string`` with their full phrases.

    The text is split on single spaces. Each piece is stripped of characters
    outside ``[a-zA-Z0-9-_.]``, upper-cased and looked up in the abbreviation
    table; a piece that matches is replaced as a whole by the phrase, any other
    piece is kept exactly as it was.

    Args:
        user_string: Text to translate.
        slang_path: Path of the ``ABBREVIATION=phrase`` text file.

    Returns:
        The translated text, pieces re-joined with single spaces.

    Raises:
        OSError: If the abbreviation file cannot be opened.
    """
    slang = _load_slang(slang_path)
    pieces = user_string.split(" ")
    for position, piece in enumerate(pieces):
        # Removing Special Characters.
        lookup_key = re.sub('[^a-zA-Z0-9-_.]', '', piece).upper()
        if lookup_key in slang:
            pieces[position] = slang[lookup_key]
    return ' '.join(pieces)

In [None]:
# Expand slang abbreviations in every tweet. Series.apply keeps the existing index,
# so this does not depend on the labels being exactly 0..n-1 (which pairing an
# enumerate counter with .loc did) and avoids slow row-by-row .loc writes.
train_pos['tidy'] = train_pos['tidy'].apply(translator)

In [None]:
# Display the frame to inspect the 'tidy' column after abbreviation replacement.
train_pos

### 3) Expand the contraction n't to not (have, be, can, will, may, must, shall, do)

In [None]:
# Irregular contractions must be expanded before the generic "n't" -> " not" rule,
# otherwise "won't" becomes "wo not", "can't" becomes "ca not" and "shan't"
# becomes "sha not". regex=False: these are plain substrings, not patterns.
irregular_contractions = {
    "won't": "will not",
    "can't": "can not",
    "shan't": "shall not",
}
for contraction, expansion in irregular_contractions.items():
    train_pos['tidy'] = train_pos['tidy'].str.replace(contraction, expansion, regex=False)

In [None]:
# Generic rule: every remaining "n't" suffix becomes a separate " not" word.
train_pos['tidy'] = train_pos['tidy'].str.replace(pat="n't", repl=" not")

### 4) Remove Punctuation, Numbers, and Special Characters

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is required: the pattern is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the text
# untouched.
train_pos['tidy'] = train_pos['tidy'].str.replace("[^a-zA-Z#]", " ", regex=True)

### 5) Remove short words (len(word)<=3) (optional)

In [None]:
# Drop words of three characters or fewer, except the negation "not": step 3
# expands "n't" into " not" on purpose, and a plain length filter would silently
# throw that negation away again.
keep_short_words = {"not"}
train_pos['tidy'] = train_pos['tidy'].apply(
    lambda x: ' '.join(w for w in x.split() if len(w) > 3 or w in keep_short_words)
)

### 6) Tokenized

In [None]:
# Whitespace-split each cleaned tweet into a list of word tokens.
tokenized_tweets = train_pos['tidy'].map(str.split)

### 7) Extract Root

In [None]:
# Import only the name that is used; a star import hides where names come from
# and can shadow names already defined in the notebook.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Replace every token of every tweet with its Porter stem.
tokenized_tweets = tokenized_tweets.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

### Detokenized

In [None]:
from sacremoses import MosesDetokenizer

detokenizer = MosesDetokenizer()

# Join each token list back into a single string. Series.apply works on the
# values directly, so it does not assume the index labels are 0..n-1 the way an
# integer-indexed `tokenized_tweets[i] = ...` loop does.
tokenized_tweets = tokenized_tweets.apply(
    lambda tokens: detokenizer.detokenize(tokens, return_str=True)
)

train_pos['tidy'] = tokenized_tweets

# Summary

In [None]:
from wordcloud import WordCloud

# Concatenate all cleaned tweets into one text and build a word cloud from it.
all_words = ' '.join(train_pos['tidy'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

# Render the cloud without axes and save it next to the notebook.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
fig.savefig("test.png", dpi=600)

# Stemming 
Steps 6 and 7 can be replaced by the following stemming methods.

Stemming = the process of reducing inflected (or derived) words to their word stem, base, or root form.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### Porter Stemmer

In [None]:
# Porter stemmer instance used by the demo loop in the next cell.
ps = nltk.stem.PorterStemmer()

In [None]:
# Demo: tokenize, filter and Porter-stem the tweets at positions 1..9.
# The stop-word set does not change between tweets, so it is built once.
stop_words = set(stopwords.words('english'))
# Tokens that make up the "<user>" placeholder. An explicit set is used because
# `w in "<user>"` is a substring test and would also drop real words such as
# "us" or "use".
placeholder_tokens = {"<user>", "<", "user", ">"}
for _tweet in tweets[1:10]:
    print(_tweet)
    # split into words
    tokens = word_tokenize(_tweet)
    # filter out <user>
    words = [w for w in tokens if w not in placeholder_tokens]
    # filter stand-alone punctuation out
    words = [word for word in words if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    print(words)
    # keep and show the stems; previously they were computed and thrown away
    stems = [ps.stem(_word) for _word in words]
    print(stems)
    print("------------------------------------------------------------------")

### Snowball Stemmer

In [None]:
# English Snowball stemmer instance used by the demo loop in the next cell.
sno = nltk.stem.SnowballStemmer('english')

In [None]:
# Demo: tokenize, filter and Snowball-stem the tweets at positions 1..9.
# The stop-word set does not change between tweets, so it is built once.
stop_words = set(stopwords.words('english'))
# Tokens that make up the "<user>" placeholder. An explicit set is used because
# `w in "<user>"` is a substring test and would also drop real words such as
# "us" or "use".
placeholder_tokens = {"<user>", "<", "user", ">"}
for _tweet in tweets[1:10]:
    print(_tweet)
    # split into words
    tokens = word_tokenize(_tweet)
    # filter out <user>
    words = [w for w in tokens if w not in placeholder_tokens]
    # filter stand-alone punctuation out
    words = [word for word in words if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    print(words)
    # keep and show the stems; previously they were computed and thrown away
    stems = [sno.stem(_word) for _word in words]
    print(stems)
    print("------------------------------------------------------------------")

# LEMMATIZATION

Lemmatization = the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

### NLTK Lemmatizer

In [None]:
# WordNet lemmatizer instance, taken from nltk.stem like the stemmers above
# rather than through the incidental `nltk.wordnet` alias.
lemma = nltk.stem.WordNetLemmatizer()

In [None]:
# Demo: tokenize, filter and WordNet-lemmatize the tweets at positions 1..9.
# The stop-word set does not change between tweets, so it is built once.
stop_words = set(stopwords.words('english'))
# Tokens that make up the "<user>" placeholder. An explicit set is used because
# `w in "<user>"` is a substring test and would also drop real words such as
# "us" or "use".
placeholder_tokens = {"<user>", "<", "user", ">"}
for _tweet in tweets[1:10]:
    print(_tweet)
    # split into words
    tokens = word_tokenize(_tweet)
    # filter out <user>
    words = [w for w in tokens if w not in placeholder_tokens]
    # filter stand-alone punctuation out
    words = [word for word in words if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    print(words)
    # keep and show the lemmas; previously they were computed and thrown away
    lemmas = [lemma.lemmatize(_word) for _word in words]
    print(lemmas)
    print("------------------------------------------------------------------")

### SpaCy Lemmatizer

In [None]:
# Load the spaCy pipeline named 'en_core_web_sm'; used by the demo loop in the next cell.
sp = spacy.load('en_core_web_sm')

In [None]:
# Demo: run the spaCy pipeline on the tweets at positions 1..9 and print the
# lemma of every token.
for _tweet in tweets[1:10]:
    print(_tweet)
    words = sp(_tweet)
    lemmas = [word.lemma_ for word in words]
    print(lemmas)
    print("------------------------------------------------------------------")