## Helper Text Preprocessing

In [1128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.metrics.distance  import edit_distance
from nltk.corpus import words
import os
from nltk.corpus import stopwords
from textblob import TextBlob
from spellchecker import SpellChecker
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from autocorrect import Speller

In [1129]:
# Load the raw dataset from a CSV file given by a path relative to the working directory.
data=pd.read_csv('A1_dataset.csv')

In [1130]:
# Preview the first two rows of the raw frame.
data.head(2)

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...


In [1131]:
# Keep only the LABEL and TEXT columns. The .copy() makes data_new independent
# of data, so later overwrites of data_new['TEXT'] do not touch the raw frame.
data_new = data[['LABEL', 'TEXT']].copy()

In [1132]:
# Preview the first two rows of the working frame before any preprocessing.
data_new.head(2)

Unnamed: 0,LABEL,TEXT
0,0,About to get threaded and scared
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...


## Helper Functions

In [1133]:
def tokenize(data):
    """Replace each entry of ``data_new['TEXT']`` with its ``word_tokenize`` result.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    token_column = data_new['TEXT'].apply(word_tokenize)
    data_new['TEXT'] = token_column
    return data_new['TEXT']

def lower(data):
    """Lower-case every entry of ``data_new['TEXT']`` in place.

    Entries are first converted with ``astype(str)``, so any non-string value
    is lower-cased in its ``str()`` form.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    as_text = data_new['TEXT'].astype(str)
    data_new['TEXT'] = as_text.str.lower()
    return data_new['TEXT']

def remove_punctuations(data):
    """Return ``data`` with every ``string.punctuation`` character replaced by a space.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        A string of the same length as ``data``; each punctuation character
        becomes a single ``' '`` and all other characters are unchanged.
    """
    # One translate() pass handles all punctuation characters at once,
    # rather than rescanning the whole string once per character.
    to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return data.translate(to_space)

def remove_punctuation_tokens(data):
    """Apply ``remove_punctuations`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    without_punctuation = data_new['TEXT'].apply(remove_punctuations)
    data_new['TEXT'] = without_punctuation
    return data_new['TEXT']


def remove_stopwords(data):
    """Drop English stop words from every entry of ``data_new['TEXT']`` in place.

    Each entry is split on whitespace, words found in
    ``stopwords.words("english")`` are discarded, and the remaining words are
    re-joined with single spaces.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    # A frozenset gives constant-time membership tests; a list would be
    # scanned linearly for every word of every row.
    stop = frozenset(stopwords.words("english"))

    def _keep_content_words(text):
        return ' '.join(word for word in text.split() if word not in stop)

    data_new['TEXT'] = data_new['TEXT'].apply(_keep_content_words)
    return data_new['TEXT']

def remove_URL(data):
    """Return ``data`` with every ``http``-prefixed run of non-whitespace removed.

    Anything starting with the literal ``http`` is deleted up to the next
    whitespace character; surrounding whitespace is left as is.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', data)

def remove_URL_data(data):
    """Apply ``remove_URL`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    without_urls = data_new['TEXT'].apply(remove_URL)
    data_new['TEXT'] = without_urls
    return data_new['TEXT']

def remove_HTMLTag(data):
    """Return ``data`` with each ``&name;`` sequence replaced by a single space.

    A match is an ampersand, one or more word characters, then a semicolon
    (e.g. ``&amp;``). Sequences containing ``#`` such as ``&#39;`` do not
    match, because ``#`` is not a word character.
    """
    entity_pattern = re.compile(r'&\w+;')
    return entity_pattern.sub(' ', data)

def remove_HTMLTag_data(data):
    """Apply ``remove_HTMLTag`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    without_entities = data_new['TEXT'].apply(remove_HTMLTag)
    data_new['TEXT'] = without_entities
    return data_new['TEXT']

def lemmatize(data):
    """Return a list holding the lemmatized form of each item in ``data``.

    A fresh ``WordNetLemmatizer`` is created on every call and its
    ``lemmatize`` method is invoked with one argument per item. ``data`` is
    iterated directly, so passing a plain string processes it character by
    character.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, data))

def lemmatize_data(data):
    """Apply ``lemmatize`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    lemmatized = data_new['TEXT'].apply(lemmatize)
    data_new['TEXT'] = lemmatized
    return data_new['TEXT']

def stemming(data):
    """Return a list holding the Porter stem of each item in ``data``.

    A fresh ``PorterStemmer`` is created on every call. ``data`` is iterated
    directly, so passing a plain string processes it character by character.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, data))
    
def stemming_data(data):
    """Apply ``stemming`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    stemmed = data_new['TEXT'].apply(stemming)
    data_new['TEXT'] = stemmed
    return data_new['TEXT']

def remove_username_func(data):
    """Return ``data`` without @mentions, ``#`` characters and digit runs.

    Three things are deleted: an ``@`` followed by word characters, every
    ``#`` character (the text after it is kept), and every run of digits.
    Surrounding whitespace is left untouched.
    """
    noise_pattern = re.compile(r'\@\w+|\#|\d+')
    return noise_pattern.sub('', data)

def remove_username(data):
    """Apply ``remove_username_func`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    without_handles = data_new['TEXT'].apply(remove_username_func)
    data_new['TEXT'] = without_handles
    return data_new['TEXT']

def remove_words_func(data):
    """Return ``data`` with every standalone word of one to three characters removed.

    A word here is a run of word characters bounded by ``\\b`` on both sides.
    Only the word itself is deleted, so the whitespace around it remains.
    """
    short_word_pattern = re.compile(r'\b\w{1,3}\b')
    return short_word_pattern.sub('', data)

def remove_words(data):
    """Apply ``remove_words_func`` to every entry of ``data_new['TEXT']`` in place.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    without_short_words = data_new['TEXT'].apply(remove_words_func)
    data_new['TEXT'] = without_short_words
    return data_new['TEXT']

def remove_repeated_words(data):
    """Collapse immediately repeated words in ``data_new['TEXT']`` to one occurrence.

    A run such as ``"so so so tired"`` becomes ``"so tired"``: the pattern
    matches a word followed by one or more whitespace-separated copies of
    itself and keeps only the first copy.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would turn this step into a silent no-op.
    data_new['TEXT'] = data_new['TEXT'].str.replace(
        r'\b(\w+)(\s+\1)+\b', r'\1', regex=True
    )
    return data_new['TEXT']

def remove_white_spaces(data):
    """Strip leading and trailing whitespace from every entry of ``data_new['TEXT']``.

    Whitespace inside an entry is not changed.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    def _trim(text):
        return text.strip()

    data_new['TEXT'] = data_new['TEXT'].apply(_trim)
    return data_new['TEXT']

In [1134]:
def spell_correction(data):
    """Run a ``Speller`` over every whitespace-separated word of ``data_new['TEXT']``.

    Each entry is split on whitespace, every word is passed through a
    ``Speller()`` instance built with its default arguments, and the results
    are re-joined with single spaces.

    The ``data`` argument is accepted but not used; the function always reads
    from and writes back to the module-level ``data_new`` frame.

    Returns the updated ``data_new['TEXT']`` column.
    """
    checker = Speller()

    def _fix_sentence(sentence):
        return ' '.join(map(checker, sentence.split()))

    data_new['TEXT'] = list(map(_fix_sentence, data_new['TEXT']))
    return data_new['TEXT']

## Main

In [1135]:
def main():
    """Run every preprocessing helper, in order, over the module-level ``data_new`` frame.

    Each helper reads and overwrites ``data_new['TEXT']`` and returns that
    column; none of them uses the argument it receives. The value threaded
    through the loop therefore only satisfies their one-parameter signatures,
    and the real hand-off between steps happens through ``data_new``.

    The steps run in the order listed in ``pipeline``; that order determines
    the result and must not be rearranged casually (for example, the
    string-based steps after ``lower`` rely on it having converted each entry
    to a string).

    Prints the final ``data_new['TEXT']`` column and returns ``None``.
    """
    pipeline = (
        remove_URL_data,
        remove_HTMLTag_data,
        remove_username,
        spell_correction,
        tokenize,
        lemmatize_data,
        stemming_data,
        lower,
        remove_punctuation_tokens,
        remove_stopwords,
        remove_words,
        remove_repeated_words,
        remove_white_spaces,
    )

    text = data_new['TEXT']
    for step in pipeline:
        text = step(text)
    print(text)

In [1136]:
# Run the pipeline: this overwrites data_new['TEXT'] in place and prints the processed column.
main()

0                                            thread scare
1                              like sean mango  yesterday
2       work show   sooooooooooo tire sparrow sign cowboy
3       actual start  afternoon  someth  slow process ...
4                            worry  vote  stop  love much
                              ...                        
4282                                  perform  test shock
4283                        true blood episod demand onli
4284                 return forest sarah merci lost  wood
4285                          proud   piec work keep papa
4286    woke    pizza breakfast also dentist appoint  ...
Name: TEXT, Length: 4287, dtype: object


In [1139]:
# data_new now holds the processed text, because the helpers run by main() write back to data_new['TEXT'].
data_new.head(5)

Unnamed: 0,LABEL,TEXT
0,0,thread scare
1,1,like sean mango yesterday
2,1,work show sooooooooooo tire sparrow sign cowboy
3,1,actual start afternoon someth slow process ...
4,1,worry vote stop love much


## Storing the data in a CSV file for further use

In [1140]:
# Persist the processed frame without the index column; the 'utf-8-sig' codec
# writes a UTF-8 byte-order mark at the start of the file.
# NOTE(review): the filename spells "datset" -- confirm whether downstream code
# expects this exact spelling before renaming.
data_new.to_csv('A1_datset_processed.csv',encoding='utf-8-sig', index=False) 