# II. Text Preprocessing

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.metrics.distance  import edit_distance
from nltk.corpus import words
import os
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.corpus import words
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance  import edit_distance
from nltk.util import ngrams

In [49]:
data=pd.read_csv('A1_dataset.csv')

In [50]:
data.head(5)

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...


## Helper Functions

## URL and HTML Tag Removal

In [51]:
def remove_URL(tokens):
    return re.sub(r'http\S+','', tokens)
#print(remove_URL("worked on my car after work. showering then going to bed. sooooooooooo tired. sparrow signing out  &lt;Cowboy Up&gt;"))

In [52]:
def remove_HTMLTag(tokens):
    return re.sub(r'&\w+;','', tokens)
print(remove_HTMLTag("@nakulshenoy Lol, that and &quot;twiiter killed the blogger&quot; are far apart. Btw, what is &quot;blogging in the traditional sense&quot; may i know? "))

@nakulshenoy Lol, that and twiiter killed the blogger are far apart. Btw, what is blogging in the traditional sense may i know? 


## Word Tokenization

In [53]:
def tokenization(text):
     return (word_tokenize(text.lower()))

## Stemming

In [54]:
def stemming_tokens(tokens):
        # Porter stemmer
        ps = PorterStemmer()
        stem_tokens = [ps.stem(x) for x in tokens]
        
        return stem_tokens

## Lemmatization

In [55]:
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatize_tokens = [lemmatizer.lemmatize(x) for x in tokens]
        
    return lemmatize_tokens

## Remove Puntuations

In [56]:
def remove_punctuation_tokens(tokens):
    tokens_sans_punctuation = [x.translate(str.maketrans('', '', string.punctuation)) for x in tokens]
    return tokens_sans_punctuation 

## Remove Whitespaces

In [57]:
def remove_white_spaces_tokens(tokens):
    tokenized = re.findall(r'(\w+)', tokens)
    return tokenized

## Spelling Correction

In [58]:
def spelling_correction(tokens):
    txt = TextBlob(tokens)
    return txt.correct()
#print(spelling_correction("@Smithycurt NOBBBBBYYY NOBBY NOB ROTTY  "))

## Remove Stop words

In [120]:
def remove_stopwords(tokens):
    stopwords = {'is','s','am','or','who','as','from','him','each','the','themselves','until','below','are','we','these','your','too','to','still','now','with','me','actually','this','i','will','have','by','it','be','a','of','off','on','many','something','just','got','about','go','get','and','his','like','his','some','her','hers','ourselves','but','again','between','yourself','there','once','during','out','very','having','they','own','an','be','for','do','its','it','yours','such','into','who','as','from','him','each','themselves','until','below','are','we','these','your','his','through','nor','me','were','her','more','himself','this','down','should','their','while','above','both','up','ours','she','all','no','after','before','few','how','further','here','than','doing','if','iff','theirs','my','against','whom','over','why','so','can','did','not','does','don','t','myself','been','same','under'}
    res  = [word for word in re.split("\W+",str(tokens)) if word.lower() not in stopwords]
    return res
#print(remove_stopwords(['just', 'got', 'out', 'the', 'hot', 'tub', 'about', 'to', 'go', 'get', 'a', 'movie', 'and', 'ice', 'cream', 'with', 'mt', 'll', 'his', 'and', 'call', 'it', 'a', 'night', 'nothing', 'like', 'some', 'his', 'time'])
#)

## Main function

In [125]:
def main():
    text = input("Enter your text: ")
   
    task1 = remove_URL(text)
    print("\n Remove URL \n")
    print(task1)
    
    task2 = remove_HTMLTag(task1)
    print("\n Remove HTML Tag \n")
    print(task2)
    
    task3 = spelling_correction(str(task2))
    print("\n Spelling Correction \n")
    print(task3)
    
    task4 = tokenization(str(task3))
    print("\n Tokenization \n")
    print(task4)
    
    task5 = remove_punctuation_tokens(task4)
    print("\n Remove Puntuations \n")
    print(task5) 
   
    task6 = remove_stopwords((task5))
    print("\n Remove stopwords \n")
    print(task6)
    
    task7 = remove_white_spaces_tokens(str(task6))
    print("\n Remove extra white spaces \n")
    print(task7)
 
    task8 = stemming_tokens(task7)
    print("\n Stemming \n")
    print(task8)
    
    task9 = lemmatize_tokens(task8)
    print("\n Lemmatization \n")
    print(task9)

In [126]:
main()

Enter your text: just got out the hot tub....about to go get a movie and ice cream with mt lil sis and call it a night!!!  nothing like some sis time!!! 

 Remove URL 

just got out the hot tub....about to go get a movie and ice cream with mt lil sis and call it a night!!!  nothing like some sis time!!! 

 Remove HTML Tag 

just got out the hot tub....about to go get a movie and ice cream with mt lil sis and call it a night!!!  nothing like some sis time!!! 

 Spelling Correction 

just got out the hot tub....about to go get a movie and ice cream with mt ll his and call it a night!!!  nothing like some his time!!! 

 Tokenization 

['just', 'got', 'out', 'the', 'hot', 'tub', '....', 'about', 'to', 'go', 'get', 'a', 'movie', 'and', 'ice', 'cream', 'with', 'mt', 'll', 'his', 'and', 'call', 'it', 'a', 'night', '!', '!', '!', 'nothing', 'like', 'some', 'his', 'time', '!', '!', '!']

 Remove Puntuations 

['just', 'got', 'out', 'the', 'hot', 'tub', '', 'about', 'to', 'go', 'get', 'a', 'movi

In [148]:
#pip install textblob