* Lowercasing the data
* Remove Special Characters
* Remove Whitespace
* Tokenize text
* Remove stop words
* Lemmatization/Stemming

In [1]:
import nltk
import numpy as np
import pandas as pd
import re

from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
data = pd.read_csv('twitter_disaster_tweet_clean.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,emoji_count,hashtags,mentions
0,0,1,,,our deeds are the reason of this may allah for...,1,0,"('', 0)",[]
1,1,4,,,forest fire near la ronge sask canada,1,0,"('', 0)",[]
2,2,5,,,all residents asked to shelter in place are be...,1,0,"('', 0)",[]
3,3,6,,,people receive evacuation orders in california,1,0,"('', 0)",[]
4,4,7,,,just got sent this photo from ruby as smoke fr...,1,0,"('', 0)",[]


### Tokenization

In [4]:
def tokenize(text):
    split=re.split("\W+",text) 
    return split

data['tokenized_text']=data['text'].apply(lambda x: tokenize(x.lower()))

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,emoji_count,hashtags,mentions,tokenized_text
0,0,1,,,our deeds are the reason of this may allah for...,1,0,"('', 0)",[],"[our, deeds, are, the, reason, of, this, may, ..."
1,1,4,,,forest fire near la ronge sask canada,1,0,"('', 0)",[],"[forest, fire, near, la, ronge, sask, canada]"
2,2,5,,,all residents asked to shelter in place are be...,1,0,"('', 0)",[],"[all, residents, asked, to, shelter, in, place..."
3,3,6,,,people receive evacuation orders in california,1,0,"('', 0)",[],"[, people, receive, evacuation, orders, in, ca..."
4,4,7,,,just got sent this photo from ruby as smoke fr...,1,0,"('', 0)",[],"[just, got, sent, this, photo, from, ruby, as,..."


In [8]:
data['tokenized_text'][0]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 'may',
 'allah',
 'forgive',
 'us',
 'all']

In [9]:
data['text'][0]

'our deeds are the reason of this may allah forgive us all'

### Remove Stop Words

In [10]:
stopword = nltk.corpus.stopwords.words('english')
print(stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text

data['clean']=data['tokenized_text'].apply(lambda x: remove_stopwords(x))

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,emoji_count,hashtags,mentions,tokenized_text,clean
0,0,1,,,our deeds are the reason of this may allah for...,1,0,"('', 0)",[],"[our, deeds, are, the, reason, of, this, may, ...","[deeds, reason, may, allah, forgive, us]"
1,1,4,,,forest fire near la ronge sask canada,1,0,"('', 0)",[],"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,2,5,,,all residents asked to shelter in place are be...,1,0,"('', 0)",[],"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,3,6,,,people receive evacuation orders in california,1,0,"('', 0)",[],"[, people, receive, evacuation, orders, in, ca...","[, people, receive, evacuation, orders, califo..."
4,4,7,,,just got sent this photo from ruby as smoke fr...,1,0,"('', 0)",[],"[just, got, sent, this, photo, from, ruby, as,...","[got, sent, photo, ruby, smoke, pours, school, ]"


In [13]:
data['tokenized_text'][0]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 'may',
 'allah',
 'forgive',
 'us',
 'all']

In [14]:
data['clean'][0]

['deeds', 'reason', 'may', 'allah', 'forgive', 'us']

### Lemmatization / Stemming

Stemming and Lemmatizing is the process of reducing a word to its root form. The main purpose is to reduce variations of the same word, thereby reducing the corpus of words we include in the model. The difference between stemming and lemmatizing is that, stemming chops off the end of the word without taking into consideration the context of the word. Whereas, Lemmatizing considers the context of the word and shortens the word into its root form based on the dictionary definition. Stemming is a faster process compared to Lemmantizing. Hence, it a trade-off between speed and accuracy.

Let’s consider the word “belief” for example. The different variations of believe can be believing, believed, believes, and believe .

In [15]:
ps = PorterStemmer()

In [16]:
print(ps.stem('believe'))
print(ps.stem('believing'))
print(ps.stem('believed'))
print(ps.stem('believes'))

believ
believ
believ
believ


In [17]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer=nltk.stem.WordNetLemmatizer()

print(lemmatizer.lemmatize('believe'))
print(lemmatizer.lemmatize('believing'))
print(lemmatizer.lemmatize('believed'))
print(lemmatizer.lemmatize('believes'))

believe
believing
believed
belief


In [18]:
def lemmatize_text(word_list):
    lemmatized_output = [lemmatizer.lemmatize(w) for w in word_list]
    return lemmatized_output

data['clean_lemma']=data['clean'].apply(lambda x: lemmatize_text(x))

In [19]:
data.to_csv('twitter_disaster_tweet_preprocess', index=False)

In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

print(stopwords.words('english'))

# random sentecnce with lot of stop words
sample_text = "Oh man, this is pretty cool. We will do more such things."
text_tokens = word_tokenize(sample_text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]

print(text_tokens)
print(tokens_without_sw)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rheyannmagcalas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rheyannmagcalas\AppData\Roaming\nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Unzipping tokenizers\punkt.zip.
