# NLP setup and overview

In [2]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
from nltk.corpus import stopwords

stopwords.words('english')[0:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

In [5]:
# Reading in text data
# Read in semi-structured text data

In [8]:
# Read in the raw text (file downloaded from kaggle and renamed https://www.kaggle.com/uciml/sms-spam-collection-dataset)
rawData = open("SMSSpamCollection.csv").read()
rawData[0:500]

'v1,v2,,,\nham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,\nham,Ok lar... Joking wif u oni...,,,\nspam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\'s apply 08452810075over18\'s,,,\nham,U dun say so early hor... U c already then say...,,,\nham,"Nah I don\'t think he goes to usf, he lives around here though",,,\nspam,"FreeMsg Hey there darling it\'s been'

In [13]:
parsedData = rawData.replace("ham,", "ham\t")
parsedData = parsedData.replace("spam,", "spam\t")
parsedData = parsedData.replace("\t", "\n").split("\n")
parsedData = parsedData[1:]
parsedData[:5]

['ham',
 '"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,',
 'ham',
 'Ok lar... Joking wif u oni...,,,',
 'spam']

In [14]:
labelList = parsedData[0::2]
textList = parsedData[1::2]
labelList[:5]

['ham', 'ham', 'spam', 'ham', 'ham']

In [15]:
textList[:5]

['"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,',
 'Ok lar... Joking wif u oni...,,,',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,",
 'U dun say so early hor... U c already then say...,,,',
 '"Nah I don\'t think he goes to usf, he lives around here though",,,']

In [23]:
import pandas as pd

fullCorpus = pd.DataFrame({'label': labelList,
                           'body_text': textList})
fullCorpus.head()

Unnamed: 0,label,body_text
0,ham,"""Go until jurong point, crazy.. Available only..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"""Nah I don't think he goes to usf, he lives ar..."


# Exploring the dataset

In [20]:
# What is the shape of the dataset?
print("Input data has {} rows and {} columns".format(len(fullCorpus), len(fullCorpus.columns)))

Input data has 5573 rows and 2 columns


In [21]:
# How many spam/ham are there?
print("Out of {} rows, {} are spam, {} are ham".format(len(fullCorpus),
                                                       len(fullCorpus[fullCorpus['label'] == 'spam']),
                                                       len(fullCorpus[fullCorpus['label'] == 'ham'])))

Out of 5573 rows, 382 are spam, 2497 are ham


In [24]:
# How much missing data is there
print("Number of null in label: {}".format(fullCorpus['label'].isnull().sum()))
print("Number of null in text: {}".format(fullCorpus['body_text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


# Learning how to use regular expressions

In [26]:
import re

re_test = 'This is made up string to test 2 different regex methods'
re_test_messy = 'This      is a made up       string to test 2     different regex methods'
re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods'

In [27]:
re.split('\s', re_test)

['This',
 'is',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [28]:
re.split('\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [29]:
re.split('\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [30]:
re.split('\s+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods']

In [31]:
re.split('\W+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [34]:
re.findall('\S+', re_test)

['This',
 'is',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [35]:
re.findall('\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [36]:
re.findall('\S+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods']

In [37]:
re.findall('\w+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

# Replace a specific string

In [38]:
pep8_test = 'I try to follow PEP8 guidelines'
pep7_test = 'I try to follow PEP7 guidelines'
peep8_test = 'I try to follow PEEP8 guidelines'

In [39]:
import re

re.findall('[a-z]+', pep8_test)

['try', 'to', 'follow', 'guidelines']

In [40]:
re.findall('[A-Z]+', pep8_test)

['I', 'PEP']

In [44]:
re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

In [48]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep7_test)

'I try to follow PEP8 Python Styleguide guidelines'

# Implementing a pipeline to clean text

## Remove punctuation

In [58]:
data = fullCorpus.copy()
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [57]:
"I like NLP." == "I like NLP"

False

In [62]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data['body_text_clean'] = data['body_text'].apply(lambda x: remove_punct(x))
data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,"Ok lar... Joking wif u oni...,,,",Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,"U dun say so early hor... U c already then say...,,,",U dun say so early hor U c already then say
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,",Nah I dont think he goes to usf he lives around here though


## Tokenization

In [63]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

data['body_text_tokenized'] = data['body_text_clean'].apply(lambda x: tokenize(x.lower()))
data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,"Ok lar... Joking wif u oni...,,,",Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,"U dun say so early hor... U c already then say...,,,",U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


In [64]:
'NLP' == 'nlp'

False

## Remove stop words

In [66]:
import nltk

stopword = nltk.corpus.stopwords.words('english')
stopword[1:30:500]

['me']

In [70]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

data['body_text_nostop'] = data['body_text_tokenized'].apply(lambda x: remove_stopwords(x))
data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nostop
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,",Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,"U dun say so early hor... U c already then say...,,,",U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
