In [1]:
import nltk

In [2]:
# nltk.download() will prompt a new window to open - we download all available options in this pop up

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# all components have been installed - we can view functionality of nltk with dir()

dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGraph',
 'Depen

In [4]:
# important funtions include:

# pos_tag handles parts of speech tagging

# tokenize will take a text file and split it into words - we will make our own function to tokenize

In [5]:
# let's look at nltk's stop words

from nltk.corpus import stopwords

In [6]:
# need to specify that we want the english stop words

stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Reading in data

Text files are often unstructured, meaning that it could be binary data, might not have delimiters, and poor formatting (e.g. no indication of rows)

To show how to use this data, we will load Spam Email data from the UCI Machine Learning Repository

This is semi-structured data with labels for spam or ham

In [7]:
# start by opening file and reading it - this assumes no previous knowledge of text file structure

rawData = open("SMSSpamCollection.tsv").read()

In [9]:
# let's get a first look at the data before determining next steps - showing first 500 characters

rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [10]:
# there are a few diff ways we could import this data with proper formatting

# we will start by replacing tabs (\t) with line breaks (\n) and splitting into a list on \n

parsedData = rawData.replace('\t', '\n').split('\n')

In [11]:
# the data now alternates between the labels and the messages

parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [12]:
# create new label and text lists with every other item from parsedData

# indexing says go from 0 to end in increments of 2 for labels and start from 1 for text messages

labelList = parsedData[0::2]

textList = parsedData[1::2]

In [13]:
labelList[0:5]

['ham', 'spam', 'ham', 'ham', 'ham']

In [15]:
textList[0:5]

["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 "Nah I don't think he goes to usf, he lives around here though",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 'I HAVE A DATE ON SUNDAY WITH WILL!!']

In [16]:
# next step is importing pandas

import pandas as pd

In [17]:
# create dataframe called fullCorpus

# for data, we pass in a dictionary where keys are names of columns and values are data in columns

fullCorpus = pd.DataFrame({'label':labelList, 'text':textList})

ValueError: arrays must all be same length

In [18]:
# showing error bc they are not the same length - lets see where issue is

len(labelList)

5571

In [19]:
len(textList)

5570

In [20]:
# label list has one extra entry - likely something got messed up at the end - let's see what the final values are

labelList[-5:]

['ham', 'ham', 'ham', 'ham', '']

In [21]:
# final value was empty - need to remove last value in labelList

labelList.pop()

''

In [22]:
# trying DataFrame again - works now

fullCorpus = pd.DataFrame({'label':labelList, 'text':textList})

In [23]:
# we can look at the first 5 values with head and see that our dataset is ready

fullCorpus.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [24]:
# this goes over a general process for any text file reading, but pd.read_csv() makes process easier for this example

# we specify the separator as tabs and the header as None (there are no headers in this data)

dataset = pd.read_csv("SMSSpamCollection.tsv", sep = '\t', header = None)

In [25]:
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


# Data Exploration

In [27]:
# we can pass in a list for the column names

dataset.columns = ['label', 'text']

In [28]:
dataset.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [32]:
# what is the shape of the dataset?

dataset.shape

(5568, 2)

In [33]:
print(f"The data has {len(dataset)} rows and {len(dataset.columns)} columns")

The data has 5568 rows and 2 columns


In [34]:
# how many ham and spam values are there?

dataset['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

In [38]:
print(f"Out of {len(dataset)} rows, there are {len(dataset[dataset['label'] == 'ham'])} ham messages.")

Out of 5568 rows, there are 4822 ham messages.


In [43]:
# How much missing data is there?

dataset['label'].isnull().sum()

0

In [44]:
dataset['text'].isnull().sum()

0

In [45]:
# no missing data - good to go

# Regular Expressions (Regex)

Regex is a text string for describing a certain search pattern

For example, `nlp` will search for this string in a body of text

Using `[j-q]` would also identify the characters in the string and any other letters between j and q in the alphabet (e.g. k, m, o)

Using `[j-q]+` would require that the letters between j and q would have to be followed by at least 1 character between j and q

`[0-9]+` would return all numbers that are more than 10 as they are presented (e.g. year 2019 instead of 2,0,1,9)

`[j-q0-9]+` would return sequences of characters between j and q and any numbers that show up

Regex are often used to create some structure in a text document that may not currently have any structure

Regex can be used to:

-- identify whitespace or tell Python where to split up a sentence

-- identify and create delimiters or end of line escape characters

-- remove punctuations or numbers from text

-- clean HTML tags from text

-- identify text patterns of interest 

Real world use cases include:

-- confirming passwords meet criteria

-- searching URL for some substring

-- searching for files on your computer

-- document scraping

In [46]:
# Python's re package is the most commonly used regex resource

# We can demonstrate this with different strings

import re

re_test = 'This is a made up string to test 2 different regex methods'
re_test_messy = 'This      is a made up     string to test 2    different regex methods'
re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

In [47]:
# splitting a sentence into a list of words

# \s tells python to look for a single whitespace to split the string

re.split('\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [48]:
# trying on second example - not quite as clean as previous string bc of extra whitespace

re.split('\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [49]:
# we will add a + to tell python to look for one or more whitespaces

re.split('\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [51]:
# third example - nothing gets split at all because there are no whitespaces

re.split('\s+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [52]:
# we replace \s+ with \W+ to tell python to search for any sequences of non-word characters

re.split('\W+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [54]:
# the re.findall() method works similar to split but will search for all instances of a specific word

# \S+ will flip \s+ and look for all non-whitespace characters

re.findall('\S+', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [55]:
# the same code works on the first messy dataset as well

re.findall('\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [57]:
# third example - we need to change \W+ from re.split() to \w+ in re.findall()

# this searches for all word characters instead of all non-word characters

re.findall('\w+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

**Main takeaways:**

-- There are two useful methods in re for tokenizing: re.split() and re.findall()

-- re.split() will split words at a certain split value (based on regex)

-- re.findall() will find all words that match a certain criteria (based on regex)

-- Anything based on \w is based on words (or \W for non-words)

-- Anything based on \s is based on whitespace (or \S for non-spaces)

# replacing a specific string value

In [58]:
# say we want to specify that PEP8 is a style guideline

# however, we may have made some spelling errors

# here are three versions of code that may need to be fixed

# the process of replacing must account for a pattern that encompasses PEP8 and the spelling errors

pep8_test = 'I try to follow PEP8 guidelines'
pep7_test = 'I try to follow PEP7 guidelines'
peep8_test = 'I try to follow PEEP8 guidelines'

In [59]:
# starting with version that has no typos

re.findall('[a-z]+', pep8_test)

['try', 'to', 'follow', 'guidelines']

In [60]:
# this only returns values that are lower case - need to expand to include upper case characters too

re.findall('[A-Za-z]+', pep8_test)

['I', 'try', 'to', 'follow', 'PEP', 'guidelines']

In [61]:
# to specifically find PEP8, we need to specify a string of upper case characters followed by 1 or more numbers

re.findall('[A-Z]+[0-9]+', pep8_test)

['PEP8']

In [62]:
# trying on second example

re.findall('[A-Z]+[0-9]+', pep7_test)

['PEP7']

In [63]:
# trying on third example

re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

In [64]:
# now that we have found the right regex, we need to replace the tokens with our desired values

# we will use re.sub() to search for a pattern and replace it with another value

re.sub('[A-Z]+[0-9]+', repl = "PEP8 Python Style Guide", string = pep8_test)

'I try to follow PEP8 Python Style Guide guidelines'

In [65]:
# trying on second example

re.sub('[A-Z]+[0-9]+', repl = "PEP8 Python Style Guide", string = pep7_test)

'I try to follow PEP8 Python Style Guide guidelines'

In [66]:
# trying on third example

re.sub('[A-Z]+[0-9]+', repl = "PEP8 Python Style Guide", string = peep8_test)

'I try to follow PEP8 Python Style Guide guidelines'

### Other examples of regex methods

- re.search()
- re.match()
- re.fullmatch()
- re.finditer()
- re.escape()

### ML Pipeline Process

1. Start with raw text (model cannot distinguish words)

2. Tokenize words (tell model what to look at)

3. Clean text - remove stop words, punctuation, stemming, etc.

4. Vectorize - convert to a numeric matrix format that Python can use to build a model

5. Fit/train model with a machine learning algorithm using training data

## Focusing on steps 2 and 3 of pipeline now (tokenizing and cleaning text)

During text cleaning, the following four steps are usually accomplished:

1. Remove punctuation

2. Tokenization

3. Remove stop words

4. Lemmatize/Stem

There are packages that can do pretty much all of this for us, but we will build them to show how they work

In [68]:
# first going to set view for how many characters we can see in a pandas dataframe and reread in data

pd.set_option('display.max_colwidth', 100)

data = pd.read_csv("SMSSpamCollection.tsv", sep = '\t', header = None)

data.columns = ['label', 'text']

data.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [69]:
# what does the clean data look like?

clean_data = pd.read_csv("SMSSpamCollection_cleaned.tsv", sep = '\t')

clean_data.head()

Unnamed: 0,label,body_text,body_text_nostop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"['ive', 'searching', 'right', 'words', 'thank', 'breather', 'promise', 'wont', 'take', 'help', '..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","['nah', 'dont', 'think', 'goes', 'usf', 'lives', 'around', 'though']"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"['even', 'brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent']"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"['date', 'sunday']"


In [70]:
# first step is to remove punctuation - we will use string and re packages to assist us

import string

In [71]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [72]:
# create function to remove punctuation

def remove_punc(text):
    
    text_nopunc = [char for char in text if char not in string.punctuation]
    
    return text_nopunc

In [75]:
# we will use a lambda expression to apply this to our dataset in a new column

data['text_clean'] = data['text'].apply(lambda x: remove_punc(x))

In [76]:
data.head()

Unnamed: 0,label,text,text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[I, v, e, , b, e, e, n, , s, e, a, r, c, h, i, n, g, , f, o, r, , t, h, e, , r, i, g, h, t,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, , a, , w, k, l, y, , c, o, m, p, , t, o, , w,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[N, a, h, , I, , d, o, n, t, , t, h, i, n, k, , h, e, , g, o, e, s, , t, o, , u, s, f, ,..."
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[E, v, e, n, , m, y, , b, r, o, t, h, e, r, , i, s, , n, o, t, , l, i, k, e, , t, o, , s,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[I, , H, A, V, E, , A, , D, A, T, E, , O, N, , S, U, N, D, A, Y, , W, I, T, H, , W, I, L, L]"


In [78]:
# we have made it so that each letter is broken up in the list within text_clean

# we can fix this by wrapping the list in the punctuation function in a ''.join() function

def remove_punc(text):
    
    text_nopunc = ''.join([char for char in text if char not in string.punctuation])
    
    return text_nopunc

data['text_clean'] = data['text'].apply(lambda x: remove_punc(x))

data.head()

Unnamed: 0,label,text,text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


## Tokenizing words

In [79]:
def tokenize(text):
    
    tokens = re.split('\W+', text) # split where there are 1 or more non word characters
    
    return tokens

In [81]:
# we will do the same lambda expression in an apply method to create a new column for lower case tokenized words

data['text_tokenized'] = data['text_clean'].apply(lambda x: tokenize(x.lower()))

In [82]:
data.head()

Unnamed: 0,label,text,text_clean,text_tokenized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


## Eliminate stop words

In [83]:
# need to get stop words from nltk and remove them if they appear in the list

stopwords = nltk.corpus.stopwords.words('english')

In [84]:
def remove_stops(token_list):
    
    return [word for word in token_list if word not in stopwords]

In [85]:
# use lambda function to make new column 

data['text_nostops'] = data['text_tokenized'].apply(lambda x: remove_stops(x))

In [86]:
data.head()

Unnamed: 0,label,text,text_clean,text_tokenized,text_nostops
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
