# CMPE 297: Homework 1: Regular expressions, text normalization, and edit distance

The parts that you need to complete are marked as Exercises.

## Part 0: Initialization & Setup

In [None]:
# importing required libraries
import re
import nltk
from nltk.corpus import movie_reviews
import string
import pandas as pd
from nltk.corpus import stopwords

nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True



## Part 1: Regular Expressions

### Extracting license plate numbers, IDs, emails and mailing addresses from a document


#### Document creation

In [None]:
sentence = 'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'
sentence

'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

Extracting license plate numbers

In [None]:
# The format of license plate number is a digit then 2 or 3 letters (one of which can be a "-"), and then 3 digits

regex = re.compile(r'(\d{1}[A-Za-z-]{2,3}\d{3})')
lincense_plate_numbers = regex.findall(sentence)
lincense_plate_numbers

['4XUI302', '3A-278']

### Exercise 1-1: Extract the ID numbers from the document.

In [None]:
# The format of the IDs is one character/letter and then 6 digits
regex = re.compile(r'(\w{1}\d{6})')
ids = regex.findall(sentence)
ids

['J987492']

### Exercise 1-2: Extract the email IDs from the document

In [None]:
regex = re.compile(r'([\d\w+.]*[@]+[\w\d]+[.]+[a-z]*)')
emails = regex.findall(sentence)
emails

['myemail123+spam@google.cg', 'jane.doe@sjsu.edu']

### Exercise 1-3: Extract the mailing address from the document

In [None]:
regex = re.compile(r'([0-9 ]{3}[A-Za-z ]*[,]{1}[A-Za-z0-9 ]*[,]{1}[\w ]*)')
mailing_address = regex.findall(sentence)
mailing_address

['123 Main street, San Jose, CA']

### Exercise 1-4: Anonymize the license plate numbers by replacing them with the text "LP_NUM"

The re.sub function is described here: https://docs.python.org/3/library/re.html

In [None]:
# Now replacing license plate numbers with the string "LP_NUM"
sentence_modified = re.sub(r'(\d{1}[A-Za-z-]{2,3}\d{3})',r'LP_NUM',sentence)
sentence_modified

'I am 20 years old. My previous license plate number was LP_NUM and my new one is LP_NUM. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

### Exercise 1-5: Replace the ID numbers with the text "ID_NUM"

In [None]:
sentence_modified = re.sub(r'(\w{1}\d{6})',r'ID_NUM',sentence)
sentence_modified

'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is ID_NUM and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

## Part 2: Text Processing

Count the number of words in the movie_reviews dataset (dataset uploaded in the beginning of this notebook under "Part 0: Initialization and Setup")

In [None]:
# print number of words in the movie review dataset
len(movie_reviews.words())

1583820

Load the standard list of punctuation marks

In [None]:
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Remove punctation from movie reviews


In [None]:
words_wo_puncts = [x for x in movie_reviews.words() if x not in punctuations]
len(words_wo_puncts)

1338788

Count the number of unique words

In [None]:
unique_words = set(words_wo_puncts)
len(unique_words)

39737

Find the 20 most frequent words in the dataset

In [None]:
# top 20 highest freq words
pd.Series(words_wo_puncts).value_counts()[:20]

the     76529
a       38106
and     35576
of      34123
to      31937
is      25195
in      21822
s       18513
it      16107
that    15924
as      11378
with    10792
for      9961
his      9587
this     9578
film     9517
i        8889
he       8864
but      8634
on       7385
dtype: int64

Load the standard list of stopwords

In [None]:
# getting english stopwords
eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

Count the number of stopwords

In [None]:
len(eng_stopwords)

179

### Exercise 2-1: Remove the stopwords from the dataset (similarly to how we removed punctuation above)

In [None]:
words_wo_puncts_stopwords = [x for x in words_wo_puncts if (x not in eng_stopwords)]
len(words_wo_puncts_stopwords)

710578

### Exercise 2-2: Find the number of uniques words in the dataset now that the stop words have been removed

In [None]:
# unique words without stopwords
unique_words = set(words_wo_puncts_stopwords)
len(unique_words)

39586

### Exercise 2-3: Find the top 20 highest frequency words now that we have removed the stopwords

In [None]:
# top 20 highest freq words after removing stopwords
pd.Series(words_wo_puncts_stopwords).value_counts()[:20]
# pd.Series(words_wo_puncts).value_counts()[:20]



film          9517
one           5852
movie         5771
like          3690
even          2565
time          2411
good          2411
story         2169
would         2109
much          2049
character     2020
also          1967
get           1949
two           1911
well          1906
characters    1859
first         1836
--            1815
see           1749
way           1693
dtype: int64

Find the words that are used only once in the corpus (and print the first few).  

In [None]:
# 20 words that are used only once in corpus using hapaxes() function
nltk.FreqDist(words_wo_puncts_stopwords).hapaxes()[:20]

['looooot',
 'schnazzy',
 'timex',
 'indiglo',
 'jessalyn',
 'gilsig',
 'ruber',
 'jaleel',
 'balki',
 'wavers',
 'statistics',
 'snapshot',
 'guesswork',
 'maryam',
 'daylights',
 'terraformed',
 'stagnated',
 'napolean',
 'millimeter',
 'enmeshed']

### Exercise 2-4: Use the PorterStemmer to stem the words in the dataset.

Display the first few words.

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = words_wo_puncts_stopwords[1:200]
stemmedwords = []

for w in words:
  print(w,"---",ps.stem(w))

two --- two
teen --- teen
couples --- coupl
go --- go
church --- church
party --- parti
drink --- drink
drive --- drive
get --- get
accident --- accid
one --- one
guys --- guy
dies --- die
girlfriend --- girlfriend
continues --- continu
see --- see
life --- life
nightmares --- nightmar
deal --- deal
watch --- watch
movie --- movi
sorta --- sorta
find --- find
critique --- critiqu
mind --- mind
fuck --- fuck
movie --- movi
teen --- teen
generation --- gener
touches --- touch
cool --- cool
idea --- idea
presents --- present
bad --- bad
package --- packag
makes --- make
review --- review
even --- even
harder --- harder
one --- one
write --- write
since --- sinc
generally --- gener
applaud --- applaud
films --- film
attempt --- attempt
break --- break
mold --- mold
mess --- mess
head --- head
lost --- lost
highway --- highway
memento --- memento
good --- good
bad --- bad
ways --- way
making --- make
types --- type
films --- film
folks --- folk
snag --- snag
one --- one
correctly --- correc

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = words_wo_puncts_stopwords
stemmedwords = []

for w in words:
  # print(w,"---",ps.stem(w))
#   dis = ps.stem(w)
  stemmedwords.append(ps.stem(w))

diststemwords = set(stemmedwords)
len(diststemwords)

26101

### Exercise 2-5: Use the

---

WordNetLemmatizer to lemmatize the words in the dataset.

Display the first few words.

In [None]:
from nltk import WordNetLemmatizer

wl = WordNetLemmatizer()
wl_dataset = words_wo_puncts_stopwords[1:200]
wl_dataset

distinct_wl = []

for w in wl_dataset:
  print(w,"-->",wl.lemmatize(w))

two --> two
teen --> teen
couples --> couple
go --> go
church --> church
party --> party
drink --> drink
drive --> drive
get --> get
accident --> accident
one --> one
guys --> guy
dies --> dy
girlfriend --> girlfriend
continues --> continues
see --> see
life --> life
nightmares --> nightmare
deal --> deal
watch --> watch
movie --> movie
sorta --> sorta
find --> find
critique --> critique
mind --> mind
fuck --> fuck
movie --> movie
teen --> teen
generation --> generation
touches --> touch
cool --> cool
idea --> idea
presents --> present
bad --> bad
package --> package
makes --> make
review --> review
even --> even
harder --> harder
one --> one
write --> write
since --> since
generally --> generally
applaud --> applaud
films --> film
attempt --> attempt
break --> break
mold --> mold
mess --> mess
head --> head
lost --> lost
highway --> highway
memento --> memento
good --> good
bad --> bad
ways --> way
making --> making
types --> type
films --> film
folks --> folk
snag --> snag
one --> on

In [None]:
from nltk import WordNetLemmatizer

wl = WordNetLemmatizer()
wl_dataset = words_wo_puncts_stopwords
wl_dataset

distinct_wl = []

for w in wl_dataset:
  # print(w,"-->",wl.lemmatize(w))
  distinct_wl.append(wl.lemmatize(w))

lemmawords =set(distinct_wl)
len(lemmawords)

35172

### Exercise 2-6:
a) How many unique words are there once stemming is applied? (show the that performs the computation and outputs the result)

b) How many unique words are there once lemmatization is applied? (show the code that performs the computation and outputs the result)

***Above Codes have found all the distinct words in Stem and Lemma***

In [None]:
print("Distinct Stem Words are:",len(diststemwords))
print("Distinct Lemma Words are:",len(lemmawords))


Distinct Stem Words are: 26101
Distinct Lemma Words are: 35172


## Part 3. Tokenization

### Exercise 3-1: Use the Penn Tree Bank tokenizer to tokenize the sentence

---

below

Print the tokens that the tokenizer produces.

In [13]:
from nltk.tokenize import TreebankWordTokenizer
s = 'Please pay $100.55 to settle your bill.  Send confirmation to confirm@gmail.com.'

tokens = TreebankWordTokenizer().tokenize(s)

# Print the tokens
print(tokens)

['Please', 'pay', '$', '100.55', 'to', 'settle', 'your', 'bill.', 'Send', 'confirmation', 'to', 'confirm', '@', 'gmail.com', '.']


## Part 4: Levenshtein Distance & Alignment

Relevant nltk documentation: https://www.nltk.org/api/nltk.metrics.distance.html

### Exercise 4-1: Use the nltk functions edit_distance to compute the Levenshtein edit-distance between the strings "intention" and "execution"

In [39]:
from nltk.metrics.distance import edit_distance

w1 = "intention"
w2 = "execution"

# help (edit_distance)
dist = edit_distance(w1,w2)
dist

5

### Exercise 4-2: Use the nltk function edit_distance_align to compute the minimum Levenshtein edit-distance based alignment mapping between the two strings "intention" and "execution"

In [47]:
from nltk.metrics.distance import edit_distance_align

w1 = "intention"
w2 = "execution"

# help (edit_distance)
dist2 = edit_distance_align(w1,w2)
dist2

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (6, 6),
 (7, 7),
 (8, 8),
 (9, 9)]