In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
document1 = '''It’s become a familiar sight for the Chapman community: President Daniele 
Struppa standing on stage at the Musco Center, addressing a combined 
audience of university faculty, staff and university supporters.'''

document2 = '''What was different this year — as has been the case for most events this 
year — was that everyone in the audience was viewing the address from home.'''

doc1 = TextBlob(document1)
doc2 = TextBlob(document2)

### Problem 1

In [3]:
# Filter by list comprehension
doc1_words = doc1.words.lower()
doc2_words = doc2.words.lower()
puncs = set((',','.',';',':','--','-','!','—','?',':','`',"''", '(',')','[',']', '’'))

doc1_filtered = [word for word in doc1_words if word not in puncs]
doc2_filtered = [word for word in doc2_words if word not in puncs]
print(doc1_filtered, "\n")
print(doc2_filtered)

['it', 's', 'become', 'a', 'familiar', 'sight', 'for', 'the', 'chapman', 'community', 'president', 'daniele', 'struppa', 'standing', 'on', 'stage', 'at', 'the', 'musco', 'center', 'addressing', 'a', 'combined', 'audience', 'of', 'university', 'faculty', 'staff', 'and', 'university', 'supporters'] 

['what', 'was', 'different', 'this', 'year', 'as', 'has', 'been', 'the', 'case', 'for', 'most', 'events', 'this', 'year', 'was', 'that', 'everyone', 'in', 'the', 'audience', 'was', 'viewing', 'the', 'address', 'from', 'home']


In [4]:
# Filter by lambda
doc1_filtered2 = list(filter(lambda word: word not in puncs, doc1_words))
doc2_filtered2 = list(filter(lambda word: word not in puncs, doc2_words))
print(doc1_filtered2, "\n")
print(doc2_filtered2)

['it', 's', 'become', 'a', 'familiar', 'sight', 'for', 'the', 'chapman', 'community', 'president', 'daniele', 'struppa', 'standing', 'on', 'stage', 'at', 'the', 'musco', 'center', 'addressing', 'a', 'combined', 'audience', 'of', 'university', 'faculty', 'staff', 'and', 'university', 'supporters'] 

['what', 'was', 'different', 'this', 'year', 'as', 'has', 'been', 'the', 'case', 'for', 'most', 'events', 'this', 'year', 'was', 'that', 'everyone', 'in', 'the', 'audience', 'was', 'viewing', 'the', 'address', 'from', 'home']


### Problem 2

In [5]:
from nltk.tokenize import word_tokenize
tokenize_list = [word_tokenize(word) for word in doc1_words if '’' not in word]
tokenize_lambda = [(lambda word: word_tokenize(word))(word) for word in doc1_words if '’' not in word]
print(tokenize_list[:10], "\n")
print(tokenize_lambda[:10])

[['it'], ['s'], ['become'], ['a'], ['familiar'], ['sight'], ['for'], ['the'], ['chapman'], ['community']] 

[['it'], ['s'], ['become'], ['a'], ['familiar'], ['sight'], ['for'], ['the'], ['chapman'], ['community']]


### Problem 3

In [6]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

doc1_sw_list = [word for word in doc1_words if word not in stop_words and '—' not in word and '’' not in word]
doc2_sw_list = [word for word in doc2_words if word not in stop_words and '—' not in word and '’' not in word]
print(doc1_sw_list, "\n")
print(doc2_sw_list)

['become', 'familiar', 'sight', 'chapman', 'community', 'president', 'daniele', 'struppa', 'standing', 'stage', 'musco', 'center', 'addressing', 'combined', 'audience', 'university', 'faculty', 'staff', 'university', 'supporters'] 

['different', 'year', 'case', 'events', 'year', 'everyone', 'audience', 'viewing', 'address', 'home']


In [7]:
doc1_sw_lambda = list(filter(lambda word: word not in stop_words and '-' not in word and '’' not in word, doc1_words))
doc2_sw_lambda = list(filter(lambda word: word not in stop_words and '—' not in word and '’' not in word, doc2_words))
print(doc1_sw_lambda, "\n")
print(doc2_sw_lambda)

['become', 'familiar', 'sight', 'chapman', 'community', 'president', 'daniele', 'struppa', 'standing', 'stage', 'musco', 'center', 'addressing', 'combined', 'audience', 'university', 'faculty', 'staff', 'university', 'supporters'] 

['different', 'year', 'case', 'events', 'year', 'everyone', 'audience', 'viewing', 'address', 'home']


### Problem 4

In [8]:
doc1_stems_list = [word.stem() for word in doc1_words if '’' not in word]
doc2_stems_list = [word.stem() for word in doc2_words if '—' not in word]
print(doc1_stems_list, "\n")
print(doc2_stems_list)

['it', 's', 'becom', 'a', 'familiar', 'sight', 'for', 'the', 'chapman', 'commun', 'presid', 'daniel', 'struppa', 'stand', 'on', 'stage', 'at', 'the', 'musco', 'center', 'address', 'a', 'combin', 'audienc', 'of', 'univers', 'faculti', 'staff', 'and', 'univers', 'support'] 

['what', 'wa', 'differ', 'thi', 'year', 'as', 'ha', 'been', 'the', 'case', 'for', 'most', 'event', 'thi', 'year', 'wa', 'that', 'everyon', 'in', 'the', 'audienc', 'wa', 'view', 'the', 'address', 'from', 'home']


In [9]:
stem_lambda = lambda word:word.stem()
doc1_stems_lambda = [(stem_lambda)(word) for word in doc1_words if '’' not in word]
doc2_stems_lambda = [(stem_lambda)(word) for word in doc2_words if '—' not in word]
print(doc1_stems_lambda, "\n")
print(doc2_stems_lambda)

['it', 's', 'becom', 'a', 'familiar', 'sight', 'for', 'the', 'chapman', 'commun', 'presid', 'daniel', 'struppa', 'stand', 'on', 'stage', 'at', 'the', 'musco', 'center', 'address', 'a', 'combin', 'audienc', 'of', 'univers', 'faculti', 'staff', 'and', 'univers', 'support'] 

['what', 'wa', 'differ', 'thi', 'year', 'as', 'ha', 'been', 'the', 'case', 'for', 'most', 'event', 'thi', 'year', 'wa', 'that', 'everyon', 'in', 'the', 'audienc', 'wa', 'view', 'the', 'address', 'from', 'home']


### Problem 5

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
corpus = [document1, document2]

In [11]:
X = cv.fit(corpus)
words = X.get_feature_names()
X = cv.transform(corpus)
df = pd.DataFrame(X.toarray(), columns = words).T
df.columns = ['Doc1', 'Doc2']
df.index.name = 'Words'
df.head(10)

Unnamed: 0_level_0,Doc1,Doc2
Words,Unnamed: 1_level_1,Unnamed: 2_level_1
address,0,1
addressing,1,0
and,1,0
as,0,1
at,1,0
audience,1,1
become,1,0
been,0,1
case,0,1
center,1,0


### Problem 6

In [13]:
cv = CountVectorizer(ngram_range=(2,3))

In [15]:
import string
import re 
import nltk
#nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [23]:
def clean_text_ngrams(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\W+', txt)
    txt = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return txt

In [27]:
dfCorpus = pd.DataFrame(corpus, columns = ['msg'])
dfCorpus['msg clean'] = dfCorpus['msg'].apply(lambda x: clean_text_ngrams(x))

In [29]:
X = cv.fit_transform(dfCorpus['msg clean'])
df = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())
df

Unnamed: 0,address combin,address combin audienc,address home,audienc univers,audienc univers faculti,audienc view,audienc view address,becom familiar,becom familiar sight,case event,...,univers faculti staff,univers support,view address,view address home,what differ,what differ year,year case,year case event,year everyon,year everyon audienc
0,1,1,0,1,1,0,0,1,1,0,...,1,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,1,0,0,1,...,0,0,1,1,1,1,1,1,1,1
