### Warm-up Question 1

In [40]:
# regex for removing punctuation
import re
# nltk: useful text processing library 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

In [41]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\45098\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
text = "feet, foot, foots, footing"

cleaned = re.sub('\W+', ' ', text) #removing punctuation
tokenized = word_tokenize(cleaned) #breaking text into individual words

stemmer = PorterStemmer() 
stemmed = [stemmer.stem(token) for token in tokenized] #Stemming each word

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized] #Lemmatizating each word

In [39]:
print("Stemmed text:")
print(stemmed)

print("\nLemmatized text:")
print(lemmatized)

Stemmed text:
['feet', 'foot', 'foot', 'foot']

Lemmatized text:
['foot', 'foot', 'foot', 'footing']


### Warm-up Question 2

In [44]:
docA = "I never said the pandemic was a Hoax! Who would say such a thing? I said that the Do Nothing Democrats, together with their Mainstream Media partners, are the Hoax. They have been called out & embarrassed on this, even admitting they were wrong, but continue to spread the lie!"
docB = "The people that know me and know the history of our Country say that I am  the hardest working President in history. I don’t know about that, but I am a hard worker and have probably gotten more done in the first 3 1/2 years than any President in history. The Fake News hates it!"
docC = "Does anybody get the meaning of what a so-called Noble (not Nobel) Prize is, especially as it pertains to Reporters and Journalists? Noble is defined as, “having or showing fine personal qualities or high moral principles and ideals.” Does sarcasm ever work?"
bowA = word_tokenize(re.sub('\W+', ' ', docA)) #bow stands for bag of words
bowB = word_tokenize(re.sub('\W+', ' ', docB))
bowC = word_tokenize(re.sub('\W+', ' ', docC))
print("Document A: "+str(bowA))
print("Document B: "+str(bowB))
print("Document C: "+str(bowC))

Document A: ['I', 'never', 'said', 'the', 'pandemic', 'was', 'a', 'Hoax', 'Who', 'would', 'say', 'such', 'a', 'thing', 'I', 'said', 'that', 'the', 'Do', 'Nothing', 'Democrats', 'together', 'with', 'their', 'Mainstream', 'Media', 'partners', 'are', 'the', 'Hoax', 'They', 'have', 'been', 'called', 'out', 'embarrassed', 'on', 'this', 'even', 'admitting', 'they', 'were', 'wrong', 'but', 'continue', 'to', 'spread', 'the', 'lie']
Document B: ['The', 'people', 'that', 'know', 'me', 'and', 'know', 'the', 'history', 'of', 'our', 'Country', 'say', 'that', 'I', 'am', 'the', 'hardest', 'working', 'President', 'in', 'history', 'I', 'don', 't', 'know', 'about', 'that', 'but', 'I', 'am', 'a', 'hard', 'worker', 'and', 'have', 'probably', 'gotten', 'more', 'done', 'in', 'the', 'first', '3', '1', '2', 'years', 'than', 'any', 'President', 'in', 'history', 'The', 'Fake', 'News', 'hates', 'it']
Document C: ['Does', 'anybody', 'get', 'the', 'meaning', 'of', 'what', 'a', 'so', 'called', 'Noble', 'not', 'Nobe

In [12]:
### 1) Run the following code to construct a term-frequency matrix in a simple way
#First create a word set that contains all words in all documents
wordSet = set(bowA).union(set(bowB)).union(set(bowC))
#Second create dictionaries to keep my word counts.
wordDictA = dict.fromkeys(wordSet,0)
wordDictB = dict.fromkeys(wordSet,0)
wordDictC = dict.fromkeys(wordSet,0)
#Then count the words in each document
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1
for word in bowC:
    wordDictC[word] += 1    

In [14]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB, wordDictC])

Unnamed: 0,1,2,3,Country,Democrats,Do,Does,Fake,Hoax,I,...,was,were,what,with,work,worker,working,would,wrong,years
0,0,0,0,0,1,1,0,0,2,2,...,1,1,0,1,0,0,0,1,1,0
1,1,1,1,1,0,0,0,1,0,3,...,0,0,0,0,0,1,1,0,0,1
2,0,0,0,0,0,0,2,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [23]:
### 2) Complete the following code to construct the matrix using TF-IDF
#First define a function to compute the term frequency TF
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict

In [24]:
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)
tfBowC = computeTF(wordDictC, bowC)

In [29]:
#Second define a function to compute the inverse document frequency IDF
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    #counts the number of documents that contain a word w
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    #divide N by denominator above, take the log of that
    for word, val in idfDict.items():
        #Complete the code here
        idfDict[word] = math.log(N / float(val))
    
    return idfDict

In [30]:
idfs = computeIDF([wordDictA, wordDictB, wordDictC])

In [32]:
#Lastly, compute TF-IDF
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [39]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)
tfidfBowC = computeTFIDF(tfBowC, idfs)
pd.DataFrame([tfidfBowA, tfidfBowB, tfidfBowC])

Unnamed: 0,1,2,3,Country,Democrats,Do,Does,Fake,Hoax,I,...,was,were,what,with,work,worker,working,would,wrong,years
0,0.0,0.0,0.0,0.0,0.022421,0.022421,0.0,0.0,0.044841,0.01655,...,0.022421,0.022421,0.0,0.022421,0.0,0.0,0.0,0.022421,0.022421,0.0
1,0.019274,0.019274,0.019274,0.019274,0.0,0.0,0.0,0.019274,0.0,0.02134,...,0.0,0.0,0.0,0.0,0.0,0.019274,0.019274,0.0,0.0,0.019274
2,0.0,0.0,0.0,0.0,0.0,0.0,0.051098,0.0,0.0,0.0,...,0.0,0.0,0.025549,0.0,0.025549,0.0,0.0,0.0,0.0,0.0


### Warm-up Question 3

In [101]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [107]:
#In this question, we will use a dataset that contains posts from a data science forum
#Change the directory path here
posts = open('.../raw_forum_posts.dat', 'r').read()

In [108]:
#Data preprocessing
soup = BeautifulSoup(posts, 'lxml') #The package BeautifulSoup is here used to parse the XML data
postTxt = soup.findAll('text')  #all posts <text> 
postDocs = [x.text for x in postTxt]
postDocs.pop(0)
postDocs = [x.lower() for x in postDocs]

In [109]:
#Create own list of stopwords
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class',  ])

In [110]:
#Use scikit-learn's TF-IDF vectorizer to take my corpus and convert each document into a sparse matrix of TFIDF Features
#Before vectorizing
postDocs[0]

'<p>data science is about analyzing relevant data to obtain patterns of information in order to help achieve a goal. the main focus of the data analysis is the goal rather then the methodology on how it will achieved. this allows for creative thinking and allowing for the optimal solution or model to be found wihtout the constraint of a specific methodology.</p>'

In [132]:
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3)) #ngram_range allows the vectorizer to tokenize not only a single word but also bigram and trigram
X = vectorizer.fit_transform(postDocs)
X

<27x3428 sparse matrix of type '<class 'numpy.float64'>'
	with 4064 stored elements in Compressed Sparse Row format>

The vectorized X is a sparse matrix with lots of 0 elements, which means there are lots of TF-IDF scores equal to 0.To save some memory of the machine, the matrix only stores the elements that have a score and washes out the 0 elements.

In [131]:
X[0]

<1x3428 sparse matrix of type '<class 'numpy.float64'>'
	with 89 stored elements in Compressed Sparse Row format>

In [130]:
#After vectorizing
print(X[0])

  (0, 649)	0.08989762673289421
  (0, 2473)	0.031055663877907407
  (0, 160)	0.06750601481982778
  (0, 2402)	0.0882767769894731
  (0, 2031)	0.10904753915911843
  (0, 2145)	0.09689742217847831
  (0, 1577)	0.052957348020145734
  (0, 2076)	0.07612666000883297
  (0, 1461)	0.07612666000883297
  (0, 47)	0.09689742217847831
  (0, 1378)	0.1765535539789462
  (0, 1806)	0.10904753915911843
  (0, 1250)	0.0882767769894731
  (0, 143)	0.07150739991692207
  (0, 2367)	0.10904753915911843
  (0, 1907)	0.21809507831823685
  (0, 52)	0.10904753915911843
  (0, 108)	0.10904753915911843
  (0, 625)	0.09689742217847831
  (0, 2969)	0.09689742217847831
  (0, 105)	0.10904753915911843
  (0, 2070)	0.10904753915911843
  (0, 2744)	0.09689742217847831
  (0, 1935)	0.07612666000883297
  (0, 1284)	0.0882767769894731
  :	:
  (0, 2033)	0.10904753915911843
  (0, 2147)	0.10904753915911843
  (0, 1589)	0.10904753915911843
  (0, 2082)	0.10904753915911843
  (0, 1463)	0.10904753915911843
  (0, 49)	0.10904753915911843
  (0, 1380)	0.10

Even though 3428 elements are in the original matrix, the vectorizer only scores 89 of them. For example, the 649th word in the first document has a TF-IDF score of 0.08989762673289421 

In [116]:
##Latent Semantic Analysis
X.shape

(27, 3428)

In [117]:
lsa = TruncatedSVD(n_components=27, n_iter=100) # n_components specifies the number of topics.
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
             random_state=None, tol=0.0)

### LSA

$$X \approx USV^{T}$$

Input: X, a matrix where m is the number of documents, and n is the number of terms.

U will be a m x k matrix. The rows will be documents and the columns will be 'topics'

S will be a k x k diagnal matrix. The elements will be the amount of variation captured from each topic.

V will be a n x k matrix. The rows will be terms and the columns will be topics.  

In [118]:
#This is the first row for V
lsa.components_[0] #lsa.components_ is designed to represent V, which is the term by topic matrix

array([0.00477467, 0.00477467, 0.00477467, ..., 0.00477467, 0.00477467,
       0.00477467])

In [136]:
#Attach the terms to the components and print the first 10 terms that make up the topics
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Topic %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Topic 0:
data
large amounts
large amounts data
amounts
amounts data
different
science
large
used
data science
 
Topic 1:
large amounts
large amounts data
amounts
amounts data
used
large
procedures聽could plots mainly
according data
according data science
amounts data procedures聽could
 
Topic 2:
white
converted
white converted
white white
big
big data
hello
and聽 white
and聽 white converted
big data and聽
 
Topic 3:
make
decisions
problem
make better
data science analyzing
science analyzing
white
better
better decisions
make better decisions
 
Topic 4:
goal
data science analyzing
science analyzing
achieve
solution
methodology
relevant
relevant data
information
answer
 
Topic 5:
business
goal
methods
competitive edge
edge
especially
achieve
solution
perspective
analyzing
 
Topic 6:
art
answer
using
relevant
relevant data
using relevant
using relevant data
part
good
able learn data
 
Topic 7:
data scientist
scientist
questions
answer
competitive
intelligent
amounts
amounts data
canada
contact