# Project2 Part1 - Text Analysis through TFIDF computation


In [1]:
from text_analyzer import read_sonnets, clean_corpus, tf, get_top_k, idf, tf_idf, cosine_sim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2

In [2]:
# Run text_analyzer.py as a standalone script with its default arguments.
# The leading `!` is IPython's shell escape, so the script runs outside this
# kernel; the output below shows it printing the top-20 TF and IDF lists.
!python text_analyzer.py


Sonnet 1 TF (Top 20):
[('the', 6), ('thy', 4), ('to', 3), ('might', 2), ('But', 2), ('by', 2), ('tender', 2), ('thine', 2), ('own', 2), ('self', 2), ('worlds', 2), ('And', 2), ('From', 1), ('fairest', 1), ('creatures', 1), ('we', 1), ('desire', 1), ('increase', 1), ('That', 1), ('thereby', 1)]
Corpus TF (Top 20):
[('my', 365), ('the', 356), ('of', 351), ('I', 343), ('to', 330), ('in', 285), ('thy', 258), ('and', 247), ('And', 244), ('that', 237), ('thou', 209), ('with', 163), ('me', 162), ('thee', 161), ('is', 158), ('not', 156), ('love', 155), ('a', 147), ('be', 133), ('all', 107)]
Corpus IDF (Top 20):
[('bootless', 5.0369526024136295), ('Desiring', 5.0369526024136295), ('fate', 5.0369526024136295), ('trouble', 5.0369526024136295), ('sings', 5.0369526024136295), ('despising', 5.0369526024136295), ('deaf', 5.0369526024136295), ('Haply', 5.0369526024136295), ('Wishing', 5.0369526024136295), ('gate', 5.0369526024136295), ('cries', 5.0369526024136295), ('outcast', 5.036952602413629

## a. Read about argparse.
Look at its implementation in the Python Script. Follow the instruction and answer the questions in the Argparse section.

Argparse response:
Argparse provides a simple way to build a command-line interface. Everything is built around an instance of argparse.ArgumentParser. The implementer declares each argument the program accepts: positional arguments, options that take values, and on/off flags, together with the type of each command-line input and the number of values it consumes. We attach the individual argument specifications to the parser with its add_argument() method. For each argument we can also specify how it is handled, for example, choosing whether to find the max or the min of a list of integers. To access the parsed arguments, we call parse_args(), which returns the extracted data in an argparse.Namespace object.

## b. Read and Clean the data

In [3]:
# Location of the sonnet files to analyze.
d_corpus = 'data/shakespeare_sonnets/'

# Read every sonnet into a dict keyed by file name, then clean and tokenize
# each entry. Only the cleaned corpus is kept for the rest of the notebook.
corpus = clean_corpus(read_sonnets(d_corpus))

In [4]:
# corpus is a dict keyed by sonnet file name (e.g. '1'); each value is that
# sonnet's text split into a list of word tokens. In the output below the
# tokens carry no punctuation (e.g. 'beautys', 'selfsubstantial') but keep
# their original capitalization ('But' vs. 'the').
print(type(corpus))
corpus['1']

<class 'dict'>


['From',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase',
 'That',
 'thereby',
 'beautys',
 'rose',
 'might',
 'never',
 'die',
 'But',
 'as',
 'the',
 'riper',
 'should',
 'by',
 'time',
 'decease',
 'His',
 'tender',
 'heir',
 'might',
 'bear',
 'his',
 'memory',
 'But',
 'thou',
 'contracted',
 'to',
 'thine',
 'own',
 'bright',
 'eyes',
 'Feedst',
 'thy',
 'lights',
 'flame',
 'with',
 'selfsubstantial',
 'fuel',
 'Making',
 'a',
 'famine',
 'where',
 'abundance',
 'lies',
 'Thy',
 'self',
 'thy',
 'foe',
 'to',
 'thy',
 'sweet',
 'self',
 'too',
 'cruel',
 'Thou',
 'that',
 'art',
 'now',
 'the',
 'worlds',
 'fresh',
 'ornament',
 'And',
 'only',
 'herald',
 'to',
 'the',
 'gaudy',
 'spring',
 'Within',
 'thine',
 'own',
 'bud',
 'buriest',
 'thy',
 'content',
 'And',
 'tender',
 'churl',
 'makst',
 'waste',
 'in',
 'niggarding',
 'Pity',
 'the',
 'world',
 'or',
 'else',
 'this',
 'glutton',
 'be',
 'To',
 'eat',
 'the',
 'worlds',
 'due',
 'by',
 'the',
 'grave',
 'and',

## c. TF

In [5]:
# Pick out sonnet 1. Note: corpus is a dict, and each of its values (so
# sonnet1 as well) is a list of word tokens, not a single string.
sonnet1 = corpus["1"]

# Term frequency: how often each token occurs within this one sonnet.
sonnet1_tf = tf(sonnet1)

# Rank the terms by count using get_top_k's default cut-off
# (presumably 20, going by the variable name and the script output above).
sonnet1_top20 = get_top_k(sonnet1_tf)

# Tabulate the ranked (word, count) pairs for the report.
df = pd.DataFrame(data=sonnet1_top20, columns=["word", "count"])
df.head(20)

Unnamed: 0,word,count
0,the,6
1,thy,4
2,to,3
3,might,2
4,But,2
5,by,2
6,tender,2
7,thine,2
8,own,2
9,self,2


In [6]:
# Term frequency over the whole corpus: pool the tokens of every sonnet into
# one long list so that tf() counts occurrences across all documents at once.
flattened_corpus = []
for sonnet_tokens in corpus.values():
    flattened_corpus.extend(sonnet_tokens)

corpus_tf = tf(flattened_corpus)

# Rank by count and tabulate the most frequent terms for the report.
corpus_top20 = get_top_k(corpus_tf)
df = pd.DataFrame(data=corpus_top20, columns=["word", "count"])
df.head(20)

Unnamed: 0,word,count
0,my,365
1,the,356
2,of,351
3,I,343
4,to,330
5,in,285
6,thy,258
7,and,247
8,And,244
9,that,237


### Q: Discussion
Do you believe the most frequent words would discriminate between documents well? Why or why not? Any thoughts on how we can improve this representation? Does there appear to be any ‘noise’? If so, where? If not, it should be clear by the end of the assignment.

Relying solely on term frequency (TF) to discriminate between documents is not ideal, as it does not account for the presence of common words like "the", "a", "and", etc., which appear frequently but do not contribute to meaningful distinctions between documents. These common words can be considered "noise" in our representation. To improve this representation, we need the inverse document frequency (IDF) component to balance the measure. IDF helps in emphasizing the importance of rare words, reducing the impact of common words. When a term is highly common across all documents, the IDF component diminishes the TF-IDF score close to zero, allowing for more meaningful words to take precedence.

## d. IDF

In [10]:
# Inverse document frequency of every term, computed across the whole corpus
# (corpus maps each document name to its list of tokens).
corpus_idf = idf(corpus)

# Rank the terms by IDF score. The result holds IDF values, not term
# frequencies, so it is named accordingly (it was previously stored under the
# misleading name corpus_tf_ordered).
corpus_idf_ordered = get_top_k(corpus_idf)

# Tabulate the top-ranked (word, score) pairs for the report.
df = pd.DataFrame(corpus_idf_ordered, columns=["word", "score"])
df

Unnamed: 0,word,score
0,beweep,5.036953
1,deaf,5.036953
2,arising,5.036953
3,mans,5.036953
4,Featured,5.036953
5,Wishing,5.036953
6,enjoy,5.036953
7,gate,5.036953
8,Haply,5.036953
9,outcast,5.036953


### Q: observe and briefly comment on the difference in top 20 lists (comparing TF of corpus vs its IDF).

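Since the ranking is now based on IDF alone, it favors terms that appear in only one document. The IDF formula, log(#documents / #documents containing the term), reaches its maximum value when a word occurs in exactly one document, no matter how many times it occurs there. It is no coincidence that the top 20 entries all share the value 5.0369: the corpus contains 154 sonnets, and log(154 / 1) ≈ 5.03695 (natural logarithm). Because many words are tied at this maximum, which 20 of them make the list is arbitrary; this is why the list here differs from the one printed by the script above. In contrast to the TF list, none of the very common words ("the", "my", "of", ...) appear, since they occur in many documents and therefore have a low IDF.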

## e. TF-IDF

In [11]:
# Weight sonnet 1's term frequencies by the corpus-wide IDF scores to get its
# TF-IDF value for every term.
sonnet1_tfidf = tf_idf(corpus_idf, sonnet1_tf)

# Rank the terms by TF-IDF score (get_top_k's default cut-off).
sonnet1_tfidf_ordered = get_top_k(sonnet1_tfidf)

# Tabulate the ranked (word, score) pairs; the notebook renders the frame.
df = pd.DataFrame(data=sonnet1_tfidf_ordered, columns=["word", "score"])
df

Unnamed: 0,word,score
0,worlds,7.301316
1,tender,6.490386
2,Feedst,5.036953
3,lights,5.036953
4,selfsubstantial,5.036953
5,fuel,5.036953
6,famine,5.036953
7,foe,5.036953
8,herald,5.036953
9,gaudy,5.036953


### Q. What is different with this list than just using TF and IDF alone?

Now we have isolated the terms that best distinguish this sonnet from the other documents. Notice that the words "worlds" and "tender" now appear at the top of the list, whereas they were nowhere near the top of the IDF ranking, which was filled entirely by words tied at the maximum score of 5.0369. They rise to the top because they occur in only a few documents and appear more than once within this one: each has a TF of 2 in sonnet 1, so its TF-IDF score (7.30 and 6.49) exceeds the 5.0369 of a word that is unique to this sonnet but used only once. They receive a boost both from their frequent occurrence within the document (TF) and from their exclusivity to just a few documents (IDF).

## f. Compare all documents

In [None]:
# Compare every pair of documents by the cosine similarity of their TF-IDF
# vectors and show the result as a heatmap.

# TF-IDF vector of each document, keyed by document name.
corpus_tfidf = {
    doc_name: tf_idf(corpus_idf, tf(doc_content))
    for doc_name, doc_content in corpus.items()
}

# Fix the document order once so that matrix indices and tick labels agree.
doc_names = list(corpus)
n_docs = len(doc_names)

# similarity_matrix[i, j] holds the cosine similarity between
# doc_names[i] and doc_names[j].
similarity_matrix = np.zeros((n_docs, n_docs))
for i, doc1 in enumerate(doc_names):
    doc1_tfidf = corpus_tfidf[doc1]  # hoisted: constant for the whole row
    for j, doc2 in enumerate(doc_names):
        similarity_matrix[i, j] = cosine_sim(doc1_tfidf, corpus_tfidf[doc2])

# Writing the value into every cell is only legible for a small matrix. With
# one cell per pair of sonnets the numbers overlap into noise and rendering
# becomes very slow, so annotate only when there are few documents.
MAX_DOCS_TO_ANNOTATE = 20
show_values = n_docs <= MAX_DOCS_TO_ANNOTATE

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(similarity_matrix, annot=show_values, cmap='coolwarm', xticklabels=doc_names, yticklabels=doc_names)
plt.title("Cosine Similarity Heatmap")
plt.show()
