# HW 05: BOW and TFIDF

Charlie Perez (cwp5xyj)

### Part 1: Everything I need to get started

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px

In [2]:
import configparser
# Directory locations are read from a shared env.ini rather than being
# hard-coded in the notebook.
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home, output_dir = (config['DEFAULT'][key] for key in ('data_home', 'output_dir'))

# File-name prefix shared by every table of this corpus.
data_prefix = 'austen-melville'

In [4]:
# Index levels of the token table, listed from book down to token.
OHCO = 'book_id chap_id para_num sent_num token_num'.split()

# Library table, keyed by book.
LIB = (
    pd.read_csv(f"{output_dir}/{data_prefix}-LIB.csv")
    .set_index('book_id')
)

# Token table, keyed by the full OHCO; rows containing any missing value are dropped.
TOKEN = (
    pd.read_csv(f'{output_dir}/{data_prefix}-CORPUS.csv')
    .set_index(OHCO)
    .dropna()
)

In [5]:
# Row count of TOKEN per book, ordered by book_id.
(
    TOKEN
    .reset_index()['book_id']
    .value_counts()
    .sort_index()
)

book_id
105       83613
121       77586
141      160366
158      160884
161      119858
946       23115
1212      33241
1342     122089
1900     108015
2701     215461
4045     102347
8118     119230
10712    143251
13720     96874
13721    102078
15422     65510
15859     75232
21816     95169
34970    155024
Name: count, dtype: int64

In [6]:
# The vocabulary table for the Austen-Melville set was built previously,
# so it is simply read back from disk here.
VOCAB = (
    pd.read_csv(f'{output_dir}/{data_prefix}-VOCAB.csv')
    .set_index('term_str')
    .dropna()
)

In [7]:
# Peek at the first rows of the vocabulary table.
VOCAB.head(5)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2,1,9.713651e-07,19.973483,CD,CD,1,{'CD'},1,{'CD'},0,0,0,0
1,23,1,1.11707e-05,16.449921,CD,CD,2,"{'CD', 'NN'}",3,"{'CD', 'NNP', 'NN'}",0,1,1,1
10,6,2,2.914095e-06,18.38852,CD,CD,1,{'CD'},1,{'CD'},0,10,10,10
100,2,3,9.713651e-07,19.973483,CD,CD,1,{'CD'},1,{'CD'},0,100,100,100
1000,2,4,9.713651e-07,19.973483,CD,CD,1,{'CD'},1,{'CD'},0,1000,1000,1000


In [8]:
# Peek at the first rows of the token table.
TOKEN.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
105,1,1,0,0,"('Sir', 'NNP')",NNP,Sir,sir,NN
105,1,1,0,1,"('Walter', 'NNP')",NNP,Walter,walter,NN
105,1,1,0,2,"('Elliot,', 'NNP')",NNP,"Elliot,",elliot,NN
105,1,1,0,3,"('of', 'IN')",IN,of,of,IN
105,1,1,0,4,"('Kellynch', 'NNP')",NNP,Kellynch,kellynch,NN


In [9]:
# Peek at the first rows of the library table.
LIB.head(5)

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,/home/cwp5xyj/Documents/MSDS/DS5001/data/auste...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$,83624,24
121,/home/cwp5xyj/Documents/MSDS/DS5001/data/auste...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$,77601,31
141,/home/cwp5xyj/Documents/MSDS/DS5001/data/auste...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$,160378,48
158,/home/cwp5xyj/Documents/MSDS/DS5001/data/auste...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$,160926,55
161,/home/cwp5xyj/Documents/MSDS/DS5001/data/auste...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$,119873,50


### Part 2: The Questions

#### Question 1: BOW and TFIDF functions

In [10]:
# things look fine
# let's write a BOW function

def bag_of_words(tokens, bag):
    """Count term occurrences per bag.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table in which the OHCO names used for the chosen bag and
        ``term_str`` can each be resolved as an index level or a column.
    bag : str
        Bag granularity, case-insensitive: 'SENTS', 'PARAS', 'CHAPS' or 'BOOKS'.

    Returns
    -------
    pandas.DataFrame
        One column ``n`` holding the count of each term in each bag,
        indexed by the bag's OHCO levels followed by ``term_str``.

    Raises
    ------
    ValueError
        If ``bag`` is not one of the supported names.
    """
    # Defined locally so the function does not depend on notebook globals.
    ohco_levels = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

    # Bag name -> how many leading OHCO levels identify one bag.
    depth_by_bag = {'SENTS': 4, 'PARAS': 3, 'CHAPS': 2, 'BOOKS': 1}

    bag_key = bag.upper()
    if bag_key not in depth_by_bag:
        raise ValueError(F'Invalid input: Value must be one of {list(depth_by_bag.keys())}')

    group_keys = ohco_levels[:depth_by_bag[bag_key]] + ['term_str']
    term_counts = tokens.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

In [11]:
# write the TFIDF function
# this is going to contain a lot more

def TFIDF_from_BOW(BOW, tf_method, tf_norm_k=.5):
    """Compute a TF-IDF matrix from a bag-of-words table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Table with a count column ``n`` and a MultiIndex whose last level
        is the term; the remaining level(s) identify the bag.
    tf_method : str
        Term-frequency weighting, case-insensitive. One of
        'sum', 'max', 'log', 'raw', 'double_norm', 'binary'.
    tf_norm_k : float, default 0.5
        Constant K used by 'double_norm': ``K + (1 - K) * n / max(n)``.
        Ignored by every other method.

    Returns
    -------
    pandas.DataFrame
        Bags (rows) by terms (columns), holding ``TF * IDF`` where
        ``IDF = log2(N / DF)``, N is the number of bags and DF the number
        of bags containing the term.

    Raises
    ------
    ValueError
        If ``tf_method`` is not one of the supported names.
    """
    # Bag-by-term count matrix; a term absent from a bag gets a count of 0.
    DTCM = BOW.n.unstack(fill_value=0)
    N = DTCM.shape[0]

    def _double_norm():
        # Previously this was a copy of 'max' and never used tf_norm_k.
        # The weighting is applied only to terms present in the bag, so a
        # term that does not occur keeps a term frequency of 0.
        scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        return (tf_norm_k + (1 - tf_norm_k) * scaled).where(DTCM > 0, 0.0)

    # Each entry is a callable so that only the requested matrix is built.
    tf_methods = {
        'sum': lambda: DTCM.div(DTCM.sum(axis=1), axis=0),
        'max': lambda: DTCM.div(DTCM.max(axis=1), axis=0),
        'log': lambda: np.log2(1 + DTCM),
        'raw': lambda: DTCM,
        'double_norm': _double_norm,
        'binary': lambda: DTCM.astype('bool').astype('int'),
    }

    tf_method = tf_method.lower()
    if tf_method not in tf_methods:
        raise ValueError(F'Invalid input: Value must be one of {list(tf_methods.keys())}')

    TF = tf_methods[tf_method]()
    DF = DTCM.astype('bool').sum()
    IDF = np.log2(N / DF)  # the IDF variant is fixed by the assignment
    TFIDF = TF * IDF

    return TFIDF
    

In [12]:
# I can also print the functions by typing something like the below
# so if this was what you wanted, here's proof I know how to do it
# but no need to show things twice ya know? just takes up space, so I commented it out


# bag_of_words??
# TFIDF_from_BOW??

#### Question 2: Top 20 words using 'max', 'book'

In [19]:
# Book-level bags, 'max' term frequency.
BOW_q2 = bag_of_words(TOKEN, 'BOOKS')
TFIDF_q2 = TFIDF_from_BOW(BOW_q2, 'max')

# Attach each (book, term) TF-IDF value to the BOW rows, average per term,
# and keep the 20 highest means. BOW_q2 is rebound to that Series.
BOW_q2 = (
    BOW_q2
    .assign(tfidf=TFIDF_q2.stack())
    .groupby('term_str')['tfidf']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
BOW_q2

term_str
elinor        0.642969
vernon        0.493614
darcy         0.366742
reginald      0.351225
frederica     0.341733
crawford      0.337235
elliot        0.324016
weston        0.315232
pierre        0.293657
knightley     0.288490
tilney        0.262482
elton         0.259316
bingley       0.252012
wentworth     0.243650
courcy        0.242061
woodhouse     0.225281
churchhill    0.218329
marianne      0.202796
babbalanja    0.173393
mainwaring    0.170866
Name: tfidf, dtype: float64

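All of the top terms are proper names, which makes sense: a character's name is used heavily within its own book but is absent from most of the others, so it scores high on both term frequency and inverse document frequency.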

#### Question 3: Top 20 using 'sum', 'chapter'

In [20]:
# Chapter-level bags, 'sum' term frequency.
BOW_q3 = bag_of_words(TOKEN, 'CHAPS')
TFIDF_q3 = TFIDF_from_BOW(BOW_q3, 'sum')

# Attach each (chapter, term) TF-IDF value to the BOW rows, average per term,
# and keep the 20 highest means. BOW_q3 is rebound to that Series.
BOW_q3 = (
    BOW_q3
    .assign(tfidf=TFIDF_q3.stack())
    .groupby('term_str')['tfidf']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
BOW_q3

term_str
hypothetical     0.962653
slushing         0.769841
charmers         0.584106
tusculan         0.576389
disputations     0.576389
um               0.560363
inquest          0.515494
unbends          0.467914
increases        0.413067
communion        0.395583
confuting        0.330378
consents         0.293096
moredock         0.290542
moot             0.259887
transact         0.228927
introduces       0.228702
metamorphosis    0.212492
forgiver         0.209160
plujii           0.205909
ugh              0.202758
Name: tfidf, dtype: float64

#### Question 4: Difference in Q2 and Q3 answer by POS?

In [22]:
# Turn the Q2 and Q3 top-20 Series into frames and look up each term's
# `max_pos` value in VOCAB.
q2_ans = BOW_q2.to_frame().join(VOCAB['max_pos'], on='term_str', how='left')
q3_ans = BOW_q3.to_frame().join(VOCAB['max_pos'], on='term_str', how='left')

# Tag frequencies for each answer, separated by blank lines.
print(
    q2_ans['max_pos'].value_counts(),
    '\n',
    q3_ans['max_pos'].value_counts(),
    sep='\n',
)

max_pos
NNP    20
Name: count, dtype: int64


max_pos
NNP    7
NN     6
NNS    2
JJ     2
VBZ    2
VB     1
Name: count, dtype: int64


In [23]:
# Display the Q3 top terms together with the `max_pos` value joined from VOCAB.
q3_ans

Unnamed: 0_level_0,tfidf,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
hypothetical,0.962653,NNP
slushing,0.769841,NNP
charmers,0.584106,NNS
tusculan,0.576389,NNP
disputations,0.576389,NN
um,0.560363,JJ
inquest,0.515494,NNP
unbends,0.467914,NN
increases,0.413067,VBZ
communion,0.395583,NN


Referencing the Penn Treebank (UPenn) tagset, the results of Q2 are all proper nouns, while the results of Q3 include common nouns, adjectives, and verbs. I think some of the supposed proper nouns from Q3 may be miscategorized, though ('hypothetical' and 'slushing' are not often proper nouns, whereas 'plujii', which appears only in the Q3 list, plausibly is one). Note that I am using each term's max_pos from the VOCAB table rather than the POS tag of each specific occurrence. This seemed necessary because the words from Q2 are used many times, and it seemed excessive to regenerate POS tags for the filtered data.

#### Question 5: Who has the most significant adjective?

In [24]:
# Attach each book's author to its tokens, then split the table by author.
TOKEN_q5 = TOKEN.join(LIB['author'], on='book_id', how='left')

TOKEN_AUSTEN = TOKEN_q5.loc[TOKEN_q5['author'].eq('AUSTEN, JANE')]
TOKEN_MELVILLE = TOKEN_q5.loc[TOKEN_q5['author'].eq('MELVILLE, HERMAN')]

In [25]:
def _chapter_bow_and_tfidf(author_tokens):
    """Return (BOW with a `tfidf` column, TF-IDF matrix) for chapter bags using 'max' TF."""
    bow = bag_of_words(author_tokens, 'CHAPS')
    tfidf = TFIDF_from_BOW(bow, 'max')
    bow['tfidf'] = tfidf.stack()
    return bow, tfidf


# Same pipeline for each author's tokens.
BOW_AUSTEN, TFIDF_AUSTEN = _chapter_bow_and_tfidf(TOKEN_AUSTEN)
BOW_MELVILLE, TFIDF_MELVILLE = _chapter_bow_and_tfidf(TOKEN_MELVILLE)

In [27]:
def _adjectives_by_mean_tfidf(bow):
    """Rank terms by mean TF-IDF and keep those whose `max_pos` starts with 'JJ'."""
    ranked = bow.groupby('term_str')['tfidf'].mean().sort_values(ascending=False)
    tagged = (
        pd.DataFrame(ranked)
        .join(VOCAB['max_pos'], on='term_str', how='left')
        .dropna()
    )
    return tagged[tagged['max_pos'].str.startswith('JJ')]


# Caveat: if no term carries a 'JJ' tag the results are empty and this
# filter would need to be revisited.
AUSTEN = _adjectives_by_mean_tfidf(BOW_AUSTEN)
MELVILLE = _adjectives_by_mean_tfidf(BOW_MELVILLE)

In [28]:
# Top adjective candidates for each author, separated by blank lines.
print(AUSTEN.head(), '\n', MELVILLE.head(), sep='\n')

                tfidf max_pos
term_str                     
undismayed   2.095926      JJ
precarious   0.762155      JJ
dreary       0.698642      JJ
eoconomical  0.493159      JJ
unmajestic   0.493159      JJ


                tfidf max_pos
term_str                     
ugh          4.462786      JJ
um           2.886301      JJ
manchineels  1.497387      JJ
sneezes      1.342485      JJ
adorable     1.226297      JJ


According to this, Melville's work has the most significant adjective if we break it down like this, but I'm unsure whether "ugh" or "um" should count as adjectives; the same goes for "manchineels" and "sneezes". Setting those questionable tags aside, I think Austen has the most significant adjective with "undismayed".