### TF-IDF
- TF (tf_ij) = f_ij / max{f_1j, f_2j, ..., f_|v|j}
- IDF (idf_i) = log(N/df_i) # N = total number of documents ; df_i = number of documents where t_i appears
- TF-IDF(f_ij): weight of term t_i in document d_j; the number of times t_i appears in d_j

### Queries

- w_iq = (0.5 + (0.5 * f_iq / max{f_1q, f_2q, ..., f_|V|q}) * log(N/df_i)

### Cosine similarity
cosine (d_j, q) = <d_j @ q> / ||d_j|| * ||q||


In [149]:
import pandas as pd
import math
import numpy as np

In [32]:
word_freq_matrix = pd.read_csv('vsm.csv', index_col='Unnamed: 0')

In [33]:
word_freq_matrix

Unnamed: 0,antony and cleopatra,julius caesar,the tempest,hamlet,othello,macbeth
antony,157,73,0,0,0,1
brutus,4,157,0,2,0,0
caeser,232,227,0,2,1,8
calpurnia,0,10,0,0,0,0
cleopatra,57,0,0,0,0,0
mercy,2,0,3,8,5,5
worser,2,0,1,1,1,0


In [38]:
# weight of 'Antony' in 'Antony and Cleopatra'
f_11 = word_freq_matrix['antony and cleopatra']['antony'] 
f_max = max(word_freq_matrix['antony and cleopatra'])
tf_11 = f_11 / f_max
print(f'tf_11 = {tf_11}')
# math.log(x, base)
idf_11 = math.log(6/word_freq_matrix.loc['antony'].astype(bool).sum(), 2)
print(f'idf_11 = {idf_11}')

w_11 = tf_11 * idf_11
print(f'w_11 = {w_11}')

tf_11 = 0.6767241379310345
idf_11 = 1.0
w_11 = 0.6767241379310345


In [71]:
weight_matrix = dict()

def calc_tfidf(term, doc):
    freq = word_freq_matrix[doc][term]
    freq_max = max(word_freq_matrix[doc])
    tf = freq / freq_max
    
    idf = math.log(6/word_freq_matrix.loc[term].astype(bool).sum(), 2)
    
    return format(tf*idf, '.3f')

for doc in word_freq_matrix.columns:
    weight_matrix[doc] = list()
    for query in word_freq_matrix.index:
        weight_matrix[doc].append(calc_tfidf(query, doc))

In [159]:
tfidf_weights = pd.DataFrame(data=weight_matrix, index=word_freq_matrix.index, dtype=float)
tfidf_weights

Unnamed: 0,antony and cleopatra,julius caesar,the tempest,hamlet,othello,macbeth
antony,0.677,0.322,0.0,0.0,0.0,0.125
brutus,0.017,0.692,0.0,0.25,0.0,0.0
caeser,0.263,0.263,0.0,0.066,0.053,0.263
calpurnia,0.0,0.114,0.0,0.0,0.0,0.0
cleopatra,0.635,0.0,0.0,0.0,0.0,0.0
mercy,0.002,0.0,0.263,0.263,0.263,0.164
worser,0.005,0.0,0.195,0.073,0.117,0.0


In [230]:
# example of query 'Brutus Caeser'
def calc_query_weight(QUERY): 
    f_Vq = list()
    for term in list(word_freq_matrix.index):
        f_Vq.append(QUERY.count(term))    

    def calc_query_weight(term):
        f_iq = QUERY.count(term)
        idf = math.log(6/word_freq_matrix.loc[term].astype(bool).sum(), 2)
        weight = (0.5 + (0.5 * f_iq) / max(f_Vq)) * idf
        return format(weight ,'.2f')

    query_weight = list()
    for term in list(word_freq_matrix.index):
        query_weight.append(calc_query_weight(term))
    
    return pd.Series(data=query_weight, dtype=float, index=word_freq_matrix.index)

query_weights = dict()
query_weights['brutus caeser'] = calc_query_weight('brutus caeser')

In [253]:
# calculate similarity scores between 
# the query 'Brutus Caeser'(q) and the document 'Antony and Cleopatra'(d_j)

query = 'brutus caeser'

cosine_similarities = list()
for document in word_freq_matrix.columns:
    w_ij = tfidf_weights[document]
    # print(f'w_ij: {w_ij}')
    w_iq = query_weights[query] 
    # print(f'w_iq: {w_iq}')

    # cos_sim = w_iq.mul(w_ij, axis=0).sum() / math.sqrt((np.square(w_ij).sum()) * math.sqrt(np.square(w_iq).sum()))
    mul_dj_q = w_iq.mul(w_ij).sum()
    mod_dj = math.sqrt(np.square(w_ij).sum())
    mod_q = math.sqrt(np.square(w_iq).sum())
    cos_sim = mul_dj_q / (mod_dj * mod_q)
    cosine_similarities.append(format(cos_sim, '.2f'))
    
similarity_scores = pd.Series(cosine_similarities, index=word_freq_matrix.columns)
similarity_scores

antony and cleopatra    0.59
julius caesar           0.60
the tempest             0.13
hamlet                  0.39
othello                 0.13
macbeth                 0.21
dtype: object