# Exercise: Shakespeare Play (Week 2) - Weighted TF-IDF
- Construct a TF-IDF matrix, using log weighting, for the corpus of Shakespeare plays.
- Construct a query vector consisting of terms from the vocabulary and rank the plays with respect to the query.

In [1]:
import nltk
nltk.download('shakespeare')
from nltk.corpus import shakespeare
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import random # for random choice of query vocab
import math
from pprint import pprint

[nltk_data] Downloading package shakespeare to C:\Users\yuvaraja
[nltk_data]     manikandan\AppData\Roaming\nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!


In [2]:
# Map every play's file id to its own (initially empty) per-document record.
corpus_dict = {file_id: {} for file_id in shakespeare.fileids()}

# Get Tokens of each document
#for file_id in shakespeare.fileids():
#    doc = corpus_dict[file_id]
#    doc['words'] = shakespeare.words(file_id)

## Preprocess each document

In [3]:
# Shared preprocessing resources used by the helper functions below.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
stop_words = stopwords.words('english')

In [None]:
def normalize_text(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns a new list holding the lower-cased form of every term,
        in the same order
    '''
    lowered = [term.lower() for term in terms_list]
    return lowered

def removeAlphaNumeric(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns a new list keeping only the terms for which
        str.isalpha() is true (terms containing digits, punctuation
        or nothing at all are dropped)
    '''
    alphabetic_only = [term for term in terms_list if term.isalpha()]
    return alphabetic_only

def removeStopWords(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns list of Terms having no english stop words
        (order of the remaining terms is preserved)
    '''
    # Build the lookup set once per call: testing membership against the
    # module-level stop_words collection directly would rescan it for
    # every single term of the document.
    stop_set = set(stop_words)
    return [w for w in terms_list if w not in stop_set]

def doStemming(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns a new list with ps.stem() applied to every term,
        in the same order
    '''
    stemmed = [ps.stem(term) for term in terms_list]
    return stemmed

def doLemmatization(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns a new list with lemmatizer.lemmatize() applied to
        every term, in the same order
    '''
    lemmas = [lemmatizer.lemmatize(term) for term in terms_list]
    return lemmas

def preProcessWords(terms_list):
    '''
    Input:
        terms_list: Document Terms as Iterable
    Output:
        Returns Terms as List after running, in this order,
            normalize_text      (case folding)
            removeAlphaNumeric  (keeps purely alphabetic terms)
            removeStopWords     (drops English stop words)
            doStemming          (stems each term)
            doLemmatization     (lemmatizes each stemmed term)
    '''
    pipeline = (
        normalize_text,
        removeAlphaNumeric,
        removeStopWords,
        doStemming,
        doLemmatization,
    )
    # Feed the output of each stage into the next one.
    for stage in pipeline:
        terms_list = stage(terms_list)
    return terms_list
    
# Run every play through the preprocessing pipeline and keep the resulting terms.
for file_id in shakespeare.fileids():
    play_terms = shakespeare.words(file_id)
    corpus_dict[file_id]['words'] = preProcessWords(play_terms)

## Build Vocabulary set for each document

In [None]:
def getVocabulary(doc_terms_list):
    '''
    Input:
        doc_terms_list: Document Terms as Iterable
    Output:
        Returns a set holding each distinct term once
    '''
    vocabulary = set()
    vocabulary.update(doc_terms_list)
    return vocabulary

# Record each play's set of distinct terms.
for file_id in shakespeare.fileids():
    record = corpus_dict[file_id]
    record['vocab'] = getVocabulary(record['words'])

## Build Word Frequency List for each document

In [None]:
def getWordCounts(doc_terms_list):
    '''
    Input:
        doc_terms_list: Document Terms as Iterable
    Output:
        Returns a Dictionary, having
            Each Term as a Key
            Count of each Term as its value
        Keys appear in order of first occurrence; empty input gives {}
    '''
    from collections import Counter  # local import keeps this block self-contained

    # Counter replaces the hand-written tally loop; convert back so that
    # callers still receive a plain dict.
    return dict(Counter(doc_terms_list))
    
# Tally how often each term occurs in each play.
for file_id in shakespeare.fileids():
    record = corpus_dict[file_id]
    record['counts'] = getWordCounts(record['words'])

## Build Term Frequency List for each document

In [None]:
def getTermFrequency(doc_vocab_list, doc_count_dict, T):
    '''
    Input:
        doc_vocab_list: Document Vocabulary as Iterable
        doc_count_dict: Dictionary giving the count of each term
        T: Total Number of Words in the Document
    Output:
        Returns a Dictionary, having
            Each Term of doc_vocab_list as a Key
            count / T as its value
    '''
    return {term: doc_count_dict[term] / T for term in doc_vocab_list}

# Store the relative frequency (count / total terms) of every term per play.
for file_id in shakespeare.fileids():
    record = corpus_dict[file_id]
    total_terms = len(record['words'])
    record['tf'] = getTermFrequency(record['vocab'], record['counts'], total_terms)

## Build Corpus Vocabulary List

In [None]:
def getTotalVocabList(corpus_dict):
    '''
    Input:
        corpus_dict: Dictionary mapping each document id to a record
                     that holds the document's terms under 'vocab'
    Output:
        Returns a new set containing every term found in any
        document's 'vocab' (an empty set for an empty corpus)
    '''
    total_corpus_vocab = set()
    for doc in corpus_dict.values():
        # update() grows the set in place; union() would copy the whole
        # accumulated vocabulary on every iteration.
        total_corpus_vocab.update(doc['vocab'])
    return total_corpus_vocab

# Collect each play's vocabulary set, in file-id order.
vocab_list = [corpus_dict[file_id]['vocab'] for file_id in shakespeare.fileids()]

from itertools import chain
# Flatten the per-play vocabularies into one corpus-wide set of terms.
corpus_total_vocab = set(chain(*vocab_list))

## Build Document Frequency List

In [None]:
def getDocumentFrequency(corpus_dict, total_corpus_vocab):
    '''
    Input:
        corpus_dict: Dictionary mapping each document id to a record
                     whose 'vocab' entry holds that document's terms
        total_corpus_vocab: Corpus Vocabulary as Iterable
    Output:
        Returns a Dictionary, having
            Each Term of total_corpus_vocab as a Key
            Number of documents whose 'vocab' contains the Term as its value
    '''
    return {
        term: sum(1 for record in corpus_dict.values() if term in record['vocab'])
        for term in total_corpus_vocab
    }
            
# Count, for every corpus term, the number of plays that contain it.
corpus_df = getDocumentFrequency(
    corpus_dict=corpus_dict,
    total_corpus_vocab=corpus_total_vocab,
)

## Build Inverse Document Frequency

In [None]:
def getInverseDocumentFrequency(corpus_df, total_corpus_vocab, num_docs=None):
    '''
    Input:
        corpus_df: Document Frequency of each term as Dictionary
        total_corpus_vocab: Corpus Vocabulary as Iterable
        num_docs: Total Number of Documents in the corpus (N);
                  when omitted, len(shakespeare.fileids()) is used
    Output:
        Returns a Dictionary, having
            Each Term as a Key
            Inverse Document Frequency of each Term as its value
            IDF = log10(N / DF)
                N: Total Number of Documents
                DF: Number of Documents containing the Term
        A term with a DF of 0 gets an IDF of 0.0
    '''
    if num_docs is None:
        # Looked up once here rather than on every loop iteration.
        num_docs = len(shakespeare.fileids())
    idf = {}
    for w in total_corpus_vocab:
        doc_freq = corpus_df[w]
        # A plain DF / N ratio would reward common terms; the inverse,
        # log-scaled ratio gives rare terms the larger weight instead.
        # A zero DF would divide by zero, so such a term is weighted 0.0.
        idf[w] = math.log10(num_docs / doc_freq) if doc_freq > 0 else 0.0
    return idf

# Derive the per-term IDF values from the document frequencies.
corpus_idf = getInverseDocumentFrequency(
    corpus_df=corpus_df,
    total_corpus_vocab=corpus_total_vocab,
)

## Build Weighted TF-IDF

In [None]:
def getWeightedTFIDFOfATerm(tc, idf):
    '''
    Input:
        tc: Count of the term in a document
        idf: Inverse Document Frequency of the term
    Output:
        Returns the log-weighted TF-IDF score,
            (1 + log10(tc)) * idf   when tc > 0
            0 * idf                 otherwise
    '''
    log_tf = (1 + np.log10(tc)) if tc > 0 else 0
    return log_tf * idf

def getWeightedTFIDFOfADocument(doc_vocab, doc_count, idf):
    '''
    Input:
        doc_vocab: Document Vocabulary as Iterable
        doc_count: Dictionary giving the count of each document term
        idf: Dictionary giving the Inverse Document Frequency of each term
    Output:
        Returns a Dictionary, having
            Each Term of doc_vocab as a Key
            getWeightedTFIDFOfATerm(count, idf) of the Term as its value
    '''
    return {
        term: getWeightedTFIDFOfATerm(doc_count[term], idf[term])
        for term in doc_vocab
    }

# Attach the log-weighted TF-IDF mapping to every play's record.
for doc in corpus_dict.values():
    doc['wv_tf_idf'] = getWeightedTFIDFOfADocument(
        doc['vocab'], doc['counts'], corpus_idf
    )

## Build Weighted TF-IDF DataFrame

In [None]:
# Term-by-play matrix: one row per corpus term, one column per play.
col_headers = shakespeare.fileids()
row_terms = list(corpus_total_vocab)

# Build the whole frame in a single constructor call.  Adding one row at a
# time with .loc enlarges the frame on every assignment, which gets slow
# for a vocabulary-sized index.  A term missing from a play gets weight 0.
# Each column is looked up by its file id, so values cannot be attached
# to the wrong play.
tfidf_matrix = pd.DataFrame(
    {
        file_id: [corpus_dict[file_id]['wv_tf_idf'].get(w, 0) for w in row_terms]
        for file_id in col_headers
    },
    index=row_terms,
    columns=col_headers,
)

#for file_id,doc in docs.items():
#    for w in doc['vocab']:
#        df.loc[w][file_id] = doc['wv_tf_idf']

In [None]:
# Order the rows by their index label, i.e. by term.
tfidf_matrix = tfidf_matrix.sort_index()

In [None]:
# Bare expression: shown as the cell output when run in a notebook.
tfidf_matrix

## Build Query Vocabulary
- Query words are selected at random from the corpus vocabulary.
- Out-of-vocabulary (OOV) query terms are not considered in this test case.

In [None]:
# Number of query terms to draw from the corpus vocabulary.
# Alternative sizes (kept for reference):
#   max_query_words = len(corpus_total_vocab)         # the whole vocabulary
#   max_query_words = int(len(corpus_total_vocab)/8)  # one eighth of it
max_query_words = 20
print('max_query_words: ', max_query_words)

In [None]:
# Draw the query terms at random from the corpus vocabulary.
# NOTE: random.choices samples with replacement, so the same term can be
# picked more than once; no seed is set in this cell, so the query may
# differ from run to run.
query_words = random.choices(list(corpus_total_vocab), k=max_query_words)
print('query_words: ', query_words)

In [None]:
# Build the query document with the same pipeline that was used for the plays.
query_doc = {}
# The sampled words come from the already-preprocessed corpus vocabulary.
# Stemming and lemmatizing them a second time is not guaranteed to return
# the same term, and a term missing from corpus_idf would raise KeyError in
# the weighting step below.  OOV terms are out of scope for this test case,
# so any such term is dropped here.
query_doc['words'] = [w for w in preProcessWords(query_words) if w in corpus_idf]
query_doc['vocab'] = getVocabulary(query_doc['words'])
query_doc['counts'] = getWordCounts(query_doc['words'])
query_doc['tf'] = getTermFrequency(query_doc['vocab'], query_doc['counts'], len(query_doc['words']))
query_doc['wv_tf_idf'] = getWeightedTFIDFOfADocument(query_doc['vocab'], query_doc['counts'], corpus_idf)

# Terms that do not occur in the query get a weight of 0, so the query has
# an entry for every term of the corpus vocabulary.
for w in corpus_total_vocab:
    query_doc['wv_tf_idf'].setdefault(w, 0)

In [None]:
def cosine_distance(a, b):
    '''
    Input:
        a, b: Vectors accepted by cosine_similarity
    Output:
        Returns 1 minus the cosine similarity of a and b
    '''
    similarity = cosine_similarity(a, b)
    return 1 - similarity


def cosine_similarity(a, b):
    '''
    Input:
        a, b: Vectors of equal length (anything np.dot accepts)
    Output:
        Returns the cosine of the angle between a and b,
            dot(a, b) / (|a| * |b|)
        Returns 0.0 when either vector has zero norm, instead of
        the nan that a 0 / 0 division would produce
    '''
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # An all-zero vector shares no weighted term with anything.
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
query_tf_df_matrix = pd.DataFrame(columns=['Wv-TF-IDF'])

# Lay the query weights out in the row order of tfidf_matrix.  Reading
# query_doc['wv_tf_idf'].values() directly follows the dict's insertion
# order, which need not match the matrix rows (they are re-ordered by
# sort_index), so the dot product would pair up unrelated terms.
# The vector is the same for every play, so it is built once, outside the loop.
query_val = np.array(
    [query_doc['wv_tf_idf'].get(w, 0) for w in tfidf_matrix.index],
    dtype=float,
)

# Score each play by the cosine similarity between its column and the query.
for file_id in corpus_dict:
    doc_val = np.array(tfidf_matrix[file_id], dtype=float)
    pprint(doc_val.shape)
    query_tf_df_matrix.loc[file_id] = cosine_similarity(doc_val, query_val)

In [None]:
# Rank the plays: highest score in the 'Wv-TF-IDF' column first.
query_tf_df_matrix = query_tf_df_matrix.sort_values(by='Wv-TF-IDF', ascending=False)

In [None]:
# Bare expression: shown as the cell output when run in a notebook.
query_tf_df_matrix

# Build Query Document with Out-Of-Vocabulary (OOV)
- TO-DO
    