# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

## Imports

In [None]:
# !unzip stories

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

# %load_ext autotime

In [2]:
title = "stories"
alpha = 0.3

## Taking all folders

In [3]:
folders = [x[0] for x in os.walk(str(os.getcwd())+'/'+title+'/')]

In [4]:
folders

['F:\\Doctorate\\word2vecGeneral\\Information-Retrieval\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/',
 'F:\\Doctorate\\word2vecGeneral\\Information-Retrieval\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/FARNON',
 'F:\\Doctorate\\word2vecGeneral\\Information-Retrieval\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/SRE']

In [5]:
type(folders), len(folders)

(list, 3)

In [None]:
folders[0] = folders[0][:len(folders[0])-1]

In [None]:
folders

In [None]:
type(folders), len(folders)

## Collecting the file names and titles

In [6]:
dataset = []

c = False

for i in folders:
    file = open(i+"/index.html", 'r')
    text = file.read().strip()
    file.close()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    if c == False:
        file_name = file_name[2:]
        c = True
        
    print(len(file_name), len(file_title))

    for j in range(len(file_name)):
        dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

452 452
0 0
15 15


In [7]:
len(dataset)

467

In [18]:
ii = 15
file_name[ii], file_title[ii]   #temp for play

IndexError: list index out of range

In [19]:
N = len (dataset)

In [20]:
def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

# Preprocessing

In [21]:
def convert_lower_case(data):
    return np.char.lower(data)

In [22]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [23]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [24]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [25]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [26]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [27]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

In [28]:
processed_text = []
processed_title = []

for i in dataset[:N]:
    file = open(i[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i[1]))))

## Calculating DF for all words

In [30]:
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
for i in DF:
    DF[i] = len(DF[i])

In [41]:
tuple(DF.items())[12][0]

'north'

In [None]:
# DF

In [42]:
total_vocab_size = len(DF)

In [43]:
total_vocab_size

32350

In [44]:
total_vocab = [x for x in DF]

In [45]:
print(total_vocab[:20])

['sharewar', 'trial', 'project', 'freewar', 'need', 'support', 'continu', 'one', 'hundr', 'west', 'fifti', 'three', 'north', 'jim', 'prentic', 'copyright', 'thousand', 'nine', 'nineti', 'brandon']


In [46]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

### Calculating TF-IDF for body, we will consider this as the actual tf-idf as we will add the title weight to this.

In [47]:
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = processed_text[i]
    
    counter = Counter(tokens + processed_title[i])
    words_count = len(tokens + processed_title[i])
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [None]:
# tf_idf

### Calculating TF-IDF for Title

In [48]:
doc = 0

tf_idf_title = {}

for i in range(N):
    
    tokens = processed_title[i]
    counter = Counter(tokens + processed_text[i])
    words_count = len(tokens + processed_text[i])

    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        
        tf_idf_title[doc, token] = tf*idf

    doc += 1

In [50]:
tf_idf_title

{(0, 'fifti'): 0.005434960598980563,
 (0, 'go'): 0.0002906893990853149,
 (0, 'hundr'): 0.002570392381970895,
 (0, 'jim'): 0.005269857144642146,
 (0, 'nine'): 0.0008420698058556812,
 (0, 'nineti'): 0.001312716278834434,
 (0, 'north'): 0.021919902239379185,
 (0, 'one'): 0.0003992734536048051,
 (0, 'prentic'): 0.008085184948722846,
 (0, 'thousand'): 0.0008961984314476824,
 (0, 'three'): 0.0015785688576535318,
 (0, 'west'): 0.0033256596840258424,
 (1, 'fox'): 0.11198195635330804,
 (1, 'sli'): 0.11239056533822733,
 (1, 'stori'): 0.0007682063585522353,
 (2, 'bomb'): 0.023742982378177565,
 (2, 'languag'): 0.027898190361964424,
 (2, 'parser'): 0.05635662309253824,
 (2, 'smart'): 0.014515325244714838,
 (3, 'garag'): 0.008785324492085607,
 (3, 'guy'): 0.0031995505339385936,
 (3, 'pshota'): 0.004145380786745974,
 (3, 'two'): 0.0018360457195132072,
 (4, 'day'): 0.0008890808368913132,
 (4, 'earli'): 0.0028199293637210404,
 (4, 'eighteen'): 0.0042241724258131105,
 (4, 'high'): 0.006977574311606774,


In [64]:
type(tf_idf), len(tf_idf)

(dict, 344378)

In [66]:
type(tf_idf_title), len(tf_idf_title)

(dict, 2450)

In [67]:
tf_idf[(0,"go")]

0.0002906893990853149

In [68]:
tf_idf_title[(0,"go")]

0.0002906893990853149

## Merging the TF-IDF according to weights

In [69]:
for i in tf_idf:
    tf_idf[i] *= alpha

In [70]:
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [72]:
len(tf_idf), type(tf_idf)

(344378, dict)

# TF-IDF Matching Score Ranking

In [73]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[166, 200, 352, 433, 211, 350, 175, 187, 188, 294]


In [None]:
# print_doc(2)

# TF-IDF Cosine Similarity Ranking

In [74]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

### Vectorising tf-idf

In [75]:
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [76]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [77]:
def cosine_similarity(k, query):
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print(out)

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[200 166 433 402 175 169 369  87 151 183]


In [None]:
print_doc(200)