In [1]:
import string
import re
import collections
import math

In [2]:
def preprocess(path='002.txt'):
    """Normalise the corpus file at ``path`` and overwrite it in place.

    Each input line is processed as follows:

    * ``/`` is replaced by a space; the trailing newline, commas, hyphens,
      full stops and apostrophes are deleted;
    * the line is split on single spaces and every token has leading and
      trailing ``string.punctuation`` characters stripped;
    * tokens shorter than four characters are dropped and one trailing
      ``s`` is removed from each remaining token.

    The surviving tokens of a line are joined with single spaces and
    appended to the output WITHOUT a line terminator. A line that yields no
    tokens (for example a blank line) contributes a single newline instead,
    so those lines are what separate documents in the rewritten file.

    WARNING: the file is rewritten, so this function is not idempotent.
    Restore the original file before running it again.

    Args:
        path: File to normalise in place. Defaults to ``'002.txt'``.
    """
    pieces = []
    # Read-only mode is enough here; the file is reopened for writing below.
    with open(path, 'r') as f:
        for line in f:
            line = line.replace('/', ' ')
            # Raw string: the original non-raw literal relied on the invalid
            # escape sequence "\." which newer Python versions warn about.
            tokens = re.sub(r"\n$|,|-|\.|'", '', line).split(' ')
            stripped = [token.strip(string.punctuation) for token in tokens]
            # Keep words of at least 4 characters and drop a trailing 's'.
            kept = [re.sub('s$', '', word) for word in stripped if len(word) >= 4]
            # A line without usable words becomes the document separator.
            pieces.append(' '.join(kept) if kept else '\n')
    with open(path, 'w') as f:
        f.write(''.join(pieces))

In [3]:
# Normalise 002.txt in place. The file is overwritten, so restore the
# original 002.txt before re-running this cell.
preprocess()

In [4]:
def create_inverted_index(path='002.txt'):
    """Build a positional inverted index from the corpus file at ``path``.

    Every line of the file is treated as one document. Documents are
    numbered from 1 in file order and identified as ``'D001'``, ``'D002'``
    and so on. Each line is split on single spaces after its trailing
    newline is removed, and tokens are matched case-insensitively
    (lower-cased). Splitting on single spaces means an empty line or
    consecutive spaces produce the empty-string token, which is indexed
    like any other term.

    The file is read in a single pass. Terms appear in the index in order
    of first occurrence, postings in document order and positions in
    ascending order.

    Args:
        path: Corpus file to index. Defaults to ``'002.txt'``.

    Returns:
        dict: ``{term: {DID: [position, ...]}}`` where each position is the
        zero-based token offset of the term inside that document.
    """
    index = {}
    with open(path, 'r') as f:
        for doc_number, line in enumerate(f, 1):
            DID = 'D' + f"{doc_number:03d}"
            tokens = re.sub(r'\n$', '', line).split(' ')
            for position, word in enumerate(tokens):
                # full-text match and ignore letter case
                index.setdefault(word.lower(), {}).setdefault(DID, []).append(position)
    return index

In [5]:
# Build the positional inverted index once and keep it as a module-level
# global; later cells read INVERTED_INDEX directly.
INVERTED_INDEX = create_inverted_index()
# Bare expression so the notebook displays the index as the cell output.
INVERTED_INDEX

{'shower': {'D001': [0]},
 'continued': {'D001': [1], 'D078': [18], 'D093': [10]},
 'throughout': {'D001': [2]},
 'week': {'D001': [3],
  'D002': [1],
  'D004': [19],
  'D019': [4, 30],
  'D028': [25],
  'D052': [4, 30],
  'D058': [25]},
 'bahia': {'D001': [4], 'D003': [17], 'D004': [11], 'D011': [1]},
 'cocoa': {'D001': [5],
  'D002': [20],
  'D003': [9],
  'D004': [4, 26],
  'D012': [7]},
 'zone': {'D001': [6]},
 'alleviating': {'D001': [7]},
 'drought': {'D001': [8]},
 'since': {'D001': [9]},
 'early': {'D001': [10], 'D046': [27], 'D066': [2]},
 'january': {'D001': [11], 'D021': [3], 'D032': [26], 'D054': [3]},
 'improving': {'D001': [12]},
 'prospect': {'D001': [13]},
 'coming': {'D001': [14], 'D028': [24], 'D058': [24]},
 'temporao': {'D001': [15]},
 'although': {'D001': [16], 'D026': [0], 'D057': [0]},
 'normal': {'D001': [17]},
 'humidity': {'D001': [18]},
 'level': {'D001': [19], 'D032': [3, 14, 21]},
 'have': {'D001': [20],
  'D004': [21],
  'D018': [4],
  'D025': [7],
  'D032

In [6]:
def create_document_vector():
    """Invert INVERTED_INDEX into a per-document view.

    One entry is created for every line of '002.txt' (``'D001'``,
    ``'D002'``, ...), including documents that end up with no terms. Each
    entry maps the terms found in that document to the position list stored
    for it in INVERTED_INDEX.

    Returns:
        dict: ``{DID: {term: [position, ...]}}``.
    """
    with open('002.txt', 'r+') as corpus:
        line_total = sum(1 for _ in corpus)
    document_vector = {'D' + f"{number:03d}": {} for number in range(1, line_total + 1)}
    # Walk the postings once; terms reach each document in index order.
    for term, postings in INVERTED_INDEX.items():
        for doc_id, positions in postings.items():
            # Ignore postings for documents that are not lines of the file.
            if doc_id in document_vector:
                document_vector[doc_id][term] = positions
    return document_vector

In [7]:
# Per-document view of the index ({DID: {term: positions}}), kept as a
# module-level global that later cells read directly.
DOCUMENT_VECTOR = create_document_vector()
# Bare expression so the notebook displays the mapping as the cell output.
DOCUMENT_VECTOR

{'D001': {'shower': [0],
  'continued': [1],
  'throughout': [2],
  'week': [3],
  'bahia': [4],
  'cocoa': [5],
  'zone': [6],
  'alleviating': [7],
  'drought': [8],
  'since': [9],
  'early': [10],
  'january': [11],
  'improving': [12],
  'prospect': [13],
  'coming': [14],
  'temporao': [15],
  'although': [16],
  'normal': [17],
  'humidity': [18],
  'level': [19],
  'have': [20],
  'been': [21],
  'restored': [22],
  'comissaria': [23],
  'smith': [24],
  'said': [25],
  'weekly': [26],
  'review': [27]},
 'D002': {'week': [1],
  'cocoa': [20],
  'arrival': [0, 25],
  'ended': [2],
  'february': [3],
  'were': [4],
  '155221': [5],
  'bag': [6],
  'kilo': [7],
  'making': [8],
  'cumulative': [9],
  'total': [10],
  'season': [11],
  'against': [12],
  'same': [13],
  'stage': [14],
  'last': [15],
  'year': [16],
  'again': [17],
  'seem': [18],
  'that': [19],
  'delivered': [21],
  'earlier': [22],
  'consignment': [23],
  'included': [24],
  'figure': [26]},
 'D003': {'bahia

In [8]:
def compute_term_weights(document_vector=None, inverted_index=None):
    """Compute a TF-IDF weight for every term of every document.

    The weight of term ``t`` in document ``d`` is::

        (tf(t, d) / max_tf(d)) * log2(N / df(t))

    where ``tf`` is the number of positions recorded for ``t`` in ``d``,
    ``max_tf`` is the largest ``tf`` in ``d``, ``N`` is the number of
    documents and ``df`` is the number of documents whose postings contain
    ``t``.

    Args:
        document_vector: ``{DID: {term: [position, ...]}}``. Defaults to the
            module-level ``DOCUMENT_VECTOR``.
        inverted_index: ``{term: {DID: [position, ...]}}``. Defaults to the
            module-level ``INVERTED_INDEX``.

    Returns:
        dict: ``{DID: {term: weight}}``. A document without terms maps to an
        empty dict.
    """
    if document_vector is None:
        document_vector = DOCUMENT_VECTOR
    if inverted_index is None:
        inverted_index = INVERTED_INDEX

    # N and max_tf do not depend on the term, so compute them outside the
    # per-term work instead of once per term.
    N = len(document_vector)
    document_term_weights = {}
    for DID, postings in document_vector.items():
        # default=0 keeps empty documents from raising; it is never used as
        # a divisor because such a document has no terms to weight.
        max_tf = max((len(positions) for positions in postings.values()), default=0)
        document_term_weights[DID] = {
            term: (len(positions) / max_tf) * math.log2(N / len(inverted_index[term]))
            for term, positions in postings.items()
        }
    return document_term_weights

In [9]:
# TF-IDF weights per document ({DID: {term: weight}}), kept as a
# module-level global that the ranking and display functions read.
DOCUMENT_TERM_WEIGHTS = compute_term_weights()
# Bare expression so the notebook displays the weights as the cell output.
DOCUMENT_TERM_WEIGHTS

{'D001': {'shower': 6.643856189774724,
  'continued': 5.058893689053568,
  'throughout': 6.643856189774724,
  'week': 3.8365012677171206,
  'bahia': 4.643856189774724,
  'cocoa': 4.321928094887363,
  'zone': 6.643856189774724,
  'alleviating': 6.643856189774724,
  'drought': 6.643856189774724,
  'since': 6.643856189774724,
  'early': 5.058893689053568,
  'january': 4.643856189774724,
  'improving': 6.643856189774724,
  'prospect': 6.643856189774724,
  'coming': 5.058893689053568,
  'temporao': 6.643856189774724,
  'although': 5.058893689053568,
  'normal': 6.643856189774724,
  'humidity': 6.643856189774724,
  'level': 5.643856189774724,
  'have': 2.9434164716336326,
  'been': 4.321928094887363,
  'restored': 6.643856189774724,
  'comissaria': 4.321928094887363,
  'smith': 3.643856189774725,
  'said': 0.5353317329965557,
  'weekly': 6.643856189774724,
  'review': 6.643856189774724},
 'D002': {'week': 1.9182506338585603,
  'cocoa': 2.1609640474436813,
  'arrival': 6.643856189774724,
  'e

In [10]:
def compute_queries_frequency(path='query.txt'):
    """Read the query file and count the terms of every query.

    Each line of the file is one query. The trailing newline and all
    double-quote characters are removed, the line is split on single spaces
    and the terms are lower-cased before counting. No other normalisation
    is applied here.

    Args:
        path: Query file to read. Defaults to ``'query.txt'``.

    Returns:
        list[dict]: one ``{term: count}`` dict per line, in file order.
    """
    queries = []
    # The file is only read, so it does not need to be opened for update.
    with open(path, 'r') as f:
        for line in f:
            terms = re.sub(r'\n$|"', '', line).split(' ')
            query = {}
            for term in terms:
                term = term.lower()
                query[term] = query.get(term, 0) + 1
            queries.append(query)
    return queries

In [11]:
# Term counts for every query line of query.txt, kept as a module-level
# global that main() iterates over.
QUERY_FREQUENCY = compute_queries_frequency()
# Bare expression so the notebook displays the parsed queries.
QUERY_FREQUENCY

[{'bank': 1},
 {'stock': 1, 'banking': 1},
 {'the': 1, 'company': 1, 'share': 1},
 {'company': 1, 'benefit': 1, 'shares': 1},
 {'brown': 1, 'forman': 1}]

In [12]:
def main():
    """Answer every query in QUERY_FREQUENCY.

    For each query the term counts are printed, all documents are scored
    against it, and the three best-scoring documents are reported.
    """
    for term_counts in QUERY_FREQUENCY:
        print(term_counts)
        scores = compute_similarity_score(term_counts)
        best_documents = get_top_3_documents(term_counts, scores)
        display_result(best_documents, scores)

In [13]:
def compute_similarity_score(query):
    """Score every document against ``query`` with cosine similarity.

    The query vector uses the raw term counts in ``query``; the document
    vectors use the weights in DOCUMENT_TERM_WEIGHTS. The score is the dot
    product over the query terms present in the document, divided by the
    product of the two vector lengths (see ``compute_length``).

    Args:
        query: ``{term: count}`` for one query.

    Returns:
        dict: ``{DID: score}`` for every document in DOCUMENT_TERM_WEIGHTS.
        The score is 0.0 when either vector has zero length (for example an
        empty document or one whose weights are all 0), instead of raising
        ZeroDivisionError.
    """
    # The query length is the same for every document: compute it once.
    query_length = compute_length(query)
    cos_sim = {}
    for DID, term_weights in DOCUMENT_TERM_WEIGHTS.items():
        dq_sum = 0
        for term, frequency in query.items():
            if term in term_weights:
                dq_sum += frequency * term_weights[term]
        norm_product = compute_length(term_weights) * query_length
        # A zero-length vector shares nothing with the other one.
        cos_sim[DID] = dq_sum / norm_product if norm_product else 0.0
    return cos_sim

In [14]:
def get_top_3_documents(query, cos_sim):
    """Return the IDs of the (up to) three highest-scoring documents.

    Args:
        query: Unused; kept so existing callers keep working. The ranking
            depends only on ``cos_sim``.
        cos_sim: ``{DID: score}`` as produced by compute_similarity_score.

    Returns:
        list: document IDs ordered by descending score. Ties keep the
        iteration order of ``cos_sim`` because the sort is stable. Fewer
        than three IDs are returned when ``cos_sim`` has fewer entries.
    """
    # Sort once. The previous per-term loop repeated the identical sort and
    # left the result unbound (UnboundLocalError) for an empty query.
    ranked = sorted(cos_sim.items(), key=lambda item: item[1], reverse=True)
    return [DID for DID, _ in ranked[:3]]

In [15]:
def display_result(top_3_DID, cos_sim):
    """Print a short report for each document ID in ``top_3_DID``.

    For every document the report shows its ID, its five highest weighted
    keywords with their postings, the value returned by
    compute_unique_word_number, the length of its weight vector and its
    similarity score taken from ``cos_sim``, followed by a separator line.
    """
    separator = '--------------------------------------------------'
    for doc_id in top_3_DID:
        print(doc_id)
        print('First 5 highest weighted keywords of the document')
        print_highest_weighted_keywords(doc_id)
        keyword_count = compute_unique_word_number(doc_id)
        print(f'Number of unique keywords in the document: {keyword_count}')
        magnitude = compute_length(DOCUMENT_TERM_WEIGHTS[doc_id])
        print(f'Magnitude of the document vector(L2 norm): {magnitude}')
        print(f'Similarity score: {cos_sim[doc_id]}')
        print(separator)

In [16]:
def compute_length(vector):
    """Return the Euclidean (L2) length of the values of ``vector``.

    Args:
        vector: mapping whose values are numbers; the keys are ignored.

    Returns:
        float: square root of the sum of the squared values (0.0 for an
        empty mapping).
    """
    squares = [math.pow(component, 2) for component in vector.values()]
    squared_total = 0
    # Plain left-to-right accumulation in the mapping's value order.
    for square in squares:
        squared_total = squared_total + square
    return math.sqrt(squared_total)

In [17]:
def print_highest_weighted_keywords(DID):
    """Print the five highest weighted terms of document ``DID``.

    Terms are ranked by their weight in DOCUMENT_TERM_WEIGHTS, highest
    first. Each term is printed on its own line followed by its complete
    postings list from INVERTED_INDEX, formatted as
    ``|DID:pos,pos|DID:pos|``.
    """
    ranked = sorted(DOCUMENT_TERM_WEIGHTS[DID].items(), key=lambda pair: pair[1], reverse=True)
    for keyword, _weight in ranked[:5]:
        entries = [
            doc_id + ':' + ','.join(map(str, offsets)) + '|'
            for doc_id, offsets in INVERTED_INDEX[keyword].items()
        ]
        postings_text = '|' + ''.join(entries)
        print(keyword + '\t-> ' + postings_text)

In [18]:
def compute_unique_word_number(DID):
    """Count the terms that occur exactly once in document ``DID``.

    A term is counted when its position list in DOCUMENT_VECTOR[DID] has
    exactly one entry.

    NOTE(review): display_result labels this value "Number of unique
    keywords". If the number of distinct terms was intended, that would be
    len(DOCUMENT_VECTOR[DID]) instead - confirm against the requirements.
    """
    single_occurrences = 0
    for positions in DOCUMENT_VECTOR[DID].values():
        if len(positions) == 1:
            single_occurrences += 1
    return single_occurrences

In [19]:
# Run every query and print the report for its three best-scoring documents.
main()

{'bank': 1}
D084
First 5 highest weighted keywords of the document
billion	-> |D029:5|D036:13,22|D059:5|D064:7,10|D084:9,24,27|D090:14|D091:8|D098:23|
bank	-> |D020:22|D030:12,36|D053:22|D060:3,27|D073:7|D084:1,17,22|D089:20|D090:20|D092:24|
agreed	-> |D084:5|
principle	-> |D084:6|
revision	-> |D084:7|
Number of unique keywords in the document: 20
Magnitude of the document vector(L2 norm): 9.745449960747372
Similarity score: 0.35646698739664956
--------------------------------------------------
D060
First 5 highest weighted keywords of the document
debt	-> |D019:19|D028:8|D029:16|D030:22,27,32|D036:5|D037:18|D052:19|D058:8|D059:16|D060:13,18,23|D090:15|
bank	-> |D020:22|D030:12,36|D053:22|D060:3,27|D073:7|D084:1,17,22|D089:20|D090:20|D092:24|
even	-> |D030:15|D060:6|
fail	-> |D030:20|D060:11|
restructured	-> |D030:29|D060:20|
Number of unique keywords in the document: 23
Magnitude of the document vector(L2 norm): 8.030220957974697
Similarity score: 0.2884047820944454
------------------