In [1]:
!pip install nltk numpy



In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/emmanuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
f = open('Stopword-List.txt', 'r')
stop = f.read()

In [4]:
import re

pattern = r"[\w]+|[^-_\w\s()@#$%^&*+={[\]};,<>./?~`\"]"

def tokenize(text):
    return re.findall(pattern, text)

In [5]:
from collections import defaultdict
import os
import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()

docs = set()
dic = {}
punc = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '{', '[', ']', '}', ':', ';', "'", '"', ',', '<', '>', '.', '/', '?', '~', '`']

for subdir, dirs, files in os.walk('ResearchPapers'):
    for file in files:
        with open(subdir + os.sep + file, 'r', encoding='cp1252') as txt:
            # Extract document ID from the filename
            doc = re.search('[0-9]*', file).group()
            doc = int(doc)
            docs.add(doc) # Add document ID to the set of docs
            # Read and tokenize the text
            tokens = tokenize(txt.read())
            
            # Process each token
            for t in tokens:
                if t not in stop and t not in punc:
                    # Lowercase and stem the token
                    term = ps.stem(t.lower())

                    # Update the inverted index
                    if term not in dic:
                        dic[term] = defaultdict(int)
                    dic[term][doc] += 1

In [6]:
#Create a mapping from docs to indices in term-document matrix

doc_list = list(docs)
doc_map = dict()
for i in range(len(doc_list)):
    doc_map[doc_list[i]] = i

In [7]:
#Create a mapping from dictionary terms to indices in term-document matrix

dic_map = dict()
keys = sorted(dic.keys())
for i in range(len(keys)):
    dic_map[keys[i]] = i

In [8]:
# Fill the matrix with respective term frequencies

import numpy as np
a = np.empty(shape=(len(docs), len(dic)))
a.fill(0)
for key1, value1 in dic.items():
    for key2, value2 in value1.items():
        a[doc_map[key2]][dic_map[key1]] = value2

In [11]:
# Multiply each entry with its respect inverse document frequency

import math
from numpy import array
from numpy.linalg import norm
idf_map = dict()
for key1, value1 in dic.items():
    idf = math.log10(20/len(value1.keys()))
    idf_map[key1] = idf
    for key2, value2 in value1.items():
        a[doc_map[key2]][dic_map[key1]] *= idf

# Divide each vector with its norm

for vec in a:
    vec /= norm(vec)

[[0.         0.00021344 0.00096927 ... 0.00162425 0.         0.01196151]
 [0.         0.00012997 0.         ... 0.00427617 0.         0.        ]
 [0.         0.         0.         ... 0.01034325 0.         0.        ]
 ...
 [0.         0.00090674 0.         ... 0.         0.         0.        ]
 [0.         0.00212811 0.0027059  ... 0.00826861 0.         0.        ]
 [0.         0.00059461 0.         ... 0.00220191 0.         0.        ]]


In [10]:
# Store term-document matrix and mappings as pickle files

import pickle

with open("tdm.pkl", "wb") as f:
    pickle.dump(a, f)

with open("doc_map.pkl", "wb") as f:
    pickle.dump(doc_map, f)

with open("dic_map.pkl", "wb") as f:
    pickle.dump(dic_map, f)