In [3]:
#Importing all the necessary packages required for pre-processing:
import numpy as np #Used for array operations
import nltk #Used as a basic package for nlp operations
from nltk.corpus import stopwords #Helps in stop words removal
from nltk.stem import WordNetLemmatizer #Helps in lemmatization process
from nltk.stem import PorterStemmer #Helps in stemming process
#Helps in tokenization of words:
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
import math #Used for mathematical equations solving
import pickle #Used to dump files to memory so, that we don't need to train model again anad again for a long time.
import os #Used to iterate files in the local system
import glob #Used for identifying paths in local system
import re #Used for removal of non-ascii characters
import sys #Used to manipulate different parts of the Python runtime environment
from pathlib import Path #Makes it very easy and efficient to deal with file paths
from collections import Counter #Used for carrying out frequency count

In [4]:
#English stop words from NLTK, held in a set so membership tests are fast:
stop={*stopwords.words('english')}
#Porter stemmer instance used by the pre-processing cells:
ps=PorterStemmer()

In [5]:
#Defining certain functions that we'll use in future:
#Function to strip every character that is not an ASCII letter, an ASCII digit or whitespace:
def remove_non_ascii_characters(data):
    """Return `data` with every character other than a-z, A-Z, 0-9 and whitespace removed.

    Despite the name, ASCII punctuation is removed as well as non-ASCII characters.
    Whitespace is kept, so word boundaries survive for later tokenization.

    Parameters:
        data: the text (str) to clean.
    Returns:
        The cleaned text as a new str.
    """
    #Raw string literal: writing \s inside a normal string is an invalid escape
    #sequence and produces a warning on recent Python versions.
    pattern=re.compile(r'[^a-zA-Z0-9\s]')
    return pattern.sub('',data)

#Function to find unique words and their frequencies in any document:
def find_unique_words_and_freq(data):
    """Map every distinct item of `data` to the number of times it occurs.

    Parameters:
        data: sequence of hashable items (the callers pass a list of words).
    Returns:
        A plain dict {item: count}; keys appear in order of first occurrence.
    """
    #Counter tallies everything in one pass. Checking membership in a list and
    #calling list.count() for each unique word was quadratic in the document length.
    return dict(Counter(data))

In [6]:
#Now, we'll start iterating the files from the given corpus:

folder='english-corpora/*' #Glob pattern matching every entry of the corpus folder

words_all=[] #Array for storing all words of corpus (not filled in this cell)
words_doc=[] #words_doc[k] holds the pre-processed tokens of the k-th file read
global_dict={} #Maps every term seen so far to its total count over the files read
files_with_index={} #Maps the running file index to the base name of the file

index=0 #Index given to the next file that is read

#The stemmer `ps` and the stop-word set `stop` are created in the earlier set-up cell.
for file_path in glob.glob(folder): #Iterating all entries matched by the pattern
    if not os.path.isfile(file_path): #A sub-directory cannot be opened as a text file, so skip it
        continue

    #The with-block closes the file handle as soon as the text has been read
    with open(file_path,"r",encoding='UTF-8') as corpus_file:
        data=corpus_file.read()

    data=remove_non_ascii_characters(data) #Keeps only ASCII letters, digits and whitespace
    data=re.sub(r'\d','',data) #Dropping the digits as well (raw string avoids an invalid escape)

    words=word_tokenize(data) #Performing tokenization on the cleaned text
    #NOTE(review): the filter that used to follow tokenization tested len(words) (the whole
    #token list) instead of len(word), so it never removed single-character tokens. Its only
    #effect was to empty documents holding at most one token, whose term could then be missing
    #from the vocabulary that the later TF/DF pass indexes into. It is dropped so that both
    #passes see the same tokens. If single-character tokens should really be removed, add a
    #len(word)>1 filter to both passes together.
    words=[ps.stem(word.lower()) for word in words] #Lower casing, then stemming every token
    words=[word for word in words if word not in stop] #Removing stop words

    words_doc.append(words) #Storing the tokens of this document

    #Adding this document's counts to the corpus-wide totals. A plain dict.update()
    #would overwrite a term's count with the count from the latest document.
    for word,count in find_unique_words_and_freq(words).items():
        global_dict[word]=global_dict.get(word,0)+count

    files_with_index[index]=os.path.basename(file_path) #Remembering which file got this index
    index=index+1

unique_words_all=set(global_dict) #We'll store all the unique words in this set.

In [7]:
len(unique_words_all) #Number of distinct terms collected in unique_words_all

524234

In [8]:
#Every unique word starts with an empty term-frequency map and a document frequency of 0.
tf={term:{} for term in unique_words_all} #A separate empty dict per word, filled in by a later cell
df=dict.fromkeys(unique_words_all,0) #0 means no document is known to contain this word yet

In [11]:
#Now, we'll traverse all the text files again and update the term frequency and document frequency.
#It will be useful in TF-IDF method later.
folder='english-corpora' #Specifying folder name containing files
pathlist=Path(folder).rglob('*.txt') #Every .txt entry below the folder, searched recursively

index=0 #Index of the document currently being processed
Lavg=0 #Initialised here but not updated in this cell
Ltot=0 #Running total of the token counts of all documents
Ld={} #Maps document index to its token count (counted before stop-word removal)

#NOTE(review): document indices here follow the order produced by rglob('*.txt'), while
#files_with_index was built from glob.glob('english-corpora/*'). Confirm that both list the
#same files in the same order, otherwise the indices refer to different documents.
for path in pathlist: #Traversing all files using loop
    if not path.is_file(): #Skip directories whose name happens to end in .txt
        continue
    fname=str(path) #Storing file name as variable
    #The with-block closes the file handle as soon as the text has been read
    with open(fname,"r",encoding="utf8") as corpus_file:
        data=corpus_file.read()
    data=remove_non_ascii_characters(data) #Keeps only ASCII letters, digits and whitespace
    data=re.sub(r'\d','',data) #Dropping the digits (raw string avoids an invalid escape)
    words=word_tokenize(data) #Performing word tokenization
    words=[ps.stem(word.lower()) for word in words] #Lower casing, then stemming every token
    Ld[index]=len(words) #len of current doc
    Ltot=Ltot+len(words) #sum of lens of all the docs
    words=[word for word in words if word not in stop] #Stop words removal
    counter=Counter(words) #Term frequency of every remaining term in this document
    for term,count in counter.items(): #Iterating the unique words of this document
        #get()/setdefault() keep the pass from raising KeyError when a term is missing
        #from the tables prepared from the first pass over the corpus.
        df[term]=df.get(term,0)+1 #One more document contains this term
        tf.setdefault(term,{})[index]=count #Term frequency of this term in document `index`
    index=index+1 #Incrementing index counter

In [12]:
#Creating the output folder first, so that open() does not fail when it does not exist yet
os.makedirs('Saved',exist_ok=True)
#Saving it as posting list using pickle file
#(each with-block closes its file on exit, so no explicit close() is needed)
with open('Saved/posting_list.pkl','wb') as out_file:
    pickle.dump(tf,out_file)
#Saving it as df using pickle file
with open('Saved/df.pkl','wb') as out_file:
    pickle.dump(df,out_file)
#Saving it as doc_len using pickle file
with open('Saved/doc_len.pkl','wb') as out_file:
    pickle.dump(Ld,out_file)

In [14]:
#For every document, compute the Euclidean length of its weight vector, where the weight of
#a term is (count of the term in the document) * log(number of documents / df of the term).
doc_norm={} #Maps document index to that length
total_docs=len(files_with_index) #Same for every document, so computed once
for idx,doc_words in enumerate(words_doc): #Traversing the word list of every document
    l2=0 #Sum of squared weights of the current document
    #Counter yields each distinct term with its count in one pass; calling
    #list.count() for every distinct term was quadratic in the document length.
    for term,term_count in Counter(doc_words).items():
        l2+=(term_count*math.log(total_docs/df[term]))**2
    doc_norm[idx]=math.sqrt(l2)

In [15]:
#Saving the index -> file name mapping using pickle file
#(the with-block closes the file even if pickle.dump raises)
with open("Saved/file_idx.pkl","wb") as a_file:
    pickle.dump(files_with_index, a_file)

In [16]:
#Saving the set of all unique words using pickle file
#(the with-block closes the file even if pickle.dump raises)
with open("Saved/unique_words_all.pkl","wb") as a_file:
    pickle.dump(unique_words_all , a_file)

In [18]:
#Saving the per-document word lists using pickle file
#(the with-block closes the file on exit, so no explicit close() is needed)
with open('Saved/doc_words.pkl','wb') as out_file:
    pickle.dump(words_doc,out_file)

In [19]:
#Saving the document norms using pickle file
#(the with-block closes the file on exit, so no explicit close() is needed)
with open('Saved/doc_norm.pkl','wb') as out_file:
    pickle.dump(doc_norm,out_file)