In [1]:
import nltk
import os
import re
import numpy as np
import pandas as pd
from nltk.corpus import PlaintextCorpusReader,stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import string

In [2]:
 Corpus=PlaintextCorpusReader(r'C:\Users\Deepak\text', ".*\.txt")      #for reading and storing all the seperated text files into corpus. 

# Pre-Processing

In [3]:
print(Corpus.fileids()[0:5])                    #for checking if the files in corpus or showing up properly or not.

['1.txt', '10.txt', '100.txt', '101.txt', '102.txt']


In [48]:
file_dict={}
for file in Corpus.fileids():  
    final_word_list=[]
    word_List = [WordNetLemmatizer().lemmatize(word.lower()) for word in Corpus.words(file) if word not in stopwords.words('english') and word not in string.punctuation ]
    final_string = ' '.join(word_List)
    file_dict[file] = final_string
tfidf = TfidfVectorizer()
tfs = tfidf.fit_transform(file_dict.values())

#for word in wordList:
#        final_word_list.append(PorterStemmer().stem(word))
#not doing stemming because the words are not being stemmed properly, like recently to rec, genetics to genet, etc., I believe lemmatization works fine compared to stemming. Henceforth used that.   

In [92]:
tfs                            #for checking matrix size after pre-processing for 431 documents with respect to it's terms.

<431x8403 sparse matrix of type '<class 'numpy.float64'>'
	with 50038 stored elements in Compressed Sparse Row format>

In [6]:
feature_names = tfidf.get_feature_names()
dense = tfs.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
#created a dataframe to visulaize the vectors created for all the 431 files.

In [91]:
df.head()                                   #top 5 rows i.e., top 5 text files are shown with respect to each of its terms in columns with tf-idf weights.

Unnamed: 0,00,000,0000,00001,0001,0002,0008,0009,001,002,...,ziegler,ziemann,ziemba,zimmerman,zinck,zoeller,zone,zoucha,zung,zxy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.067255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
df['benefit'].head()                   #for checking a particular term weights for the whole collection . 

0    0.18896
1    0.00000
2    0.00000
3    0.00000
4    0.00000
Name: benefit, dtype: float64

In [385]:
print(tfs[0])                                #for checking the tf-idf values of each terms in document "1.txt", indexing start with 0.

  (0, 1313)	0.22498820329632568
  (0, 1127)	0.1889603373283924
  (0, 2582)	0.10431304373475285
  (0, 909)	0.06172753364024214
  (0, 3403)	0.35728235459514507
  (0, 4405)	0.1468117747281824
  (0, 4275)	0.07797923419543072
  (0, 8293)	0.0838151909406808
  (0, 7635)	0.01791023339133294
  (0, 5513)	0.06929703329056856
  (0, 4502)	0.08163033482804177
  (0, 5489)	0.0717717098145767
  (0, 4714)	0.12469009051607932
  (0, 3964)	0.05300972419939596
  (0, 5274)	0.05164542465610859
  (0, 3398)	0.07247383648732525
  (0, 3777)	0.06507840354064179
  (0, 3406)	0.10431304373475285
  (0, 6117)	0.05946710180050462
  (0, 7677)	0.030595987637157065
  (0, 5493)	0.07499606776544189
  (0, 6239)	0.05100369080468413
  (0, 7379)	0.037617542755676275
  (0, 2276)	0.08932058864878627
  (0, 4504)	0.06507840354064179
  :	:
  (0, 1541)	0.093228648767715
  (0, 2090)	0.2086260874695057
  (0, 610)	0.06298677910946414
  (0, 6370)	0.056572141060284506
  (0, 4039)	0.0859264475024929
  (0, 2735)	0.12345506728048428
  (0, 650

# Similarity checking

In [364]:
query1='premature children'                     #input query as per the need 
#query2='Gynecological cancer'

In [365]:
final_query_list = [WordNetLemmatizer().lemmatize(word.lower()) for word in query1.split() if word not in stopwords.words('english') and word not in string.punctuation ]
final_str = ' '.join(final_query_list)                #pre-processing for better results

In [366]:
final_str

'premature child'

In [367]:
similarity = cosine_similarity(tfidf.transform([final_str]),tfs)     # for calculating similarity from given query to available document collection.
similarDocs = cosine_similarity(tfs[0],tfs)                          # for calculating documents similar to given document, can change tfs[0] to required document id(tfs[0-430] to which, similar document needs to be calculated in the collection.)

In [368]:
similarity.shape

(1, 431)

In [369]:
sorted_similarity=np.sort(similarity)        #for sorting the similarity and similarDocs arrays
sorted_similarDocs=np.sort(similarDocs)

In [374]:
list(zip(*np.where(similarity>0.15)))            #I have set minimum threshold of 0.15 for calculating the query to document similarity and listing them as shown in below output.
                                                 #add +1 to the indexes below to get the file name.
                                                 #0+1=1, 13+1=14...... since all the files are started with index 0(tfs[0]). Files 1,14.... are referred to 1.txt,14.txt,... files.
                                                 #46, 54, 214,... docs are similar to premature children
                                                 #2, 72, 122,... docs are similar to gynecological cancer

[(0, 45),
 (0, 54),
 (0, 78),
 (0, 100),
 (0, 145),
 (0, 167),
 (0, 204),
 (0, 213),
 (0, 250),
 (0, 252),
 (0, 281),
 (0, 303),
 (0, 346),
 (0, 347),
 (0, 348),
 (0, 372)]

In [384]:
list(zip(*np.where(similarDocs>0.18)))         #I have set minimum threshold of 0.18 for calculating the document to document similarity in collection and listing them as shown in below output. Can decrease the value for more similar documents retrieval 

[(0, 0), (0, 308), (0, 351)]

In [315]:
(pd.DataFrame(sorted_similarity.reshape(431,1)))[-10:]            # when k=10, similar documents for a given query with cosine_similarity scores are shown.

Unnamed: 0,0
421,0.162462
422,0.162566
423,0.162682
424,0.165996
425,0.173836
426,0.178846
427,0.197731
428,0.203661
429,0.213362
430,0.269405


In [316]:
(pd.DataFrame(sorted_similarity.reshape(431,1)))[-5:]            # when k=5, similar documents for a given query with cosine_similarity scores are shown.

Unnamed: 0,0
426,0.178846
427,0.197731
428,0.203661
429,0.213362
430,0.269405


In [317]:
(pd.DataFrame(sorted_similarity.reshape(431,1)))[-3:]            # when k=3, similar documents for a given query with cosine_similarity scores are shown.

Unnamed: 0,0
428,0.203661
429,0.213362
430,0.269405


In [181]:
(pd.DataFrame(sorted_similarDocs.reshape(431,1)))[-5:]      # when k=5, similar documents for a given documents with high cosine_similarity scores are shown.

Unnamed: 0,0
426,0.173054
427,0.175987
428,0.188436
429,0.191291
430,1.0


In [375]:
(pd.DataFrame(sorted_similarDocs.reshape(431,1)))[-3:]      # when k=3, similar documents for a given documents with high cosine_similarity scores are shown.

Unnamed: 0,0
428,0.188436
429,0.191291
430,1.0


In [318]:
(pd.DataFrame(sorted_similarDocs.reshape(431,1)))[-1:]      #when k=1, similar documents for a given documents with high cosine_similarity scores are shown.

Unnamed: 0,0
430,1.0
