In [12]:
import nltk
import numpy as np
from tqdm.notebook import tqdm_notebook 
import tqdm
import pandas as pd
import os
import math
import warnings
warnings.filterwarnings('ignore')
import argparse
import timeit

from ir_system import IRSystem

Reading and separating documents

In [13]:
ExpReg = nltk. RegexpTokenizer('(?:[A-Za-z]\.)+|\d+(?:\.\d+)?%?|\w+(?:\-\w+)*')
MotsVides = nltk.corpus.stopwords.words('english')
Porter = nltk.PorterStemmer()
Lancaster = nltk.LancasterStemmer()

In [14]:
def load_data(path):
    
    
    #_____________ Read data from CISI.ALL file and store in dictinary ________________
    
    with open(os.path.join(path, 'CISI.ALL')) as f:
        lines = ""
        for l in f.readlines():
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        lines = lines.lstrip("\n").split("\n")
 
    doc_set = {}
    doc_id = ""
    doc_text = ""

    for l in lines:
        if l.startswith(".I"):
            doc_id = l.split(" ")[1].strip() 
        elif l.startswith(".X"):
            doc_set[doc_id] = doc_text.lstrip(" ")
            doc_id = ""
            doc_text = ""
        elif l.startswith(".T") or l.startswith(".W"):
            doc_text += l.strip()[3:] + " "

    print(f"Number of documents = {len(doc_set)}")
    print(doc_set["1"]) 
    
    
    #_____________ Read data from CISI.QRY file and store in dictinary ________________
    
    with open(os.path.join(path, 'CISI.QRY')) as f:
        lines = ""
        for l in f.readlines():
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        lines = lines.lstrip("\n").split("\n")
          
    qry_set = {}
    qry_id = ""
    for l in lines:
        if l.startswith(".I"):
            qry_id = l.split(" ")[1].strip() 
        elif l.startswith(".W"):
            qry_set[qry_id] = l.strip()[3:]
            qry_id = ""

    print(f"\n\nNumber of queries = {len(qry_set)}")    
    print(qry_set["1"]) 
    
    
    #_____________ Read data from CISI.REL file and store in dictinary ________________
    
    rel_set = {}
    with open(os.path.join(path, 'CISI.REL')) as f:
        for l in f.readlines():
            qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0] 
            doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]

            if qry_id in rel_set:
                rel_set[qry_id].append(doc_id)
            else:
                rel_set[qry_id] = []
                rel_set[qry_id].append(doc_id)

    print(f"\n\nNumber of mappings = {len(rel_set)}")
    print(rel_set["1"]) 
    
    doc_set = {int(id):doc for (id,doc) in doc_set.items()}
    qry_set = {int(id):qry for (id,qry) in qry_set.items()}
    rel_set = {int(qid):list(map(int, did_lst)) for (qid,did_lst) in rel_set.items()}
    
    return doc_set, qry_set, rel_set

In [15]:
doc_set, qry_set, rel_set = load_data('documents')

Number of documents = 1460
18 Editions of the Dewey Decimal Classifications The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 


Number of queries = 112
What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?


Number of mappings = 76
['28', '35', '38', '42', '43', '52', '65', '76', '86', '150', '189', '192', '193', '195', '215', '26

Separating words

In [13]:

dico_terms = {}
for key in doc_set.keys():
    #termes = doc_set[key].split(" ")
    termes = ExpReg.tokenize(doc_set[key])
    dico_terms[key] = termes

dico_terms

{1: ['18',
  'Editions',
  'of',
  'the',
  'Dewey',
  'Decimal',
  'Classifications',
  'The',
  'present',
  'study',
  'is',
  'a',
  'history',
  'of',
  'the',
  'DEWEY',
  'Decimal',
  'Classification',
  'The',
  'first',
  'edition',
  'of',
  'the',
  'DDC',
  'was',
  'published',
  'in',
  '1876',
  'the',
  'eighteenth',
  'edition',
  'in',
  '1971',
  'and',
  'future',
  'editions',
  'will',
  'continue',
  'to',
  'appear',
  'as',
  'needed',
  'In',
  'spite',
  'of',
  'the',
  'DDC',
  's',
  'long',
  'and',
  'healthy',
  'life',
  'however',
  'its',
  'full',
  'story',
  'has',
  'never',
  'been',
  'told',
  'There',
  'have',
  'been',
  'biographies',
  'of',
  'Dewey',
  'that',
  'briefly',
  'describe',
  'his',
  'system',
  'but',
  'this',
  'is',
  'the',
  'first',
  'attempt',
  'to',
  'provide',
  'a',
  'detailed',
  'history',
  'of',
  'the',
  'work',
  'that',
  'more',
  'than',
  'any',
  'other',
  'has',
  'spurred',
  'the',
  'growth'

Stemming words

In [5]:


doc_stems_porter = {}
doc_stems_lancaster = {}
for key in dico_terms.keys():
    TermesSansMotsVides_p = [Porter.stem(terme) for terme in dico_terms[key] if terme.lower() not in MotsVides]
    TermesSansMotsVides_l = [Lancaster.stem(terme) for terme in dico_terms[key] if terme.lower() not in MotsVides]
    doc_stems_porter[key] = TermesSansMotsVides_p
    doc_stems_lancaster[key] = TermesSansMotsVides_l

doc_stems_porter


{1: ['18',
  'edit',
  'dewey',
  'decim',
  'classif',
  'present',
  'studi',
  'histori',
  'dewey',
  'decim',
  'classif',
  'first',
  'edit',
  'ddc',
  'publish',
  '1876',
  'eighteenth',
  'edit',
  '1971',
  'futur',
  'edit',
  'continu',
  'appear',
  'need',
  'spite',
  'ddc',
  'long',
  'healthi',
  'life',
  'howev',
  'full',
  'stori',
  'never',
  'told',
  'biographi',
  'dewey',
  'briefli',
  'describ',
  'system',
  'first',
  'attempt',
  'provid',
  'detail',
  'histori',
  'work',
  'spur',
  'growth',
  'librarianship',
  'countri',
  'abroad'],
 2: ['use',
  'made',
  'technic',
  'librari',
  'report',
  'analysi',
  '6300',
  'act',
  'use',
  '104',
  'technic',
  'librari',
  'unit',
  'kingdom',
  'librari',
  'use',
  'one',
  'aspect',
  'wider',
  'pattern',
  'inform',
  'use',
  'inform',
  'transfer',
  'librari',
  'restrict',
  'use',
  'document',
  'take',
  'account',
  'document',
  'use',
  'outsid',
  'librari',
  'still',
  'less',
  'i

Calculate frequencies for each word

In [6]:
def descripteur(dico):
    TermesFrequence = {}
    for key in dico.keys():
        for terme in dico[key]:
            if (key,terme) in TermesFrequence.keys():
                TermesFrequence[(key,terme)] += 1
            else:
                TermesFrequence[(key,terme)] = 1
    return TermesFrequence

In [7]:
freqs_porter = descripteur(doc_stems_porter)
freqs_porter

{(1, '18'): 1,
 (1, 'edit'): 4,
 (1, 'dewey'): 3,
 (1, 'decim'): 2,
 (1, 'classif'): 2,
 (1, 'present'): 1,
 (1, 'studi'): 1,
 (1, 'histori'): 2,
 (1, 'first'): 2,
 (1, 'ddc'): 2,
 (1, 'publish'): 1,
 (1, '1876'): 1,
 (1, 'eighteenth'): 1,
 (1, '1971'): 1,
 (1, 'futur'): 1,
 (1, 'continu'): 1,
 (1, 'appear'): 1,
 (1, 'need'): 1,
 (1, 'spite'): 1,
 (1, 'long'): 1,
 (1, 'healthi'): 1,
 (1, 'life'): 1,
 (1, 'howev'): 1,
 (1, 'full'): 1,
 (1, 'stori'): 1,
 (1, 'never'): 1,
 (1, 'told'): 1,
 (1, 'biographi'): 1,
 (1, 'briefli'): 1,
 (1, 'describ'): 1,
 (1, 'system'): 1,
 (1, 'attempt'): 1,
 (1, 'provid'): 1,
 (1, 'detail'): 1,
 (1, 'work'): 1,
 (1, 'spur'): 1,
 (1, 'growth'): 1,
 (1, 'librarianship'): 1,
 (1, 'countri'): 1,
 (1, 'abroad'): 1,
 (2, 'use'): 6,
 (2, 'made'): 1,
 (2, 'technic'): 4,
 (2, 'librari'): 8,
 (2, 'report'): 1,
 (2, 'analysi'): 1,
 (2, '6300'): 1,
 (2, 'act'): 2,
 (2, '104'): 1,
 (2, 'unit'): 1,
 (2, 'kingdom'): 1,
 (2, 'one'): 2,
 (2, 'aspect'): 1,
 (2, 'wider'): 1,
 

In [8]:
freqs_lan = descripteur(doc_stems_lancaster)
freqs_lan

{(1, '18'): 1,
 (1, 'edit'): 4,
 (1, 'dewey'): 3,
 (1, 'decim'): 2,
 (1, 'class'): 2,
 (1, 'pres'): 1,
 (1, 'study'): 1,
 (1, 'hist'): 2,
 (1, 'first'): 2,
 (1, 'ddc'): 2,
 (1, 'publ'): 1,
 (1, '1876'): 1,
 (1, 'eighteen'): 1,
 (1, '1971'): 1,
 (1, 'fut'): 1,
 (1, 'continu'): 1,
 (1, 'appear'): 1,
 (1, 'nee'): 1,
 (1, 'spit'): 1,
 (1, 'long'): 1,
 (1, 'healthy'): 1,
 (1, 'lif'): 1,
 (1, 'howev'): 1,
 (1, 'ful'): 1,
 (1, 'story'): 1,
 (1, 'nev'): 1,
 (1, 'told'): 1,
 (1, 'biograph'): 1,
 (1, 'brief'): 1,
 (1, 'describ'): 1,
 (1, 'system'): 1,
 (1, 'attempt'): 1,
 (1, 'provid'): 1,
 (1, 'detail'): 1,
 (1, 'work'): 1,
 (1, 'spur'): 1,
 (1, 'grow'): 1,
 (1, 'libr'): 1,
 (1, 'country'): 1,
 (1, 'abroad'): 1,
 (2, 'us'): 8,
 (2, 'mad'): 1,
 (2, 'techn'): 4,
 (2, 'libr'): 8,
 (2, 'report'): 1,
 (2, 'analys'): 1,
 (2, '6300'): 1,
 (2, 'act'): 2,
 (2, '104'): 1,
 (2, 'unit'): 1,
 (2, 'kingdom'): 1,
 (2, 'on'): 2,
 (2, 'aspect'): 1,
 (2, 'wid'): 1,
 (2, 'pattern'): 1,
 (2, 'inform'): 7,
 (2, 'tr

Calculate frequencies inverse

In [9]:
def descripteur_inverse(dico):
    TermesFrequence = {}
    for key in dico.keys():
        for terme in dico[key]:
            if (terme,key) in TermesFrequence.keys():
                TermesFrequence[(terme,key)] += 1
            else:
                TermesFrequence[(terme,key)] = 1
        for k in dico.keys():
            if k != key:
                for t in dico[k]:
                    if t not in dico[key]:
                        TermesFrequence[(t,key)] = 0
    return TermesFrequence

In [10]:
inverse_p = descripteur_inverse(doc_stems_porter)
inverse_p

{('18', 1): 1,
 ('edit', 1): 4,
 ('dewey', 1): 3,
 ('decim', 1): 2,
 ('classif', 1): 2,
 ('present', 1): 1,
 ('studi', 1): 1,
 ('histori', 1): 2,
 ('first', 1): 2,
 ('ddc', 1): 2,
 ('publish', 1): 1,
 ('1876', 1): 1,
 ('eighteenth', 1): 1,
 ('1971', 1): 1,
 ('futur', 1): 1,
 ('continu', 1): 1,
 ('appear', 1): 1,
 ('need', 1): 1,
 ('spite', 1): 1,
 ('long', 1): 1,
 ('healthi', 1): 1,
 ('life', 1): 1,
 ('howev', 1): 1,
 ('full', 1): 1,
 ('stori', 1): 1,
 ('never', 1): 1,
 ('told', 1): 1,
 ('biographi', 1): 1,
 ('briefli', 1): 1,
 ('describ', 1): 1,
 ('system', 1): 1,
 ('attempt', 1): 1,
 ('provid', 1): 1,
 ('detail', 1): 1,
 ('work', 1): 1,
 ('spur', 1): 1,
 ('growth', 1): 1,
 ('librarianship', 1): 1,
 ('countri', 1): 1,
 ('abroad', 1): 1,
 ('use', 1): 0,
 ('made', 1): 0,
 ('technic', 1): 0,
 ('librari', 1): 0,
 ('report', 1): 0,
 ('analysi', 1): 0,
 ('6300', 1): 0,
 ('act', 1): 0,
 ('104', 1): 0,
 ('unit', 1): 0,
 ('kingdom', 1): 0,
 ('one', 1): 0,
 ('aspect', 1): 0,
 ('wider', 1): 0,
 

In [11]:
inverse_l = descripteur_inverse(doc_stems_lancaster)
inverse_l

{('18', 1): 1,
 ('edit', 1): 4,
 ('dewey', 1): 3,
 ('decim', 1): 2,
 ('class', 1): 2,
 ('pres', 1): 1,
 ('study', 1): 1,
 ('hist', 1): 2,
 ('first', 1): 2,
 ('ddc', 1): 2,
 ('publ', 1): 1,
 ('1876', 1): 1,
 ('eighteen', 1): 1,
 ('1971', 1): 1,
 ('fut', 1): 1,
 ('continu', 1): 1,
 ('appear', 1): 1,
 ('nee', 1): 1,
 ('spit', 1): 1,
 ('long', 1): 1,
 ('healthy', 1): 1,
 ('lif', 1): 1,
 ('howev', 1): 1,
 ('ful', 1): 1,
 ('story', 1): 1,
 ('nev', 1): 1,
 ('told', 1): 1,
 ('biograph', 1): 1,
 ('brief', 1): 1,
 ('describ', 1): 1,
 ('system', 1): 1,
 ('attempt', 1): 1,
 ('provid', 1): 1,
 ('detail', 1): 1,
 ('work', 1): 1,
 ('spur', 1): 1,
 ('grow', 1): 1,
 ('libr', 1): 1,
 ('country', 1): 1,
 ('abroad', 1): 1,
 ('us', 1): 0,
 ('mad', 1): 0,
 ('techn', 1): 0,
 ('report', 1): 0,
 ('analys', 1): 0,
 ('6300', 1): 0,
 ('act', 1): 0,
 ('104', 1): 0,
 ('unit', 1): 0,
 ('kingdom', 1): 0,
 ('on', 1): 0,
 ('aspect', 1): 0,
 ('wid', 1): 0,
 ('pattern', 1): 0,
 ('inform', 1): 0,
 ('transf', 1): 0,
 ('res

reading files to save as dataframes

In [18]:
freq_poid_porter = ''
with open(r'freqs_poids_porter.txt','r') as f:
         for i in f.readlines():
            freq_poid_porter=i #string
freq_poid_porter = eval(freq_poid_porter) # this is orignal dict with instace dict
len(freq_poid_porter)

77171

In [19]:
columns = ['Document','Terme','Frequence','Poid']
rows= []
for key in freq_poid_porter.keys():
    row = [key[0],key[1],freq_poid_porter[key][0],freq_poid_porter[key][1]]
    rows.append(row)

df_freqs_poids_porter = pd.DataFrame(data=rows,columns=columns)
df_freqs_poids_porter.head()

Unnamed: 0,Document,Terme,Frequence,Poid
0,1,18,1,0.565909
1,1,edit,4,1.543491
2,1,dewey,3,1.566545
3,1,decim,2,0.982483
4,1,classif,2,0.586663


In [20]:
freq_poid_lancaster = ''
with open(r'freqs_poids_lancaster.txt','r') as f:
         for i in f.readlines():
            freq_poid_lancaster=i #string
freq_poid_lancaster = eval(freq_poid_lancaster) # this is orignal dict with instace dict
len(freq_poid_lancaster)

75466

In [21]:
columns = ['Document','Terme','Frequence','Poid']
rows= []
for key in freq_poid_lancaster.keys():
    row = [key[0],key[1],freq_poid_lancaster[key][0],freq_poid_lancaster[key][1]]
    rows.append(row)

df_freqs_poids_lan = pd.DataFrame(data=rows,columns=columns)
len(df_freqs_poids_lan)

75466

In [53]:
df_freqs_poids_lan.to_csv('freq_poids_lancaster.csv',index=False)
df_freqs_poids_porter.to_csv('freq_poids_porter.csv',index=False)

In [23]:
inverse_poid_porter = {}
inverse_poid_lancaster = {}

In [24]:
for key in inverse_p.keys():
    if (key[1],key[0]) in freq_poid_porter.keys():
        inverse_poid_porter[key] = freq_poid_porter[(key[1],key[0])]
    else: inverse_poid_porter[key]=(0,0)

inverse_poid_porter

{('18', 1): (1, 0.565909017147027),
 ('edit', 1): (4, 1.5434905250073216),
 ('dewey', 1): (3, 1.5665449229653912),
 ('decim', 1): (2, 0.982483187415549),
 ('classif', 1): (2, 0.5866625214062646),
 ('present', 1): (1, 0.1877219509003781),
 ('studi', 1): (1, 0.17545995052595345),
 ('histori', 1): (2, 0.7358471441205445),
 ('first', 1): (2, 0.48859585375514675),
 ('ddc', 1): (2, 1.2334338101770548),
 ('publish', 1): (1, 0.2807087921539409),
 ('1876', 1): (1, 0.6408702713486026),
 ('eighteenth', 1): (1, 0.7911625539835742),
 ('1971', 1): (1, 0.43136122496691015),
 ('futur', 1): (1, 0.30350169701850216),
 ('continu', 1): (1, 0.3379036103833587),
 ('appear', 1): (1, 0.3148773895116402),
 ('need', 1): (1, 0.20839407201501553),
 ('spite', 1): (1, 0.5803330184572565),
 ('long', 1): (1, 0.43926136536343463),
 ('healthi', 1): (1, 0.7911625539835742),
 ('life', 1): (1, 0.4240162534231529),
 ('howev', 1): (1, 0.28771330156332897),
 ('full', 1): (1, 0.4138864669617845),
 ('stori', 1): (1, 0.64087027

In [25]:
for key in inverse_l.keys():
    if (key[1],key[0]) in freq_poid_lancaster.keys():
        inverse_poid_lancaster[key] = freq_poid_lancaster[(key[1],key[0])]
    else: inverse_poid_lancaster[key]=(0,0)

inverse_poid_lancaster

{('18', 1): (1, 0.565909017147027),
 ('edit', 1): (4, 1.4178437779965243),
 ('dewey', 1): (3, 1.5665449229653912),
 ('decim', 1): (2, 0.982483187415549),
 ('class', 1): (2, 0.4978710884287293),
 ('pres', 1): (1, 0.186317505997247),
 ('study', 1): (1, 0.17545995052595345),
 ('hist', 1): (2, 0.6611096473669597),
 ('first', 1): (2, 0.48747004080393636),
 ('ddc', 1): (2, 1.2334338101770548),
 ('publ', 1): (1, 0.20201067831787864),
 ('1876', 1): (1, 0.6408702713486026),
 ('eighteen', 1): (1, 0.6167169050885274),
 ('1971', 1): (1, 0.43136122496691015),
 ('fut', 1): (1, 0.30350169701850216),
 ('continu', 1): (1, 0.34100264951924825),
 ('appear', 1): (1, 0.3148773895116402),
 ('nee', 1): (1, 0.20802580488989758),
 ('spit', 1): (1, 0.5803330184572565),
 ('long', 1): (1, 0.3908992088808837),
 ('healthy', 1): (1, 0.7911625539835742),
 ('lif', 1): (1, 0.4240162534231529),
 ('howev', 1): (1, 0.28771330156332897),
 ('ful', 1): (1, 0.34420124825492365),
 ('story', 1): (1, 0.6408702713486026),
 ('nev'

In [26]:
len(inverse_poid_porter)

10109040

In [27]:
len(inverse_poid_lancaster)

8816940

In [36]:
columns = ['Terme','Document','Frequence','Poid']
rows= []
for key in tqdm_notebook(inverse_poid_porter.keys()):
    row = [key[0],key[1],inverse_poid_porter[key][0],inverse_poid_porter[key][1]]
    rows.append(row)

df_inverse_freqs_poids_porter = pd.DataFrame(data=rows,columns=columns)
len(df_inverse_freqs_poids_porter)

  0%|          | 0/10109040 [00:00<?, ?it/s]

10109040

In [37]:
columns = ['Terme','Document','Frequence','Poid']
rows= []
for key in tqdm_notebook(inverse_poid_lancaster.keys()):
    row = [key[0],key[1],inverse_poid_lancaster[key][0],inverse_poid_lancaster[key][1]]
    rows.append(row)

df_inverse_freqs_poids_lancaster = pd.DataFrame(data=rows,columns=columns)
len(df_inverse_freqs_poids_lancaster)

  0%|          | 0/8816940 [00:00<?, ?it/s]

8816940

In [38]:
df_inverse_freqs_poids_porter.head()

Unnamed: 0,Terme,Document,Frequence,Poid
0,18,1,1,0.565909
1,edit,1,4,1.543491
2,dewey,1,3,1.566545
3,decim,1,2,0.982483
4,classif,1,2,0.586663


In [39]:
df_inverse_freqs_poids_lancaster.head()

Unnamed: 0,Terme,Document,Frequence,Poid
0,18,1,1,0.565909
1,edit,1,4,1.417844
2,dewey,1,3,1.566545
3,decim,1,2,0.982483
4,class,1,2,0.497871


In [4]:
df_inverse_freqs_poids_lancaster.to_csv('inverse_freq_poids_lancaster.csv',index=False)
df_inverse_freqs_poids_porter.to_csv('inverse_freq_poids_porter.csv',index=False)

NameError: name 'df_inverse_freqs_poids_lancaster' is not defined

In [55]:
df=pd.read_csv('freq_poids_porter.csv')
df.head()

Unnamed: 0,Document,Terme,Frequence,Poid
0,1,18,1,0.565909
1,1,edit,4,1.543491
2,1,dewey,3,1.566545
3,1,decim,2,0.982483
4,1,classif,2,0.586663


give document returns words

In [43]:
def freq1(dico, docu):
        keys = dico.keys()
        docu_keys = []
        for each in keys:
                if docu == each[0]:
                        docu_keys.append(each)
        response = {}
        for key in docu_keys:
                response[key[1]] = dico[key]
        return response

In [85]:
def freq(df,docu):
    results = df[df['Document'] == docu]
    results = results.drop('Document',axis=1).reset_index(drop=True)
    return results

In [86]:
freq(df_freqs_poids_porter,4).head()

Unnamed: 0,Terme,Frequence,Poid
0,system,1,0.116752
1,analysi,1,0.174883
2,univers,3,0.579645
3,librari,5,0.56226
4,final,1,0.280738


In [46]:
freq1(freq_poid_lancaster,4)

{'system': (1, 0.11337622394094585),
 'analys': (1, 0.14795097189386752),
 'univers': (3, 0.5783363370693033),
 'libr': (5, 0.5419526262276546),
 'fin': (1, 0.26111552468294036),
 'report': (2, 0.3804335960777981),
 'research': (2, 0.2811210975557048),
 'project': (1, 0.2370161715558728),
 'est': (1, 0.23466500856250586),
 'nin': (1, 0.3481894408725934),
 'new': (1, 0.16701122034402327),
 '1960': (1, 0.3929932749662196),
 'provok': (1, 0.5376246142811297),
 'high': (1, 0.23389717849143088),
 'stim': (1, 0.45272721371762165),
 're-examination': (1, 0.572783475391572),
 'nat': (1, 0.1679073135710136),
 'purpos': (1, 0.21216673269842792),
 'man': (1, 0.1940959416620021),
 'academ': (2, 0.49443567549238665),
 'long-established': (1, 0.572783475391572),
 'attitud': (1, 0.34508897997352816),
 'method': (1, 0.16215621524057672),
 'quest': (1, 0.2225693296973819),
 'although': (1, 0.2835687555993049),
 'chang': (2, 0.45603995067462927),
 'mad': (1, 0.18046201011363855),
 'bas': (1, 0.128911644

give word returns documents

In [49]:
def freq2(dico, terme):
        terme = terme.strip()
        terme = Porter.stem(terme.lower())
        keys = dico.keys()
        docu_keys = []
        for each in keys:
                if terme == each[0] and dico[each] != (0,0):
                        docu_keys.append(each)
        response = {}
        for key in docu_keys:
                response[key[1]] = dico[key]
        return response

In [57]:
def freq_inverse(df,terme):
    results = df[(df['Terme']==terme) & (df['Frequence'] != 0)]
    results = results.drop('Terme',axis=1).reset_index(drop=True)
    return results

In [58]:
len(freq_inverse(df_freqs_poids_lan,'class'))

164

In [50]:
freq2(inverse_poid_porter,'class')

{5: (1, 0.24666782382619173),
 16: (1, 0.37000173573928763),
 42: (1, 0.37000173573928763),
 176: (1, 0.24666782382619173),
 233: (2, 0.37000173573928763),
 275: (1, 0.37000173573928763),
 282: (1, 0.37000173573928763),
 290: (1, 0.21142956327959292),
 328: (1, 0.49333564765238347),
 341: (1, 0.2960013885914301),
 345: (1, 0.49333564765238347),
 363: (1, 0.24666782382619173),
 379: (3, 1.110005207217863),
 404: (3, 1.110005207217863),
 405: (1, 0.24666782382619173),
 417: (1, 0.2960013885914301),
 428: (1, 0.49333564765238347),
 455: (1, 0.21142956327959292),
 476: (1, 0.49333564765238347),
 478: (1, 0.24666782382619173),
 479: (3, 0.6342886898387787),
 486: (1, 0.2960013885914301),
 559: (1, 0.21142956327959292),
 577: (2, 0.7400034714785753),
 610: (1, 0.24666782382619173),
 669: (1, 0.49333564765238347),
 694: (1, 0.18500086786964381),
 701: (1, 0.14800069429571505),
 722: (1, 0.18500086786964381),
 769: (1, 0.49333564765238347),
 791: (1, 0.37000173573928763),
 797: (4, 1.480006942

produit scalaire

In [6]:
qry_set[3]

'What is information science?  Give definitions where possible.'

In [5]:
df_freqs_poids_porter = pd.read_csv('freq_poids_porter.csv')
df_freqs_poids_lan = pd.read_csv('freq_poids_lancaster.csv')

In [21]:
def produit_scalaire(df,query,stemmer = 'P'):
    words = np.unique(ExpReg.tokenize(query))
    docs = df.Document.unique()
    if stemmer=='P':
        TermesSansMotsVides = [Porter.stem(terme) for terme in words if terme.lower() not in MotsVides]
    elif stemmer=='L':
        TermesSansMotsVides = [Lancaster.stem(terme) for terme in words if terme.lower() not in MotsVides]
    print(TermesSansMotsVides)
    rows=[]
    for doc in docs:
        result = df[(df['Terme'].isin(TermesSansMotsVides)) & (df['Document']==doc)]
        somme = np.sum(result['Poid'])
        rows.append([doc,somme])
    return TermesSansMotsVides, pd.DataFrame(data=rows,columns=['Document','Poid'])

In [22]:
test = df_freqs_poids_lan.head(150)
test

Unnamed: 0,Document,Terme,Frequence,Poid
0,1,18,1,0.565909
1,1,edit,4,1.417844
2,1,dewey,3,1.566545
3,1,decim,2,0.982483
4,1,class,2,0.497871
...,...,...,...,...
145,4,stim,1,0.452727
146,4,re-examination,1,0.572783
147,4,nat,1,0.167907
148,4,purpos,1,0.212167


In [25]:
_,d=produit_scalaire(df_freqs_poids_porter,qry_set[3],'P')
d.sort_values(by='Poid',ascending=False).reset_index(drop=True)

['give', 'definit', 'inform', 'possibl', 'scienc']


Unnamed: 0,Document,Poid
0,1181,2.842382
1,1235,2.149689
2,1179,2.089300
3,445,1.955432
4,469,1.639858
...,...,...
1455,332,0.000000
1456,953,0.000000
1457,954,0.000000
1458,959,0.000000


In [26]:

def Cosine(df,query,stemmer='L'):
    words,produit = produit_scalaire(df,query,stemmer)
    taille = len(words)
    rows = []
    docs = df.Document.unique()
    for doc in docs :
        # temp = df[(df['Terme'].isin(words))&(df['Document']==doc)].assign(square = lambda x:(x['Poid']**2))
        temp = df[df['Document']==doc].assign(square = lambda x:(x['Poid']**2))
        square_root = math.sqrt(np.sum(temp['square']))
        part1 = produit[produit['Document']==doc]['Poid'].values[0]
        resultat = part1/(math.sqrt(taille)*square_root)
        rows.append([doc,resultat])
    return pd.DataFrame(data=rows,columns=['Document','Mesure Cosine']).replace(np.nan,0).sort_values(by='Mesure Cosine',ascending=False).reset_index(drop=True)
    

In [27]:
Cosine(df_freqs_poids_porter,qry_set[3],'P')

['give', 'definit', 'inform', 'possibl', 'scienc']


Unnamed: 0,Document,Mesure Cosine
0,469,0.493620
1,1181,0.321706
2,85,0.307310
3,599,0.301864
4,1142,0.290937
...,...,...
1455,978,0.000000
1456,979,0.000000
1457,980,0.000000
1458,983,0.000000


In [28]:
def Jaccard(df,query,stemmer='L'):
    words,produit = produit_scalaire(df,query,stemmer)
    taille = len(words)
    rows = []
    docs = df.Document.unique()
    for doc in docs :
        # temp = df[(df['Terme'].isin(words))&(df['Document']==doc)].assign(square = lambda x:(x['Poid']**2))
        temp = df[df['Document']==doc].assign(square = lambda x:(x['Poid']**2))
        somme_carres = np.sum(temp['square'])
        # print('somme carres ',somme_carres)
        part1 = produit[produit['Document']==doc]['Poid'].values[0]
        # print('produit :', part1)
        somme_poids = np.sum(df[(df['Terme'].isin(words))&(df['Document']==doc)]['Poid'])
        # print('somme poids : ',somme_poids)
        # print(taille+somme_carres-somme_poids)
        resultat = part1/(taille+somme_carres-somme_poids)
        rows.append([doc,resultat])
    return pd.DataFrame(data=rows,columns=['Document','Mesure Jaccard']).replace(np.nan,0).sort_values(by='Mesure Jaccard',ascending=False).reset_index(drop=True)


In [29]:
Jaccard(df_freqs_poids_porter,qry_set[3],'P')

['give', 'definit', 'inform', 'possibl', 'scienc']


Unnamed: 0,Document,Mesure Jaccard
0,469,0.294546
1,85,0.178021
2,599,0.177620
3,1142,0.160353
4,1181,0.159952
...,...,...
1455,369,0.000000
1456,1056,0.000000
1457,785,0.000000
1458,1058,0.000000


In [61]:
def BM25(df,query,stemmer='L',K=1.20,B=0.75):
    words = np.unique(ExpReg.tokenize(query))
    docs = df.Document.unique()
    if stemmer=='P':
        TermesSansMotsVides = [Porter.stem(terme) for terme in words if terme.lower() not in MotsVides]
    elif stemmer=='L':
        TermesSansMotsVides = [Lancaster.stem(terme) for terme in words if terme.lower() not in MotsVides]
    taille = []
    for doc in docs:
        taille.append(len(df[df['Document']==doc]))
    avdl = np.mean(taille)
    N = len(docs)
    nis = pd.DataFrame(df['Terme'].value_counts()).reset_index()
    nis.columns=['Terme','Ni']
    nis=nis[nis['Terme'].isin(TermesSansMotsVides)]
    rows = []
    for doc in tqdm_notebook(docs):
        A = ((1-B)+B*(taille[doc-1]/avdl))*K
        somme = 0
        for terme in TermesSansMotsVides:
            freq = df.loc[(df['Terme']==terme)&(df['Document']==doc)]['Frequence'].values
            if(len(freq)==0): freq=0
            else: freq=freq[0]
            ni = nis.loc[nis['Terme']==terme]['Ni'].values
            if(len(ni)==0) : ni=0
            else : ni=ni[0]
            result = (freq / (A+freq)) * math.log10((N-ni+0.5)/(ni+0.5))
            somme += result
        rows.append([doc,somme])
    final = pd.DataFrame(data=rows,columns=['Document','Probabilite BM25']).sort_values(by='Probabilite BM25',ascending=False).reset_index(drop=True)

    return final
        

In [62]:
BM25(df_freqs_poids_porter,qry_set[3],stemmer='P')

  0%|          | 0/1460 [00:00<?, ?it/s]

Unnamed: 0,Document,Probabilite BM25
0,1181,2.689751
1,540,1.918782
2,469,1.748774
3,1179,1.530954
4,1133,1.478005
...,...,...
1455,979,0.000000
1456,980,0.000000
1457,397,0.000000
1458,983,0.000000


In [30]:
stop_words = ['is', 'a', 'for', 'the', 'of']
# def parse_args():
#     parser = argparse.ArgumentParser(description='Information Retrieval System Configuration')
#     return parser.parse_args()

def main():
    # args = parse_args()
    ir = IRSystem(doc_set, stop_words=stop_words)

    while True:
            query = 'information AND classification OR title AND computers'

            start = timeit.default_timer()
            results = ir.process_query(query)
            stop = timeit.default_timer()
            if results is not None:
                print ('Processing time: {:.5} secs'.format(stop - start))
                print('\nDoc IDS: ')
                print(results)
            print()

In [32]:
try:
    main()
except KeyboardInterrupt as e:
    print('EXIT')

AttributeError: 'int' object has no attribute 'split'