In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy term-frequency matrix: one row per document, one column per term.
# Every cell is the number of times the term occurs in that document.
words = pd.DataFrame(
    data=[
        [3, 0, 5, 0, 2, 6, 0, 2, 0, 2],
        [0, 7, 0, 2, 1, 0, 0, 3, 0, 0],
        [0, 1, 0, 0, 1, 2, 2, 0, 3, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    ],
    index=['Document 1', 'Document 2', 'Document 3', 'Document 4', 'Document 5'],
    columns=['team', 'coach', 'play', 'ball', 'score', 'game', 'win', 'lost', 'timeout', 'season'],
)

# Bare last expression so the notebook renders the frame.
words

Unnamed: 0,team,coach,play,ball,score,game,win,lost,timeout,season
Document 1,3,0,5,0,2,6,0,2,0,2
Document 2,0,7,0,2,1,0,0,3,0,0
Document 3,0,1,0,0,1,2,2,0,3,0
Document 4,1,1,1,1,1,1,1,1,1,1
Document 5,1,1,1,1,1,1,1,1,1,1


In [3]:
# Hand-computed smoothed probability of 'play' in Document 1 with a 0.5/0.5 mix:
# 5 of the document's 20 term occurrences, 7 of the collection's 62.
(0.5*5/20)+(0.5*7/62)

0.1814516129032258

In [4]:
# Single-term query used to try out the scratch helper in the next cell.
query = ['team']

In [5]:
def retrieval(query, model=None):
    """Print the term-frequency column of every term in ``query``.

    Scratch helper for eyeballing the raw per-document counts of each
    query term before any smoothing is applied.

    Parameters
    ----------
    query : iterable of str
        Column labels (terms) to look up.
    model : pandas.DataFrame, optional
        Term-frequency matrix to read from. When omitted, the notebook-level
        ``words`` frame is used, which is what the original one-argument
        call did.

    Returns
    -------
    None
        The columns are printed, not returned.

    Raises
    ------
    KeyError
        If a term is not a column of the frame.
    """
    if model is None:
        # Resolved at call time so the helper follows the current notebook state.
        model = words
    for term in query:
        print(model[term])
        
        
# Print the raw per-document counts for each term in `query`.
retrieval(query)

Document 1    3
Document 2    0
Document 3    0
Document 4    1
Document 5    1
Name: team, dtype: int64


In [6]:
class LIretrieval():
    """Query-likelihood ranking with linear-interpolation smoothing.

    For every query term the per-document probability is

        alpha * tf(term, doc) / len(doc)
        + (1 - alpha) * tf(term, collection) / len(collection)

    and a document's score is the product of those probabilities over the
    query terms.

    Parameters
    ----------
    model : pandas.DataFrame
        Term-frequency matrix: one row per document, one column per term.
    alpha : float
        Weight of the document model; ``1 - alpha`` goes to the collection
        model.
    """

    def __init__(self,model,alpha):
        self.model = model
        self.alpha = alpha

    def totals(self):
        """Return the length (sum of all term counts) of every document."""
        return self.model.sum(axis=1)

    def retrieve(self, query):
        """Score every document against ``query`` and rank them.

        Parameters
        ----------
        query : sequence of str
            Query terms. Terms that are not columns of the model are ignored.

        Returns
        -------
        pandas.Series or str
            Scores indexed by document, sorted in descending order. For an
            empty query the string ``'Please input query'`` is returned
            instead.

        Raises
        ------
        ValueError
            If the query is non-empty but none of its terms is in the model.
        """
        if len(query) == 0:
            return 'Please input query'

        known_terms = [term for term in query if term in self.model]
        if not known_terms:
            # Without this guard pd.concat([]) raises an opaque
            # "No objects to concatenate" ValueError.
            raise ValueError(
                f"None of the query terms are in the model vocabulary: {list(query)!r}"
            )

        # Loop-invariant: document lengths and the collection length.
        doc_lens = self.totals()
        coll_len = doc_lens.sum()

        probabilities = []
        for term in known_terms:
            term_counts = self.model[term]
            doc_probs = term_counts / doc_lens
            coll_prob = term_counts.sum() / coll_len
            probabilities.append(
                (self.alpha * doc_probs) + ((1 - self.alpha) * coll_prob)
            )

        probs_df = pd.concat(probabilities, axis=1)
        return probs_df.product(axis=1).sort_values(ascending=False)
    
# Rank the toy documents for the two-term query with an even 0.5/0.5 mix
# of document and collection model.
test = LIretrieval(words, 0.5)
test.retrieve(['team','play'])

Document 1    0.020925
Document 4    0.009615
Document 5    0.009615
Document 2    0.002276
Document 3    0.002276
dtype: float64

In [7]:
# Hand check of Document 1's linear-interpolation score for ['team', 'play']:
# per term 0.5*tf/20 + 0.5*cf/62, with tf = 3 and 5, cf = 5 and 7.
((0.5*3/20)+(0.5*5/62))*((0.5*5/20)+(0.5*7/62))

0.020925468262226847

In [8]:
class DIRretrieval():
    """Query-likelihood ranking with Dirichlet-prior smoothing.

    For every query term the per-document probability is

        (tf(term, doc) + mu * p(term | collection)) / (len(doc) + mu)

    with ``p(term | collection) = tf(term, collection) / len(collection)``.
    A document's score is the product of those probabilities over the query
    terms.

    Parameters
    ----------
    model : pandas.DataFrame
        Term-frequency matrix: one row per document, one column per term.
    mu : float, optional
        Dirichlet prior mass. When ``None``, the average document length of
        the current model is used on each call to ``retrieve``.
    """

    def __init__(self,model,mu=None):
        self.model = model
        self.mu = mu

    def totals(self):
        """Return the length (sum of all term counts) of every document."""
        return self.model.sum(axis=1)

    def retrieve(self, query):
        """Score every document against ``query`` and rank them.

        Parameters
        ----------
        query : sequence of str
            Query terms. Terms that are not columns of the model are ignored.

        Returns
        -------
        pandas.Series or str
            Scores indexed by document, sorted in descending order. For an
            empty query the string ``'Please input query'`` is returned
            instead.

        Raises
        ------
        ValueError
            If the query is non-empty but none of its terms is in the model.
        """
        if len(query) == 0:
            return 'Please input query'

        known_terms = [term for term in query if term in self.model]
        if not known_terms:
            # Without this guard pd.concat([]) raises an opaque
            # "No objects to concatenate" ValueError.
            raise ValueError(
                f"None of the query terms are in the model vocabulary: {list(query)!r}"
            )

        doc_lens = self.totals()
        coll_tot = doc_lens.sum()

        # Resolve the default into a local instead of writing it back to
        # self.mu: a cached value would go stale if self.model is replaced.
        mu = coll_tot / len(self.model) if self.mu is None else self.mu

        probabilities = []
        for term in known_terms:
            term_counts = self.model[term]
            coll_prob = term_counts.sum() / coll_tot
            probabilities.append((term_counts + mu * coll_prob) / (mu + doc_lens))

        probs_df = pd.concat(probabilities, axis=1)
        return probs_df.product(axis=1).sort_values(ascending=False)
    
# Rank the toy documents with Dirichlet smoothing; mu is left at its default
# (collection length / number of documents).
test = DIRretrieval(words)
test.retrieve(['team','play'])

Document 1    0.024387
Document 4    0.009566
Document 5    0.009566
Document 3    0.003057
Document 2    0.002170
dtype: float64

In [9]:
# Hand check of Document 1's Dirichlet score for ['team', 'play'] with
# mu = 12.4 (= 62 / 5): per term (tf + mu*cf/62) / (mu + 20).
((3+(12.4*5/62))/(12.4+20))*(5+(12.4*7/62))/(12.4+20)

0.024386526444139616

In [15]:
class TwoStageretrieval():
    """Query-likelihood ranking with two-stage smoothing.

    Stage one applies a Dirichlet prior to the document model, stage two
    linearly interpolates the result with the collection model:

        (1 - alpha) * (tf(term, doc) + mu * p_c) / (len(doc) + mu)
        + alpha * p_c

    where ``p_c = tf(term, collection) / len(collection)``. A document's
    score is the product of those probabilities over the query terms.

    Parameters
    ----------
    model : pandas.DataFrame
        Term-frequency matrix: one row per document, one column per term.
    alpha : float
        Weight of the collection model in the second stage. Note this is the
        opposite role from ``alpha`` in ``LIretrieval``, where it weights the
        document model.
    mu : float, optional
        Dirichlet prior mass. When ``None``, the average document length of
        the current model is used on each call to ``retrieve``.
    """

    def __init__(self,model,alpha,mu=None):
        self.model = model
        self.alpha = alpha
        self.mu = mu

    def totals(self):
        """Return the length (sum of all term counts) of every document."""
        return self.model.sum(axis=1)

    def retrieve(self, query):
        """Score every document against ``query`` and rank them.

        Parameters
        ----------
        query : sequence of str
            Query terms. Terms that are not columns of the model are ignored.

        Returns
        -------
        pandas.Series or str
            Scores indexed by document, sorted in descending order. For an
            empty query the string ``'Please input query'`` is returned
            instead.

        Raises
        ------
        ValueError
            If the query is non-empty but none of its terms is in the model.
        """
        if len(query) == 0:
            return 'Please input query'

        known_terms = [term for term in query if term in self.model]
        if not known_terms:
            # Without this guard pd.concat([]) raises an opaque
            # "No objects to concatenate" ValueError.
            raise ValueError(
                f"None of the query terms are in the model vocabulary: {list(query)!r}"
            )

        doc_lens = self.totals()
        coll_tot = doc_lens.sum()

        # Resolve the default into a local instead of writing it back to
        # self.mu: a cached value would go stale if self.model is replaced.
        mu = coll_tot / len(self.model) if self.mu is None else self.mu

        probabilities = []
        for term in known_terms:
            term_counts = self.model[term]
            coll_prob = term_counts.sum() / coll_tot
            dirichlet = (term_counts + mu * coll_prob) / (doc_lens + mu)
            probabilities.append((1 - self.alpha) * dirichlet + self.alpha * coll_prob)

        probs_df = pd.concat(probabilities, axis=1)
        return probs_df.product(axis=1).sort_values(ascending=False)
    
# Rank the toy documents with two-stage smoothing, alpha = 0.5 and mu = 12.4.
test = TwoStageretrieval(words,0.5,12.4)
test.retrieve(['team', 'play'])

Document 1    0.015840
Document 4    0.009348
Document 5    0.009348
Document 3    0.005678
Document 2    0.005041
dtype: float64

In [16]:
# Hand check of Document 1's two-stage score for ['team', 'play'] with
# alpha = 0.5 and mu = 12.4: per term (1-alpha)*(tf + mu*cf/62)/(20 + mu) + alpha*cf/62.
((1-0.5)*((3 + (12.4*5/62))/(20 + 12.4)) + (0.5*5/62))*((1-0.5)*((5 + (12.4*7/62))/(20 + 12.4)) + (0.5*7/62))

0.015840050888793412

In [12]:
# Empty query: retrieve() returns the prompt string instead of a ranking.
test.retrieve([])

'Please input query'