# In [2]:
import math
import numpy as np
# Files listing one query / document identifier per line.
query_list_path = "q_100_d_10000/query_list.txt"
doc_list_path = "q_100_d_10000/doc_list.txt"

# splitlines() yields the same list as split('\n')[:-1] when the file ends
# with a newline, but does not drop the last identifier when that trailing
# newline is missing and does not leave '\r' on CRLF-terminated lines.
with open(query_list_path, "r") as f:
    q_list = f.read().splitlines()
with open(doc_list_path, "r") as f:
    d_list = f.read().splitlines()

def get_query_word(q):
    """Return the list of whitespace-separated tokens of query file *q*.

    Reads ``q_100_d_10000/queries/<q>.txt``. Tokens are split on any run of
    whitespace so that newlines never stay attached to a word and no empty
    string tokens are produced (an empty file yields an empty list).
    """
    with open("q_100_d_10000/queries/{}.txt".format(q),'r') as f:
        words = f.read().split()
    return words
def get_doc_word(d):
    """Return the list of whitespace-separated tokens of document file *d*.

    Reads ``q_100_d_10000/docs/<d>.txt``. Tokens are split on any run of
    whitespace so that newlines never stay attached to a word and no empty
    string tokens are produced (an empty file yields an empty list).
    """
    with open("q_100_d_10000/docs/{}.txt".format(d),'r') as f:
        words = f.read().split()
    return words
def get_random_probability_matrix(event_num,condition_num):
    """Return a random (event_num, condition_num) matrix whose columns sum to 1.

    Entries are drawn with ``np.random.random_sample`` and each column is then
    divided by its own sum, so every column is a probability distribution
    over the ``event_num`` rows.
    """
    shape = (event_num, condition_num)
    matrix = np.random.random_sample(shape)
    # Rows of the transpose are writable views of the columns of ``matrix``,
    # so the in-place division below normalises ``matrix`` itself.
    for column in matrix.T:
        column /= column.sum()
    return matrix

class parameter_retriever:
    """Vocabulary, term statistics and PLSA parameters with EM update steps.

    Attributes created by ``__init__``:
        index_term_list: sorted terms occurring in both the queries and the
            documents; position in this list is the word index used below.
        word_num, topic_num, doc_num: sizes of the three axes.
        c_wd: (word_num, doc_num) count of each index term in each document.
        doc_length: (doc_num,) number of index-term occurrences per document.
        c_w: (word_num,) number of documents containing each index term.
        P_w_T: (word_num, topic_num), each column sums to 1.
        P_T_d: (topic_num, doc_num), each column sums to 1.
        P_T_wd: (topic_num, word_num, doc_num) topic posterior per
            (word, document) pair, filled by ``E_step``.
    """

    def __init__(self,topic_num):
        self.create_index_term_set()
        self.word_num = len(self.index_term_list)
        self.topic_num = topic_num
        self.doc_num = len(d_list)
        self.get_word_count_in_doc()
        self.initPossibilities()

    def create_index_term_set(self):
        """Build ``index_term_list`` from terms shared by queries and documents."""
        print("creating index term set")
        print(" creating index term set from query")
        index_term_set_q = set()
        for q in q_list:
            index_term_set_q.update(get_query_word(q))

        print(" creating index term set from doc")
        index_term_set_d = set()
        for d in d_list:
            index_term_set_d.update(get_doc_word(d))

        # Sorted so that word indices are reproducible between runs; plain
        # set iteration order of strings changes with hash randomisation.
        self.index_term_list = sorted(index_term_set_q & index_term_set_d)
        print("number of words in index_term_set: {}".format(len(self.index_term_list)))
        print("...done")

    def get_word_count_in_doc(self):
        """Fill ``c_wd``, ``doc_length`` and ``c_w`` from the document files."""
        self.c_wd = np.zeros((self.word_num,self.doc_num))

        # One pass over each document with a dictionary lookup instead of
        # calling list.count() once per index term.
        term_index = {term: idx for idx, term in enumerate(self.index_term_list)}
        for idx_d,d in enumerate(d_list):
            for word in get_doc_word(d):
                idx_term = term_index.get(word)
                if idx_term is not None:
                    self.c_wd[idx_term,idx_d] += 1

        # Length counts index terms only, so c_wd[:, d] / doc_length[d]
        # sums to 1 over the vocabulary.
        self.doc_length = self.c_wd.sum(axis=0)
        # Document frequency: only documents that actually contain the term
        # are counted (previously every document was counted, making c_w
        # equal to doc_num for all terms).
        # NOTE(review): a collection-wide term count is the other plausible
        # intent; confirm which background model is wanted.
        self.c_w = np.count_nonzero(self.c_wd, axis=1).astype(float)

    def initPossibilities(self,k=None):
        """Randomly initialise ``P_w_T`` and ``P_T_d`` and allocate ``P_T_wd``.

        ``k`` is unused; it is kept (now optional) only so that existing
        callers passing it keep working while ``__init__`` can call this
        method without arguments.
        """
        print("initializing possibilities")
        self.P_w_T = get_random_probability_matrix(self.word_num,self.topic_num)
        self.P_T_d = get_random_probability_matrix(self.topic_num,self.doc_num)
        self.P_T_wd = np.zeros((self.topic_num,self.word_num,self.doc_num))
        print("...done")

    def E_step(self):
        """Set ``P_T_wd[k,i,j]`` proportional to ``P_w_T[i,k] * P_T_d[k,j]``.

        The values are normalised over the topic axis. Pairs whose products
        are all zero are left at zero instead of producing NaN.
        """
        print("start E_step")
        np.multiply(
            self.P_w_T.T[:, :, np.newaxis],
            self.P_T_d[:, np.newaxis, :],
            out=self.P_T_wd,
        )
        totals = self.P_T_wd.sum(axis=0, keepdims=True)
        np.divide(self.P_T_wd, totals, out=self.P_T_wd, where=totals > 0)
        print("...done")

    def M_step(self):
        """Re-estimate ``P_w_T`` and ``P_T_d`` from ``c_wd`` and ``P_T_wd``."""
        print("start M_step")
        print(" process P_w_T")
        # P_w_T[i,k] = sum_j c_wd[i,j] * P_T_wd[k,i,j], then each topic
        # column is normalised to sum to 1.
        self.P_w_T[...] = np.einsum("ij,kij->ik", self.c_wd, self.P_T_wd)
        topic_mass = self.P_w_T.sum(axis=0, keepdims=True)
        np.divide(self.P_w_T, topic_mass, out=self.P_w_T, where=topic_mass > 0)

        print(" process P_T_d")
        # P_T_d[k,j] = sum_i c_wd[i,j] * P_T_wd[k,i,j] / doc_length[j].
        # The normalisation targets P_T_d (it previously divided a column of
        # P_w_T by mistake); documents of length 0 keep an all-zero column.
        self.P_T_d[...] = np.einsum("ij,kij->kj", self.c_wd, self.P_T_wd)
        lengths = self.doc_length[np.newaxis, :]
        np.divide(self.P_T_d, lengths, out=self.P_T_d, where=lengths > 0)

        print("...done")

    def iter(self):
        """Run one EM iteration (E step followed by M step)."""
        self.E_step()
        self.M_step()



class PLSA:
    """Rank documents by the log-likelihood of a query under a mixture model.

    For every query word found in the index, the per-word probability is
        alpha * c_wd / doc_length
      + beta * sum_k P_w_T[w,k] * P_T_d[k,d]
      + (1 - alpha - beta) * c_w / doc_num
    and a document's score is the sum of the logs of these values.
    """

    def __init__(self,topic_num,alpha,beta):
        self.topic_num=topic_num
        self.param = parameter_retriever(topic_num)

        self.alpha=alpha
        self.beta=beta
        # term -> row index, so query words are resolved in O(1) instead of
        # scanning index_term_list for every word.
        self._term_index = {
            term: idx for idx, term in enumerate(self.param.index_term_list)
        }

    def _query_term_indices(self,q):
        """Return the word indices of the query's in-vocabulary words.

        Order and duplicates of the query words are preserved; words that
        are not index terms are skipped.
        """
        lookup = self._term_index
        return np.array(
            [lookup[word] for word in get_query_word(q) if word in lookup],
            dtype=int,
        )

    def _score(self,idx_doc,term_ids):
        """Return the summed log mixture probability of ``term_ids`` in a document."""
        if term_ids.size == 0:
            return 0.0
        param = self.param

        doc_len = param.doc_length[idx_doc]
        if doc_len > 0:
            doc_model = param.c_wd[term_ids, idx_doc] / doc_len
        else:
            # A document without index terms contributes no document-model
            # evidence; avoids 0/0.
            doc_model = np.zeros(term_ids.size)
        topic_model = np.dot(param.P_w_T[term_ids, :], param.P_T_d[:, idx_doc])
        background = param.c_w[term_ids] / param.doc_num

        mixture = (
            self.alpha * doc_model
            + self.beta * topic_model
            + (1 - self.alpha - self.beta) * background
        )
        # The three components are probabilities, not log-probabilities, so
        # they are added before taking the log (np.logaddexp would compute
        # log(exp(a) + exp(b)) instead). A zero mixture yields -inf.
        with np.errstate(divide="ignore"):
            return float(np.log(mixture).sum())

    def get_sim(self,idx_doc,q):
        """Return the score of document number ``idx_doc`` for query ``q``.

        Returns 0.0 when no query word is an index term.
        """
        return self._score(idx_doc, self._query_term_indices(q))

    def query(self,q):
        """Return all document names, best first, each followed by a space."""
        # Read and resolve the query once rather than once per document.
        term_ids = self._query_term_indices(q)
        sim={}
        for idx_doc,doc in enumerate(d_list):
            sim[doc] = self._score(idx_doc, term_ids)
        ranked = sorted(sim.items(), key=lambda item: item[1], reverse=True)
        return "".join(doc + ' ' for doc, _ in ranked)


# In [None]:
# PLSA takes exactly (topic_num, alpha, beta); the previous call passed a
# fourth positional value (10) and raised TypeError.
# NOTE(review): 10 is presumably the number of EM iterations, so it is run
# explicitly below; confirm this is the intended meaning.
em_iterations = 10
plsa = PLSA(6, 0.75, 0)
for _ in range(em_iterations):
    plsa.param.iter()

# The context manager closes ans.txt even if a query fails part-way through.
with open("ans.txt","w") as f:
    f.write("Query,RetrievedDocuments\n")
    for q in q_list:
        ranking = plsa.query(q)
        f.write(q+","+ranking+'\n')
    