In [2]:
import math
import numpy as np
# Locations of the newline-separated id lists for queries and documents.
query_list_path = "q_100_d_10000/query_list.txt"
doc_list_path = "q_100_d_10000/doc_list.txt"

# One id per line. splitlines() returns the same list as split('\n')[:-1] when
# the file ends with a newline, but it does not throw away the final id when
# the trailing newline is missing (the old slice silently dropped it).
with open(query_list_path, "r") as f:
    q_list = f.read().splitlines()
with open(doc_list_path, "r") as f:
    d_list = f.read().splitlines()

def get_query_word(q):
    """Return the tokens of query ``q`` as a list of strings.

    Reads ``q_100_d_10000/queries/<q>.txt`` and splits its full contents on
    single space characters. Because the split is on ' ' only, consecutive
    spaces yield empty strings and newline characters stay attached to the
    neighbouring token.
    """
    query_path = "q_100_d_10000/queries/{}.txt".format(q)
    with open(query_path, 'r') as handle:
        return handle.read().split(' ')
def get_doc_word(d):
    """Return the tokens of document ``d`` as a list of strings.

    Reads ``q_100_d_10000/docs/<d>.txt`` and splits its full contents on
    single space characters. Because the split is on ' ' only, consecutive
    spaces yield empty strings and newline characters stay attached to the
    neighbouring token.
    """
    doc_path = "q_100_d_10000/docs/{}.txt".format(d)
    with open(doc_path, 'r') as handle:
        return handle.read().split(' ')

def get_random_probability_matrix(event_num,condition_num):
    """Return a random (event_num, condition_num) array whose columns sum to 1.

    Entries are drawn with np.random.random_sample and every column is then
    divided by its own total, so column c can be read as a probability
    distribution over the event_num events given condition c.
    """
    probs = np.random.random_sample((event_num, condition_num))
    # Each row of the transpose is a writable view onto one column of probs,
    # so the in-place division rescales probs itself.
    for column in probs.T:
        column /= column.sum()
    return probs

class parameter_retriever:
    """Holds the PLSA probability tables and runs EM iterations over them.

    Attributes set by the constructor:
        index_term_set: set of every distinct token returned by
            get_query_word / get_doc_word for the ids in q_list / d_list.
        word_num: len(index_term_set).
        topic_num: number of latent topics (the ``k`` constructor argument).
        doc_num: len(d_list).
        P_w_T: (word_num, topic_num) array from get_random_probability_matrix.
        P_T_d: (topic_num, doc_num) array from get_random_probability_matrix.
        P_T_wd: (topic_num, word_num, doc_num) array, zero until E_step runs.
    """

    def __init__(self,k):
        """Build the vocabulary and allocate the tables for ``k`` topics."""
        self.create_index_term_set()
        self.word_num = len(self.index_term_set)
        self.topic_num = k
        self.doc_num = len(d_list)
        self.initPossibilities()

    def create_index_term_set(self):
        """Collect every distinct token of all queries and documents.

        Stores the result in self.index_term_set and prints progress.
        """
        print("creating index term set")
        print(" creating index term set from query")
        self.index_term_set = set()
        # update() grows the set in place; the previous union() call copied the
        # whole accumulated set once per file.
        for q in q_list:
            self.index_term_set.update(get_query_word(q))
        print(" creating index term set from doc")
        for d in d_list:
            self.index_term_set.update(get_doc_word(d))
        print("number of words in index_term_set: {}".format(len(self.index_term_set)))
        print("...done")

    def initPossibilities(self,k=None):
        """Allocate P_w_T and P_T_d randomly and P_T_wd as zeros.

        Args:
            k: optional topic count. When given it replaces self.topic_num
                before the arrays are allocated. It used to be a required
                (and unused) parameter, which made the no-argument call in
                __init__ raise TypeError.
        """
        if k is not None:
            self.topic_num = k
        print("initializing possibilities")
        self.P_w_T = get_random_probability_matrix(self.word_num,self.topic_num)
        self.P_T_d = get_random_probability_matrix(self.topic_num,self.doc_num)
        self.P_T_wd = np.zeros((self.topic_num,self.word_num,self.doc_num))
        print("...done")

    def E_step(self):
        """Recompute P_T_wd from P_w_T and P_T_d.

        For every word i and document j,
        P_T_wd[k, i, j] = P_w_T[i, k] * P_T_d[k, j], normalised so the values
        over k sum to 1. Where that sum is zero the entries are left at their
        unnormalised value (0 for non-negative inputs) instead of becoming NaN.
        """
        print("start E_step")
        # Broadcast (topic, word, 1) * (topic, 1, doc) straight into the
        # existing buffer, avoiding both the Python triple loop and a second
        # topic x word x doc temporary.
        np.multiply(
            self.P_w_T.T[:, :, np.newaxis],
            self.P_T_d[:, np.newaxis, :],
            out=self.P_T_wd,
        )
        topic_totals = self.P_T_wd.sum(axis=0)
        # where= skips (word, doc) pairs whose total is zero, so no 0/0 occurs.
        np.divide(
            self.P_T_wd,
            topic_totals,
            out=self.P_T_wd,
            where=topic_totals != 0,
        )
        print("...done")

    def M_step(self):
        """Placeholder for the M-step; currently only prints progress."""
        print("start M_step")
        print("...done")

    def iter(self):
        """Run one EM iteration: E_step followed by M_step."""
        self.E_step()
        self.M_step()



class PLSA:
    """Ranks the documents of d_list against a query.

    The constructor stores ``k``, ``alpha`` and ``beta`` and builds a
    parameter_retriever for ``k`` topics in ``self.param``. ``alpha`` and
    ``beta`` are only stored here; nothing in this class reads them yet.
    """

    def __init__(self, k, alpha, beta):
        self.k = k
        self.param = parameter_retriever(k)

        self.alpha = alpha
        self.beta = beta

    def get_sim(self, doc, q):
        """Score ``doc`` against ``q``; currently a stub that prints and returns None."""
        print("get_sim")

    def query(self, q):
        """Return the ids of d_list ordered by descending get_sim score.

        Each id is followed by a single space, so a non-empty result ends
        with a trailing space. Ids with equal scores keep their d_list order.
        """
        scores = {doc: self.get_sim(doc, q) for doc in d_list}
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return "".join(doc_id + ' ' for doc_id, _score in ranked)


In [None]:
# NOTE(review): PLSA.__init__ as defined in this file takes (k, alpha, beta),
# but this call passes four positional values and therefore raises TypeError.
# Which of the values are meant as k / alpha / beta cannot be determined from
# this file, so the call is left as written. FIXME: reconcile the arguments
# with the constructor before running this cell.
plsa=PLSA(6,10,0.75,0)
# The with-block closes ans.txt even when a query raises part-way through.
with open("ans.txt","w") as f:
    f.write("Query,RetrievedDocuments\n")
    for q in q_list:
        ranking=plsa.query(q)
        # write(), not writelines(): the argument is one string, not a list.
        f.write(q+","+ranking+'\n')
    