In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from tqdm import tqdm

## Train PE-LDA and conduct followee recommendations

In [2]:
# Sampling Functions
def sampleFromDirichlet(alpha):
    """Draw one sample from a Dirichlet distribution.

    Args:
        alpha: concentration parameters, passed straight to
            ``np.random.dirichlet``.

    Returns:
        The array returned by ``np.random.dirichlet(alpha)``.
    """
    draw = np.random.dirichlet(alpha)
    return draw
def sampleFromMultinomial(theta):
    """Sample one category index in proportion to the weights in ``theta``.

    Args:
        theta: non-negative weights; they are divided by their sum before
            sampling, so they do not need to be normalised by the caller.

    Returns:
        The index of the single category picked by one multinomial trial.
    """
    probabilities = np.divide(theta, np.sum(theta))
    # One trial gives a one-hot count vector; the position of the 1 is the draw.
    one_hot = np.random.multinomial(1, probabilities)
    return np.argmax(one_hot != 0)
def sampleFromBeta(gamma):
    """Draw one sample from a Beta distribution.

    Args:
        gamma: indexable pair; ``gamma[0]`` and ``gamma[1]`` are passed as the
            first and second shape parameters of ``np.random.beta``.

    Returns:
        The value returned by ``np.random.beta``.
    """
    first_shape = gamma[0]
    second_shape = gamma[1]
    return np.random.beta(first_shape, second_shape)
def sampleFromBinomial(psi):
    """Draw the outcome of a single Bernoulli trial.

    Args:
        psi: success probability, passed to ``np.random.binomial``.

    Returns:
        The result of ``np.random.binomial(1, psi)`` (0 or 1 for one trial).
    """
    single_trial = 1
    outcome = np.random.binomial(single_trial, psi)
    return outcome

In [3]:
# PE-LDA Gibbs sampler
class PELDA:
    """Gibbs sampler for PE-LDA over per-user item sequences.

    Every position of a user's item list carries a latent topic and a binary
    trigger.  With trigger 0 ("non-sticky") the topic is drawn from the user's
    topic mixture; with trigger 1 ("sticky") the position reuses the topic of
    the previous position.  The first position of every list is always
    non-sticky.

    Intended call order: ``fit`` -> ``params_estimate`` ->
    ``negative_recommend``; the ``read_*`` accessors return the results.
    """

    def __init__(self, numTopics, alpha, beta, gamma):
        """Store the hyper-parameters.

        Args:
            numTopics: number of latent topics (K).
            alpha: smoothing added to the user-topic counts (used as a scalar).
            beta: smoothing added to the topic-item counts (used as a scalar).
            gamma: indexable pair; ``gamma[0]`` smooths the sticky count and
                ``gamma[1]`` the non-sticky count.
        """
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numTopics = numTopics

    def processProfiles(self, users_lists):
        """Derive corpus sizes and smoothing totals from the user lists.

        Sets ``numDocs``, ``words_count`` (largest item id + 1), ``sum_alpha``,
        ``sum_beta`` and ``sum_gamma``.

        Args:
            users_lists: sequence of per-user item-id sequences.  Item ids are
                used as row indices, so they must be non-negative integers.

        Returns:
            ``users_lists`` itself, unchanged.

        Raises:
            ValueError: if there are no users or any user list is empty.
        """
        # Fail with a readable message instead of numpy's reduction error on
        # empty input; later steps also index the last item of every list.
        if len(users_lists) == 0 or any(len(u_list) == 0 for u_list in users_lists):
            raise ValueError(
                "users_lists must be non-empty and every user list must contain at least one item")
        wordOccuranceMatrix = users_lists
        self.numDocs = len(wordOccuranceMatrix)
        self.words_count = np.max([np.max(i) for i in users_lists]) + 1
        self.sum_alpha = self.numTopics * self.alpha
        self.sum_beta = self.words_count * self.beta
        self.sum_gamma = self.gamma[0] + self.gamma[1]
        return wordOccuranceMatrix

    def _shift_counts(self, user, w, topic, trigger, delta):
        """Add ``delta`` (+1 or -1) to every pseudo-counter touched by one assignment."""
        self.n_doc_topic_trigger[user][topic][trigger] += delta
        self.n_topic_word[w][topic] += delta
        self.n_doc_trigger[user][trigger] += delta
        self.sum_doc[user][trigger] += delta
        self.sum_topic[topic] += delta

    def initialize(self, users_lists):
        """Randomly assign a topic and trigger to every position and count them.

        The first item of each list gets trigger 0 and a uniformly drawn topic.
        Later items are sticky with probability 0.5; sticky items copy the
        previous item's topic, the others draw a uniform topic.

        Requires the counters and ``Z_post``/``Trigger`` created by ``fit``.
        """
        # Equal weights -> uniform topic draw after normalisation.
        assignVec = [self.alpha] * self.numTopics
        for user in range(self.numDocs):
            u_list = users_lists[user]
            for i in range(len(u_list)):
                if i < 1:  # the first item
                    trigger = 0
                    topic_update = sampleFromMultinomial(assignVec)
                else:
                    trigger = sampleFromBinomial(0.5)
                    if trigger == 0:  # non-sticky
                        topic_update = sampleFromMultinomial(assignVec)
                    else:  # sticky
                        topic_update = self.Z_post[user][i - 1]
                self.Z_post[user][i] = topic_update
                self.Trigger[user][i] = trigger
                # update pseudo-counters
                self._shift_counts(user, u_list[i], topic_update, trigger, 1)

    def gibbsSampler(self, user, item):
        """Unnormalised sampling weights for one position.

        Expects the counters to already exclude the position's current
        assignment (``fit`` decrements them before calling).

        Args:
            user: user index.
            item: position inside the user's list.

        Returns:
            Array of shape ``(numTopics, 2)``.  Column 0 holds the non-sticky
            weight of every topic; column 1 is zero except at the previous
            position's topic, which holds the sticky weight.  For the first
            position column 1 is all zeros.
        """
        prob_topictrigger = np.zeros((self.numTopics, 2))
        w = self.wordOccuranceMatrix[user][item]
        topic_counts = self.n_doc_topic_trigger[user, :, 0] + self.n_doc_topic_trigger[user, :, 1]
        factor_theta_0 = (topic_counts + self.alpha) / (
            self.sum_doc[user, 0] + self.sum_doc[user, 1] + self.sum_alpha)
        factor_phi_0 = (self.n_topic_word[w, :] + self.beta) / (self.sum_topic + self.sum_beta)
        if item < 1:
            # The first item can only be non-sticky.
            prob_topictrigger[:, 0] = factor_phi_0 * factor_theta_0
            return prob_topictrigger
        # non-sticky; the "- 1" presumably discounts the first item, whose
        # trigger is fixed to 0 rather than sampled.
        factor_psi_0 = self.gamma[1] + self.n_doc_trigger[user, 0] - 1
        prob_topictrigger[:, 0] = factor_psi_0 * factor_phi_0 * factor_theta_0
        # sticky: only the previous position's topic is reachable.
        topic_n_1 = self.Z_post[user][item - 1]
        factor_phi_1 = factor_phi_0[topic_n_1]
        factor_psi_1 = self.gamma[0] + self.n_doc_trigger[user, 1]
        prob_topictrigger[topic_n_1, 1] = factor_psi_1 * factor_phi_1
        return prob_topictrigger

    def fit(self, users_lists, iter_times):
        """Run the Gibbs sampler.

        Allocates all counters and assignment buffers, initialises them
        randomly, then performs ``iter_times`` full sweeps over every position
        of every user.  Progress banners are printed to stdout.

        Args:
            users_lists: sequence of per-user item-id sequences (see
                ``processProfiles``).
            iter_times: number of sweeps.

        Raises:
            ValueError: propagated from ``processProfiles`` for empty input.
        """
        print('//////////// alpha:', self.alpha, '; beta:', self.beta, '; gamma:', self.gamma, '; K:', self.numTopics)
        self.wordOccuranceMatrix = self.processProfiles(users_lists)
        # Construct Pseudo-counters:
        self.n_doc_trigger = np.zeros((self.numDocs, 2), dtype='int')  # Psi
        self.n_doc_topic_trigger = np.zeros((self.numDocs, self.numTopics, 2), dtype='int')  # Theta
        self.n_topic_word = np.zeros((self.words_count, self.numTopics), dtype='int')  # Phi
        self.sum_doc = np.zeros((self.numDocs, 2), dtype='int')  # theta
        self.n_doc = np.zeros(self.numDocs, dtype='int')  # Psi
        self.sum_topic = np.zeros(self.numTopics, dtype='int')  # Phi
        # Construct saver for the sample results of topic and triggers
        self.Z_post = []
        self.Trigger = []
        for x in range(self.numDocs):
            list_length = len(users_lists[x])
            self.n_doc[x] = list_length
            self.Z_post.append([0] * list_length)
            self.Trigger.append([0] * list_length)
        # Latent Parameters
        self.Theta = np.zeros((self.numDocs, self.numTopics))
        self.Phi = np.zeros((self.numTopics, self.words_count))
        self.Psi = np.zeros(self.numDocs)
        print("//////////// Start the Initializations ////////////")
        # Initialize: randomly assign topic and trigger
        self.initialize(users_lists)
        for iteration in tqdm(range(iter_times)):
            for user in range(self.numDocs):
                u_list = users_lists[user]
                for i in range(len(u_list)):
                    w = u_list[i]  # w: item id, i: order in list
                    # Take the current assignment out of the counters.
                    self._shift_counts(user, w, self.Z_post[user][i], self.Trigger[user][i], -1)
                    prob_topictrigger = self.gibbsSampler(user, i)
                    if i < 1:
                        # First item: only the non-sticky column is valid.
                        topic_new = sampleFromMultinomial(prob_topictrigger[:, 0])
                        trigger_new = 0
                    else:
                        # Column-major flatten: indices [0, K) are non-sticky
                        # topics, indices [K, 2K) are sticky topics.
                        prob_gibbs = prob_topictrigger.flatten('F')
                        topic_trigger = sampleFromMultinomial(prob_gibbs)
                        if topic_trigger >= self.numTopics:
                            topic_new = topic_trigger - self.numTopics
                            trigger_new = 1
                        else:
                            topic_new = topic_trigger
                            trigger_new = 0
                    # Save the new assignment and put it back into the counters.
                    self.Z_post[user][i] = topic_new
                    self.Trigger[user][i] = trigger_new
                    self._shift_counts(user, w, topic_new, trigger_new, 1)

    def params_estimate(self):
        """Compute ``Theta``, ``Psi`` and ``Phi`` from the current counters.

        ``Theta[user]`` is the smoothed topic share of the user, ``Psi[user]``
        the smoothed share of sticky positions (the denominator uses
        ``n_doc - 1``, i.e. the list length without the first item), and
        ``Phi[topic]`` the smoothed item distribution of the topic.
        Must be called after ``fit``.
        """
        print("////////////   Estimate the Posterior  ////////////")
        ## Theta & Psi
        topic_counts = self.n_doc_topic_trigger[:, :, 0] + self.n_doc_topic_trigger[:, :, 1]
        doc_totals = self.sum_doc[:, 0] + self.sum_doc[:, 1] + self.sum_alpha
        self.Theta[...] = (topic_counts + self.alpha) / doc_totals[:, None]
        self.Psi[...] = (self.gamma[0] + self.n_doc_trigger[:, 1]) / (self.n_doc - 1 + self.sum_gamma)
        ## Phi
        self.Phi[...] = (self.n_topic_word.T + self.beta) / (self.sum_topic[:, None] + self.sum_beta)

    def negative_recommend(self, negative_sample, TopN):
        """Rank each user's candidate items and keep the top ``TopN``.

        A candidate's score mixes the non-sticky path (topic drawn from
        ``Theta[user]``) and the sticky path (topic of the user's last
        position), weighted by ``1 - Psi[user]`` and ``Psi[user]``.
        Candidates equal to ``-1`` get score 0.

        Args:
            negative_sample: per-user sequences of candidate item ids.
            TopN: number of candidates to keep per user.

        The result is stored in ``self.recommendation``: for every user, the
        positions (indices into that user's candidate list, not item ids) of
        the highest-scoring candidates, best first.
        """
        print('////////////   Negative Sample Recommendation  ////////////')
        self.recommendation = []
        for user in tqdm(range(self.numDocs)):
            p_zu = self.Theta[user]
            psi = self.Psi[user]
            topic_n_1 = self.Z_post[user][-1]
            u_negative = negative_sample[user]
            prob_list = np.zeros(len(u_negative))
            for i in range(len(u_negative)):
                item = u_negative[i]
                if item != -1:
                    p_gz = self.Phi[:, item]
                    prob_list[i] = (1 - psi) * np.sum(p_gz * p_zu) + psi * p_gz[topic_n_1]
                # else: the item only appears in Testset, score stays 0
            recommend_ranklist = np.argsort(-prob_list)[:TopN].tolist()
            self.recommendation.append(recommend_ranklist)

    def read_Theta(self):
        """Return the user-topic matrix ``Theta``."""
        return self.Theta

    def read_Phi(self):
        """Return the topic-item matrix ``Phi``."""
        return self.Phi

    def read_Psi(self):
        """Return the per-user sticky probabilities ``Psi``."""
        return self.Psi

    def read_negative_recommend(self):
        """Return the rankings stored by ``negative_recommend``."""
        return self.recommendation

Note: PE-LDA was implemented in C for our experiments. The C implementation is available upon reasonable request.