In [1]:
%matplotlib inline
import re
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import joblib
from joblib import Parallel, delayed
from tqdm.notebook import tqdm as tqdm
from tqdm.notebook import trange
import contextlib
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd
import edward2 as ed
import tensorflow as tf
from scipy.special import digamma
from pickle import dump, load
from scipy.sparse import csr_matrix
import tensorflow_probability as tfp
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
import os
import sys
import time

# Expose only CUDA device 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpus = tf.config.experimental.list_physical_devices('GPU')
# Turn on memory growth for every GPU TensorFlow reports.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# Environment note: TensorFlow does not work with new NumPy versions
# (the exact versions are not recorded in this notebook).





In [2]:
# Register tqdm with pandas so that `Series.progress_apply` (used when encoding
# the corpus) is available.
tqdm.pandas()

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Temporarily make joblib report finished batches to ``tqdm_object``.

    While the context is active, ``joblib.parallel.BatchCompletionCallBack`` is
    replaced by a subclass that advances the progress bar by the batch size
    before delegating to the original callback. On exit the original class is
    restored and the progress bar is closed, even if the body raised.

    Yields:
        The same ``tqdm_object`` that was passed in.
    """
    original_callback_cls = joblib.parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback_cls):
        def __call__(self, *args, **kwargs):
            # Advance the bar first, then let joblib do its normal bookkeeping.
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = original_callback_cls
        tqdm_object.close()

In [3]:
# Fetch the NLTK resources used by this notebook: WordNet (for the
# WordNetLemmatizer), the stop-word lists, and the punkt tokenizer models.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/iron/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/iron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/iron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# data_proc = pd.read_pickle('data_proc.pkl')
# data_enc = pd.read_pickle('data.pkl')
# with open('word_to_idx.pkl', 'rb') as f:
#     words_to_idx = load(f)

In [5]:
# Sample size used by the (commented-out) single-run cells; the experiment
# loop further down rebinds `n` for each corpus size.
n = 4000

# Single source of truth for the random seed; reused for the dataset shuffle
# instead of repeating the literal.
seed = 42
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=seed)

In [6]:
# data = pd.Series(data_train.data).sample(n).copy()
# data.reset_index(drop=True, inplace=True)

In [7]:
# Ordered filter pipeline handed to gensim's `preprocess_string` by
# `text_processing`: strip markup tags, digits and punctuation, lower-case,
# drop very short words, collapse whitespace, then remove stop words.
# Despite the name, no stemming filter is included in this list.
clean_stem_filters = [strip_tags,
                        strip_numeric,
                        strip_punctuation, 
                        lambda x: x.lower(),
                        # Replace every word of one or two word-characters with a space.
                        lambda s: re.sub(r'\b\w{1,2}\b', ' ', s),
                        strip_multiple_whitespaces,
                        remove_stopwords
                     ]

def text_processing(document):
    """Clean one raw document and return its lemmatized tokens.

    The document is run through ``clean_stem_filters`` via gensim's
    ``preprocess_string``; every resulting token is then passed through a
    ``WordNetLemmatizer``.

    Args:
        document: Raw document text.

    Returns:
        list: Lemmatized tokens, in document order.
    """
    tokens = preprocess_string(document, clean_stem_filters)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

In [8]:
def proc_func(data):
    """Preprocess every document of ``data`` while showing a progress bar.

    Args:
        data: pandas Series of raw document strings (its index is reused).

    Returns:
        pandas.Series named ``'data'`` holding one token list per document,
        aligned with ``data.index``.
    """
    jobs = (delayed(text_processing)(text) for text in data)
    # joblib batch completions are routed into the tqdm bar for the duration
    # of the run.
    with tqdm_joblib(tqdm(desc="Preprocessing", total=len(data))):
        processed = Parallel(n_jobs=1)(jobs)
    return pd.Series(processed, index=data.index, name='data')

In [9]:
# data_proc = proc_func(data)
# data_proc.head()

In [10]:
# def preproc_func(text, stop_words):
#     text = text.lower()
#     text = re.sub(r'(\d+)', '', text)
#     text = re.sub(r'(\n)|(\t)', ' ', text)
#     text = text.translate({ ord(c): None for c in string.punctuation })
#     text = text.strip()
    
#     lemmatizer = WordNetLemmatizer()
#     tokens = word_tokenize(text)
#     text = [i for i in tokens if not i in stop_words and len(i) > 1]
#     text = [lemmatizer.lemmatize(word) for word in text]
#     return text

# with tqdm_joblib(tqdm(desc="Preprocessing", total=len(data_n))) as progress_bar:
#     stop_words = set(stopwords.words('english'))
#     data_proc = Parallel(n_jobs=1)(delayed(preproc_func)(text, stop_words) for text in data_n['paper_text'])
#     data_proc = pd.Series(data_proc, index=data_n.index, name='paper_text')
# data_proc.head()

In [11]:
def encode2(text, word_dict):
    """Convert a tokenized document into an array of vocabulary indices.

    Args:
        text: Sequence of tokens.
        word_dict: Object exposing ``doc2idx(tokens)`` (e.g. a gensim
            ``Dictionary``).

    Returns:
        numpy.ndarray with whatever ``word_dict.doc2idx(text)`` produced.
    """
    token_ids = word_dict.doc2idx(text)
    return np.asarray(token_ids)

In [12]:
# word_dict = Dictionary(data_proc)
# data_enc = data_proc.progress_apply(lambda x: encode2(x, word_dict))

In [13]:
# data_proc.to_pickle('data_proc.pkl')
# data_enc.to_pickle('data.pkl')

# with open('word_to_idx.pkl', 'wb') as f:
#     dump(words_to_idx, f)

In [14]:
# K = 10
# D = len(data_enc)
# Ns = data_enc.apply(lambda x: len(x)).to_numpy().astype(int)
# N = Ns.sum()
# V = len(word_dict)

In [15]:
# print(K, D, Ns, N, V)

In [16]:
def create_indices(data, D, K, V):
    """Build the flat index arrays used to gather per-token quantities.

    For a corpus with ``N`` tokens in total, every token is expanded into ``K``
    consecutive rows (one per topic), giving ``N * K`` index pairs.

    Args:
        data: Iterable of ``D`` documents, each a 1-D sequence of integer
            word ids.
        D: Number of documents; must equal ``len(data)``.
        K: Number of topics.
        V: Vocabulary size. Unused here; kept for interface compatibility.

    Returns:
        tuple ``(left_indices, dt_indices, N, Ns)`` where

        * ``left_indices`` has shape ``(N * K, 2)`` with rows
          ``(topic, word_id)``,
        * ``dt_indices`` has shape ``(N * K, 2)`` with rows
          ``(document, topic)``,
        * ``N`` is the total number of tokens,
        * ``Ns`` is the length-``D`` array of per-document token counts.

    Raises:
        ValueError: If ``data`` does not contain exactly ``D`` documents.
            Previously a short corpus silently left uninitialised values
            in ``Ns``.
    """
    docs = list(data)
    if len(docs) != D:
        raise ValueError(
            "expected {} documents but got {}".format(D, len(docs)))

    Ns = np.fromiter((len(doc) for doc in docs), dtype=int)
    N = Ns.sum()

    # Document id of every token, and the word id of every token, both
    # flattened in corpus order.
    rows = np.repeat(np.arange(D, dtype=np.int64), Ns)
    if docs:
        v_cols = np.concatenate(
            [np.asarray(doc, dtype=np.int64) for doc in docs])
    else:
        # np.concatenate rejects an empty list, so handle the empty corpus.
        v_cols = np.empty(0, dtype=np.int64)

    # 0..K-1 repeated once per token, so each token gets one row per topic.
    K_idx = np.tile(np.arange(K), N)
    left_indices = np.stack((K_idx, np.repeat(v_cols, K)), axis=1)
    dt_indices = np.stack((np.repeat(rows, K), K_idx), axis=1)

    return left_indices, dt_indices, N, Ns

In [17]:
# Did not optimize much, as we use it only once
@tf.function
def init_matrices_internal(data_ragged, alpha, beta, D, N, K, V, Ns):
    """Draw initial topic assignments and build the initial count matrices.

    Args:
        data_ragged: Ragged tensor of word ids, indexed by document.
        alpha: Dirichlet concentration for topics
            (assumed to be a length-K vector - TODO confirm).
        beta: Not used by this function.
        D: Number of documents.
        N: Total number of tokens.
        K: Number of topics.
        V: Vocabulary size.
        Ns: Per-document token counts, used as ragged row lengths.

    Returns:
        Tuple ``(dt, wt, dw, pz, data_ragged)``: the (D, K) document-topic
        counts, the (K, V) topic-word counts, the per-token topic assignments,
        a row-normalised random (N, K) matrix, and the input ragged tensor.
    """
    dt = tf.zeros((D, K), dtype=np.int64)
    wt = tf.zeros((K, V), dtype=np.int64)  
    
    # One copy of alpha per token; a Dirichlet draw per row, then one
    # categorical topic draw per row.
    alpha_d = tf.repeat(tf.expand_dims(alpha, 0), N, axis=0)
    tdp = ed.Dirichlet(alpha_d)
    dw = ed.Categorical(probs=tdp)
    # Regroup the flat per-token assignments by document.
    dw_r = tf.RaggedTensor.from_row_lengths(dw, Ns)

    upd_cond = lambda i, dt, wt: i < D
    def upd_body(i, dt, wt):
        # Topic assignments of document i and how often each topic occurs.
        td = dw_r[i]
        y, idx, counts = tf.unique_with_counts(td, out_idx=tf.int64)

        # (i, topic) index pairs for the topics present in this document.
        didx = tf.stack((tf.fill((tf.size(y),), i), y), axis=1)

        dt = tf.tensor_scatter_nd_update(dt, didx, counts)
        
        # (topic, word) pair of every token; add one count per token.
        tidx = tf.stack((tf.cast(td, dtype=tf.int64), data_ragged[i]), axis=1) 
        wt = tf.tensor_scatter_nd_add(wt, tidx, tf.ones(tf.size(data_ragged[i]), dtype=np.int64))
        return i + 1, dt, wt

    j = tf.constant(0)
    j, dt, wt = tf.while_loop(upd_cond, upd_body, [j, dt, wt])
    
    # NOTE(review): presumably this converts the edward2 random variable into
    # a plain tensor before it is returned - confirm.
    dw = tf.concat(dw, 0)
    # Random positive (N, K) matrix, normalised so each row sums to one.
    pz = tf.random.uniform((N, K)) + 1e-5
    pz /= tf.math.reduce_sum(pz, axis=1, keepdims=True)
         
    return dt, wt, dw, pz, data_ragged


def init_matrices(data, alpha, beta, D, N, K, V, Ns):
    """Convert the encoded corpus to a ragged tensor and initialise the sampler.

    Each document in ``data`` becomes an int64 tensor; the documents are then
    stacked into one ragged tensor and forwarded, together with all remaining
    arguments, to ``init_matrices_internal``.

    Returns:
        Whatever ``init_matrices_internal`` returns.
    """
    doc_tensors = [tf.convert_to_tensor(doc, dtype=tf.int64) for doc in data]
    data_ragged = tf.ragged.stack(doc_tensors)
    return init_matrices_internal(data_ragged, alpha, beta, D, N, K, V, Ns)
    
    

In [18]:
# alpha = tf.zeros((K,)) + 0.5 #tf.random.uniform((K,))
# beta = tf.zeros(()) + 0.5 #tf.random.uniform(())

In [19]:
@tf.function
def log_likelihood(beta, wt, K, V):
    """Score the topic-word counts under a symmetric ``beta`` prior.

    Computes ``sum_k lbeta(wt[k] + beta) - K * lbeta([beta] * V)``.

    Args:
        beta: Scalar tensor, the symmetric prior value.
        wt: (K, V) topic-word count matrix (cast to float32 internally).
        K: Number of topics.
        V: Vocabulary size.

    Returns:
        Scalar float32 tensor.
    """
    counts = tf.cast(wt, tf.float32)
    symmetric_prior = tf.repeat(beta[None], V)
    prior_term = K * tf.math.lbeta(symmetric_prior)
    count_term = tf.math.reduce_sum(tf.math.lbeta(counts + beta))
    return -prior_term + count_term

@tf.function
def train_internal(alpha, beta, D, N, K, V, Ns, dt, wt, dw, pz, data_ragged, left_indices, dt_indices, max_it, parallel_iterations, rtol, atol):
    """Run the sampling loop for ``max_it`` iterations.

    Each iteration proposes a new topic for every token from per-token weights
    built from the current counts, accepts or rejects each proposal with a
    Metropolis-Hastings style test, and then rebuilds the count matrices from
    scratch.

    Args:
        alpha: Topic prior (assumed length-K vector - TODO confirm).
        beta: Scalar word prior.
        D, N, K, V: Number of documents, tokens, topics and vocabulary entries.
        Ns: Per-document token counts (ragged row lengths).
        dt: (D, K) document-topic counts.
        wt: (K, V) topic-word counts.
        dw: Flat per-token topic assignments.
        pz: (N, K) matrix used as the denominator of the acceptance ratio.
        data_ragged: Ragged tensor of word ids per document.
        left_indices: (N*K, 2) ``(topic, word)`` gather indices.
        dt_indices: (N*K, 2) ``(document, topic)`` gather indices.
        max_it: Number of iterations to run.
        parallel_iterations: Forwarded to both ``tf.while_loop`` calls.
        rtol, atol: Only referenced by the commented-out convergence test
            below; currently unused.

    Returns:
        Tuple ``(dt, wt, ll)`` after the final iteration.
    """
#     train_cond = lambda i, dt, wt, dw, pz, ll_old, ll: tf.logical_and(i < max_it, 
#                                                           tf.logical_or(i == 0, 
#                                                                         tf.logical_not(tf.experimental.numpy.allclose(ll_old, ll, rtol=rtol, atol=atol))))
    # Active stopping rule: iteration count only (tolerance test disabled above).
    train_cond = lambda i, dt, wt, dw, pz, ll_old, ll: i < max_it
    
    def train_body(i, dt, wt, dw, pz, ll_old, ll):   
        ll_old = tf.identity(ll)
        dt_float = tf.cast(dt, tf.float32)
        wt_float = tf.cast(wt, tf.float32)

        # Smoothed, row-normalised topic-word weights, shape (K, V).
        term = (wt_float + beta) / (tf.math.reduce_sum(wt_float, axis=1, keepdims=True) + V * beta)
        # alpha_k * term[k, w] for every (token, topic) pair.
        left = tf.expand_dims(alpha, 1) * term
        left = tf.gather_nd(left, left_indices)
        # dt[d, k] * term[k, w] for every (token, topic) pair.
        right = tf.gather_nd(dt_float, dt_indices) * tf.gather_nd(term, left_indices)

        # Unnormalised per-token topic weights, one row per token.
        pz_new = left + right
        pz_new = tf.reshape(pz_new, (N, K))

        # Proposed topic for every token, and (token, proposed topic) index pairs.
        dw_new = ed.Categorical(probs=pz_new, dtype=tf.int32)
        dw_new_idx = tf.stack((tf.cast(tf.range(0, N), dtype=tf.int32), dw_new.value), axis=1)
        
        # MH step
        # Ratio of the new weight to `pz`, both evaluated at the proposed topic.
        # NOTE(review): `pz` is returned unchanged by this body, so the
        # denominator always comes from the initial matrix - confirm intended.
        pz_new_masked = tf.gather_nd(pz_new, dw_new_idx)
        pz_masked = tf.gather_nd(pz, dw_new_idx)
        ratios = pz_new_masked / pz_masked

        # Accept a proposal where a uniform draw does not exceed its ratio.
        u = tf.random.uniform(ratios.shape)
        mask = tf.math.less_equal(u, ratios)
        indices_upd = tf.where(mask)
        vals_upd = tf.boolean_mask(dw_new, mask)
        dw = tf.tensor_scatter_nd_update(dw, indices_upd, vals_upd)
        # Regroup assignments by document and reset both count matrices.
        dw_r = tf.RaggedTensor.from_row_lengths(dw, Ns)
        dt = tf.zeros_like(dt)
        wt = tf.zeros_like(wt)

        # Inner loop over documents; its `i`, `dt`, `wt` shadow the outer ones.
        upd_cond = lambda i, dt, wt: i < D
        def upd_body(i, dt, wt):
            # Topic assignments of document i and the count of each topic.
            td = dw_r[i]
            y, idx, counts = tf.unique_with_counts(td, out_idx=tf.int64)

            didx = tf.stack((tf.fill((tf.size(y),), i), y), axis=1)
            dt = tf.tensor_scatter_nd_update(dt, didx, counts)

            # One count per token at its (topic, word) position.
            tidx = tf.stack((tf.cast(td, dtype=tf.int64), data_ragged[i]), axis=1) 
            wt = tf.tensor_scatter_nd_add(wt, tidx, tf.ones(tf.size(data_ragged[i]), dtype=np.int64))
            
            return i + 1, dt, wt

        j = tf.constant(0)
        j, dt, wt = tf.while_loop(upd_cond, upd_body, [j, dt, wt], parallel_iterations=parallel_iterations)
        ll = log_likelihood(beta, wt, K, V)
        tf.print('Epoch:', i, 'LL =', ll)            
    
        return i + 1, dt, wt, dw, pz, ll_old, ll

    i = tf.constant(0)
    ll_old = tf.constant(-np.inf, dtype=tf.float32)
    ll = tf.constant(-np.inf, dtype=tf.float32)
    i, dt, wt, dw, pz, _, ll = tf.while_loop(train_cond, train_body, [i, dt, wt, dw, pz, ll_old, ll], parallel_iterations=parallel_iterations)
    # NOTE(review): with the count-only stopping rule this reports the number
    # of iterations run, not a detected convergence.
    tf.print('Converged in', i,  'iterations')
    return dt, wt, ll

def train(data, alpha, beta, D, K, V, max_it=1000, parallel_iterations=1, rtol=1e-4, atol=1e-3):
    """Fit the topic model on an encoded corpus and report wall-clock time.

    Args:
        data: Iterable of ``D`` documents, each a sequence of integer word ids.
        alpha: Topic prior passed to the sampler.
        beta: Scalar word prior passed to the sampler.
        D: Number of documents.
        K: Number of topics.
        V: Vocabulary size.
        max_it: Number of sampling iterations.
        parallel_iterations: Forwarded to the TensorFlow while-loops.
        rtol, atol: Forwarded to ``train_internal``.

    Returns:
        Tuple ``(dt, wt, ll)``: document-topic counts, topic-word counts and
        the final log-likelihood.
    """
    start = time.time()
    tf.print('Preparing...')
    # Build the indices from the `data` argument; the previous code read the
    # notebook-global `data_enc` here, ignoring the caller's corpus.
    left_indices, dt_indices, N, Ns = create_indices(data, D, K, V)
    dt, wt, dw, pz, data_ragged = init_matrices(data, alpha, beta, D, N, K, V, Ns)

    dt, wt, ll = train_internal(alpha, beta, D, N, K, V, Ns, dt, wt, dw, pz, data_ragged,
                                left_indices, dt_indices, max_it, parallel_iterations, rtol, atol)
    end = time.time()
    print('Time:', end-start, 's')
    return dt, wt, ll

In [20]:
# dt, wt, ll = train(data_enc, alpha, beta, D, K, V, max_it=250, parallel_iterations=2)

In [21]:
@tf.function
def get_phi_matrix(wt, beta, V):
    """Turn topic-word counts into smoothed per-topic word distributions.

    Args:
        wt: (K, V) topic-word count matrix.
        beta: Smoothing value added to every count.
        V: Vocabulary size.

    Returns:
        float32 tensor ``(wt + beta) / (row_sum(wt) + V * beta)``.
    """
    counts = tf.cast(wt, tf.float32)
    per_topic_total = tf.math.reduce_sum(counts, axis=1, keepdims=True)
    return (counts + beta) / (per_topic_total + V * beta)

@tf.function   
def get_theta_matrix(dt, alpha, K):
    """Turn document-topic counts into smoothed per-document topic weights.

    Args:
        dt: (D, K) document-topic count matrix.
        alpha: Topic prior; a leading axis is added so it broadcasts over rows.
        K: Number of topics.

    Returns:
        float32 tensor ``(dt + alpha) / (row_sum(dt) + K * alpha)``. The
        normaliser multiplies ``alpha`` elementwise by ``K``, which equals the
        sum of ``alpha`` only when all its entries are the same.
    """
    counts = tf.cast(dt, tf.float32)
    alpha_row = tf.expand_dims(alpha, 0)
    doc_totals = tf.math.reduce_sum(counts, axis=1, keepdims=True)
    return (counts + alpha_row) / (doc_totals + K * alpha_row)

In [22]:
# phi = get_phi_matrix(wt, beta, V)
# theta = get_theta_matrix(dt, alpha, K)

In [23]:
# for h in range(K):
#     print(([word_dict[i] for i in tf.math.top_k(phi[h], k=10)[1].numpy()]))

In [24]:
# # Generate doc
# theta_d = ed.Dirichlet(alpha)
# for i in range(100):
#     z_dn = ed.Categorical(probs=theta_d)
#     w_dn = ed.Categorical(probs=beta[z_dn])
#     print(word_dict[w_dn.numpy()])


In [25]:
# Get topic top words
def get_topics(phi, word_dict, k, top_n=10):
    """Extract the highest-weighted words of each topic.

    Args:
        phi: Topic-word matrix; row ``h`` holds the word weights of topic ``h``.
        word_dict: Mapping from word id to word string (indexed with ``[]``).
        k: Number of topics (rows of ``phi``) to process.
        top_n: How many top words to keep per topic. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(topics, weights)`` of NumPy arrays: the top words of every
        topic and the matching ``phi`` values, both ordered by decreasing
        weight.
    """
    topics = []
    weights = []
    for h in range(k):
        top_values, top_ids = tf.math.top_k(phi[h], k=top_n)
        topics.append([word_dict[i] for i in top_ids.numpy()])
        weights.append(list(top_values.numpy()))

    return (np.asarray(topics), np.asarray(weights))

In [26]:
# topics, weights = get_topics(phi, word_dict, K)
# topics

In [27]:
def get_coherence(topics, text_data, word_dict, coherence_type='c_v'):
    """Compute a gensim topic-coherence score.

    Args:
        topics: Topics as lists of words.
        text_data: Tokenized documents the coherence is estimated on.
        word_dict: gensim ``Dictionary`` of the corpus.
        coherence_type: Coherence measure name passed to ``CoherenceModel``.
            Defaults to ``'c_v'``. Previously this argument was ignored and
            ``'c_v'`` was always used.

    Returns:
        The coherence value reported by ``CoherenceModel.get_coherence()``.
    """
    coherence_model_lda = CoherenceModel(topics=topics, texts=text_data,
                                         dictionary=word_dict, coherence=coherence_type)
    return coherence_model_lda.get_coherence()

In [28]:
# coherence = get_coherence(topics, data_proc, word_dict, 'c_v')
# coherence

Varying topic number

In [None]:
# Experiment grid. `sample_sizes` is the number of documents per run; it used
# to be called `Ns`, which was then overwritten inside the loop by the
# per-document token counts.
sample_sizes = [4000, 2000, 1000]
Ks = range(4, 22, 3)
alphas = [0.1, 0.5]
betas = [0.1, 0.5]
result_columns = ["N", "K", "alpha", "beta", "ll", "perplexity", "coherence", "topics", "weights"]

# alpha and beta are paired through zip (0.1 with 0.1, 0.5 with 0.5), not
# crossed, giving len(Ks) * 2 configurations per corpus size.
params = []
for n in sample_sizes:
    n_params = [(k, alpha, beta) for k in Ks for alpha, beta in zip(alphas, betas)]
    params.append((n, n_params))

# The CSVs are written into this directory; create it on a fresh checkout.
os.makedirs("results", exist_ok=True)

for n, n_params in tqdm(params):
    # Seed the document sample so the corpus is the same on every run.
    data_n = pd.Series(data_train.data).sample(n, random_state=seed).copy()
    data_proc = proc_func(data_n)
    word_dict = Dictionary(data_proc)
    data_enc = data_proc.progress_apply(lambda x: encode2(x, word_dict))
    D = len(data_enc)
    Ns = data_enc.apply(len).to_numpy().astype(int)
    N = Ns.sum()
    V = len(word_dict)

    # Collect one record per configuration and build the frame once, instead
    # of growing a DataFrame row by row.
    records = []
    for k, alpha, beta in tqdm(n_params):
        np.random.seed(seed)
        # np.random.seed does not affect tf.random ops; seed TensorFlow's
        # global generator as well.
        tf.random.set_seed(seed)
        print(n, k, alpha, beta, V)
        alpha_ = tf.zeros((k,)) + alpha
        beta_ = tf.zeros(()) + beta
        _, wt, ll = train(data_enc, alpha_, beta_, D, k, V, max_it=250, parallel_iterations=2)
        phi = get_phi_matrix(wt, beta_, V)

        topics, weights = get_topics(phi, word_dict, k=k)
        # Work with a plain Python float so the stored metrics are ordinary
        # numbers rather than tensors.
        ll_value = float(ll.numpy())
        perplexity = float(np.exp(-ll_value / N))
        coherence = get_coherence(topics, data_proc, word_dict, 'c_v')

        records.append({
            "N": n,
            "K": k,
            "alpha": alpha,
            "beta": beta,
            "ll": ll_value,
            "perplexity": perplexity,
            "coherence": coherence,
            "topics": repr([list(topic) for topic in topics]),
            "weights": repr([list(weight) for weight in weights]),
        })

        print("Perplexity:", perplexity, "Coherence:", coherence)

    results = pd.DataFrame(records, columns=result_columns)
    results.to_csv("results/resultset_n{}.csv".format(n), index=False)

  0%|          | 0/3 [00:00<?, ?it/s]

Preprocessing:   0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

4000 4 0.1 0.1 46116
Preparing...


  0%|          | 0/4000 [00:00<?, ?it/s]

Epoch: 0 LL = -5229215
Epoch: 1 LL = -5224879.5
Epoch: 2 LL = -5220481.5
Epoch: 3 LL = -5216161
Epoch: 4 LL = -5211152.5
Epoch: 5 LL = -5.20592e+06
Epoch: 6 LL = -5200168.5
Epoch: 7 LL = -5193778
Epoch: 8 LL = -5.18704e+06
Epoch: 9 LL = -5179463
Epoch: 10 LL = -5170823
Epoch: 11 LL = -5161517.5
Epoch: 12 LL = -5150920.5
Epoch: 13 LL = -5139619.5
Epoch: 14 LL = -5127213
Epoch: 15 LL = -5114859.5
Epoch: 16 LL = -5.10208e+06
Epoch: 17 LL = -5089343.5
Epoch: 18 LL = -5076419.5
Epoch: 19 LL = -5063556.5
Epoch: 20 LL = -5051467.5
Epoch: 21 LL = -5039575.5
Epoch: 22 LL = -5028834
Epoch: 23 LL = -5018845
Epoch: 24 LL = -5009054.5
Epoch: 25 LL = -5000524.5
Epoch: 26 LL = -4992374.5
Epoch: 27 LL = -4984277.5
Epoch: 28 LL = -4976119.5
Epoch: 29 LL = -4968167.5
Epoch: 30 LL = -4960701.5
Epoch: 31 LL = -4952967.5
Epoch: 32 LL = -4945105.5
Epoch: 33 LL = -4937824
Epoch: 34 LL = -4929927.5
Epoch: 35 LL = -4922958.5
Epoch: 36 LL = -4916349
Epoch: 37 LL = -4909481
Epoch: 38 LL = -4903043.5
Epoch: 39 LL

  0%|          | 0/4000 [00:00<?, ?it/s]

Epoch: 0 LL = -5229123.5
Epoch: 1 LL = -5225571.5
Epoch: 2 LL = -5.22203e+06
Epoch: 3 LL = -5218353.5
Epoch: 4 LL = -5214522
Epoch: 5 LL = -5210314.5
Epoch: 6 LL = -5205636.5
Epoch: 7 LL = -5200443.5
Epoch: 8 LL = -5194699
Epoch: 9 LL = -5188539.5
Epoch: 10 LL = -5181951
Epoch: 11 LL = -5174558.5
Epoch: 12 LL = -5166658
Epoch: 13 LL = -5157575
Epoch: 14 LL = -5147818
Epoch: 15 LL = -5137938.5
Epoch: 16 LL = -5127283.5
Epoch: 17 LL = -5116609
Epoch: 18 LL = -5106463.5
Epoch: 19 LL = -5096279.5
Epoch: 20 LL = -5086691
Epoch: 21 LL = -5077648.5
Epoch: 22 LL = -5068381
Epoch: 23 LL = -5058899
Epoch: 24 LL = -5049604.5
Epoch: 25 LL = -5041427
Epoch: 26 LL = -5033509.5
Epoch: 27 LL = -5025808
Epoch: 28 LL = -5018303
Epoch: 29 LL = -5011371.5
Epoch: 30 LL = -5004355.5
Epoch: 31 LL = -4997743.5
Epoch: 32 LL = -4.99108e+06
Epoch: 33 LL = -4984588.5
Epoch: 34 LL = -4978389
Epoch: 35 LL = -4972300.5
Epoch: 36 LL = -4966532.5
Epoch: 37 LL = -4960862.5
Epoch: 38 LL = -4954885
Epoch: 39 LL = -494914

In [None]:
# Load the metrics written by the experiment loop for the 4000-document run
# and display them ranked by coherence, best first.
df_results_4000 = pd.read_csv("results/resultset_n4000.csv")
df_results_4000.sort_values(by="coherence", ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Order the rows by topic count so the line is drawn left to right.
df_results_4000.sort_values(by="K", ascending=True, inplace=True)

fig, ax = plt.subplots()
ax.plot(df_results_4000.K, df_results_4000.coherence)

# One tick per evaluated topic count, then the labels.
ax.set_xticks(df_results_4000.K)
ax.set_xlabel("Topics")
ax.set_ylabel("Coherence")
ax.set_title("Topic Number vs. Coherence - 4000 Documents")

fig.savefig("results/resultset_topicVariation_4000-TopicVsCoherence.pdf")

In [None]:
import matplotlib.pyplot as plt

# Order the rows by topic count so the line is drawn left to right.
df_results_4000.sort_values(by="K", ascending=True, inplace=True)

fig, ax = plt.subplots()

# Plot the `perplexity` column. The previous version plotted `coherence`
# under a "Perplexity" label and title.
ax.plot(df_results_4000.K, df_results_4000.perplexity)
ax.set_xlabel("Topics")
ax.set_xticks(df_results_4000.K)
ax.set_ylabel("Perplexity")

ax.set_title("Topic Number vs. Perplexity - 4000 Documents")

fig.savefig("results/resultset_topicVariation_4000-TopicVsPerplexity.pdf")

In [None]:
# Side-by-side view of the alpha and beta priors against the topic count for
# the 4000-document run.
fig, axes = plt.subplots(ncols=2, figsize=(8,4))
axes = axes.flatten()

df_results_4000.sort_values(by="K", ascending=True, inplace=True)

axes[0].plot(df_results_4000.K, df_results_4000.alpha)
axes[0].set_xlabel("Topics")
axes[0].set_ylabel("$\\alpha$")

# The results CSV stores this prior in the `beta` column; the previous
# `.eta` lookup referred to a column the experiment loop never writes.
axes[1].plot(df_results_4000.K, df_results_4000.beta)
axes[1].set_xlabel("Topics")
axes[1].set_ylabel("$\\beta$")

for ax in axes:
    ax.set_xticks(df_results_4000.K)

# Title and file name now match the 4000-document data being plotted
# (they previously said 25 documents).
fig.suptitle("Optimized priors vs. Topics - 4000 documents")

plt.tight_layout()

fig.savefig("results/resultset_topicVariation_4000-TopicVsOptPriors.pdf")

In [None]:
# Load the 50-document result set and display it ranked by coherence.
# NOTE(review): this file name does not match the `resultset_n{}.csv` pattern
# written by the experiment loop in this notebook; presumably it comes from a
# different run - confirm the file exists.
df_results_50 = pd.read_csv("results/resultset_topicVariation_50.csv")
df_results_50.sort_values(by="coherence", ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Order the rows by topic count so the line is drawn left to right.
df_results_50.sort_values(by="K", ascending=True, inplace=True)

fig, ax = plt.subplots()
ax.plot(df_results_50.K, df_results_50.coherence)

# One tick per evaluated topic count, then the labels.
ax.set_xticks(df_results_50.K)
ax.set_xlabel("Topics")
ax.set_ylabel("Coherence")
ax.set_title("Topic Number vs. Coherence - 50 Documents")

fig.savefig("results/resultset_topicVariation_50-TopicVsCoherence.pdf")

In [None]:
# Side-by-side view of the alpha and eta priors against the topic count for
# the 50-document result set.
fig, axes = plt.subplots(ncols=2, figsize=(8,4))
axes = axes.flatten()

df_results_50.sort_values(by="K", ascending=True, inplace=True)

axes[0].plot(df_results_50.K, df_results_50.alpha)
axes[0].set_xlabel("Topics")
axes[0].set_ylabel("$\\alpha$")

# NOTE(review): assumes the loaded CSV has an `eta` column - the experiment
# loop in this notebook writes `beta` instead; confirm against the file.
axes[1].plot(df_results_50.K, df_results_50.eta)
axes[1].set_xlabel("Topics")
# Backslash escaped explicitly: "\e" is not a valid escape sequence in a
# regular string literal.
axes[1].set_ylabel("$\\eta$")

for ax in axes:
    ax.set_xticks(df_results_50.K)

fig.suptitle("Optimized priors vs. Topics - 50 documents")

plt.tight_layout()

fig.savefig("results/resultset_topicVariation_50-TopicVsOptPriors.pdf")

In [None]:
# Load the 100-document result set and display it ranked by coherence.
# NOTE(review): this file name does not match the `resultset_n{}.csv` pattern
# written by the experiment loop in this notebook; presumably it comes from a
# different run - confirm the file exists.
df_results_100 = pd.read_csv("results/resultset_topicVariation_100.csv")
df_results_100.sort_values(by="coherence", ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Order the rows by topic count so the line is drawn left to right.
df_results_100.sort_values(by="K", ascending=True, inplace=True)

fig, ax = plt.subplots()
ax.plot(df_results_100.K, df_results_100.coherence)

# One tick per evaluated topic count, then the labels.
ax.set_xticks(df_results_100.K)
ax.set_xlabel("Topics")
ax.set_ylabel("Coherence")
ax.set_title("Topic Number vs. Coherence - 100 Documents")

fig.savefig("results/resultset_topicVariation_100-TopicVsCoherence.pdf")

In [None]:
# Side-by-side view of the alpha and eta priors against the topic count for
# the 100-document result set.
fig, axes = plt.subplots(ncols=2, figsize=(8,4))
axes = axes.flatten()

df_results_100.sort_values(by="K", ascending=True, inplace=True)

axes[0].plot(df_results_100.K, df_results_100.alpha)
axes[0].set_xlabel("Topics")
axes[0].set_ylabel("$\\alpha$")

# NOTE(review): assumes the loaded CSV has an `eta` column - the experiment
# loop in this notebook writes `beta` instead; confirm against the file.
axes[1].plot(df_results_100.K, df_results_100.eta)
axes[1].set_xlabel("Topics")
# Backslash escaped explicitly: "\e" is not a valid escape sequence in a
# regular string literal.
axes[1].set_ylabel("$\\eta$")

for ax in axes:
    ax.set_xticks(df_results_100.K)

fig.suptitle("Optimized priors vs. Topics - 100 documents")

plt.tight_layout()

fig.savefig("results/resultset_topicVariation_100-TopicVsOptPriors.pdf")

In [None]:
# MAX_IT = 10
# EPS = 0.001
# tf.executing_eagerly()
# optim = tf.keras.optimizers.Adam(1e-3)
# # B = ed.Dirichlet(concentration=tf.fill([K, V], 0.1), name="topics")
# # Z = ed.DirichletMultinomial(tf.convert_to_tensor(Ns), concentration=tf.fill([D, K], 0.1))
# alpha = np.copy(alpha_n).astype(np.float32)
# eta = np.copy(eta_n).astype(np.float32)

# beta = np.copy(beta_n).astype(np.float32)
# phi = [np.full((n, K), 1/K).astype(np.float32) for n in Ns]
# gamma = np.copy(gamma_n).astype(np.float32)
# lmbd = np.copy(lmbd_n).astype(np.float32)

# bb = None
# gg = None
# ww = None

# class Positive(tf.keras.constraints.Constraint):
#     def __call__(self, w):
#         return w * tf.cast(tf.math.greater(w, 0.), w.dtype)

# bb = []
# for it in trange(MAX_IT):
#     bb2 = []
#     gg2 = []
#     ww2 = []
#     print('before', gamma)
#     for d in range(D):
#         for n in range(Ns[d]):
#             for i in range(K):
#                 phi[d][n, i] = beta[i, data_enc.iloc[d][n]] * np.exp(digamma(gamma[d, i]) - digamma(np.sum(gamma[d])))
#         phi[d] /= np.sum(phi[d], axis=-1, keepdims=True) + 1e-5
        
#         for i in range(K):
#             gamma[d, i] = alpha[i] + np.sum(phi[d][:, i])
#     print('after', gamma)

#     lmbd = np.full((K, V), eta)
#     for i in range(K):
#         for j in range(V):
#             for d in range(D):
#                 mask = (data_enc.iloc[d] == j)
#                 lmbd[i, j] += np.sum(phi[d][:, i]*mask)
    
     
# #     if bb is None:
# #         bb = bb2
# #         gg = gg2
# #         ww = ww2
# #         break
                
#     alpha_t = tf.Variable(alpha, trainable=True, constraint=Positive())
#     gamma_t = tf.convert_to_tensor(gamma, dtype=tf.float32)
    
    
#     def f_x():
#         g_term = tf.math.reduce_sum(tf.expand_dims((alpha_t - 1), 0)*(tf.math.digamma(gamma_t) - 
#                                                    tf.math.digamma(tf.math.reduce_sum(gamma_t, axis=1, keepdims=True))), axis=1)
#         loss = -tf.math.reduce_sum(tf.math.lgamma(tf.math.reduce_sum(alpha_t)) - tf.math.reduce_sum(tf.math.lgamma(alpha_t)) + g_term)
#         return loss
    
#     for itt in range(10):
#         for i in range(K):
#             for itt1 in range(50):
#                 #with tf.GradientTape() as tape:
#                 optim.minimize(f_x, [alpha_t])
# #                 grads = tape.gradient(loss, opt_a)
# #                 optim.apply_gradients([(grads, opt_a)])
#                 alpha[i] = alpha_t.numpy()[i]
#                 np.nan_to_num(alpha, copy=False, nan=1e-5)
#                 alpha_t.assign(alpha)
#         print(alpha_t)
#     beta = (lmbd - eta) / (np.sum(lmbd - eta, axis=-1, keepdims=True) + 1e-5)
#     break
    
#     eta_t = tf.Variable(eta, trainable=True, constraint=Positive())
    
#     @tf.function
#     def f_eta():
#         loss = K*((eta_t-1)*(tf.math.digamma(eta_t) - tf.math.digamma(eta_t*V)) + tf.math.lgamma(eta_t*V) - V*tf.math.lgamma(eta))
#         return loss
    
#     for itt1 in range(50):
#         optim.minimize(f_eta, [eta_t])
#     eta = eta_t.numpy()

    