In [74]:
import pymc as pm
import re
import numpy as np
import pandas as pd

def build_word_representation(corpus, useRegex):
   
    results = [] 
    words_index = dict()
    index = 0
    regex = r"[\w'-]+"
    
    for sentence in corpus:
        result = []
        if useRegex == True:
            words = re.findall(regex, sentence)
        else:
            words = sentence
        for word in words:
            if word in words_index:
                result.append(words_index[word])
            else:
                result.append(index)
                words_index[word] = index
                index = index + 1
        results.append(result)
    return results


def build_model(data,K,M,V, training, burned):
  
    alpha = np.ones(K) * 0.8
    beta = np.ones(V + 1) * 0.8
    N = [len(sent) for sent in data]
   
    theta = [pm.CompletedDirichlet("theta_%s" % m, pm.Dirichlet("ptheta_%s" % m, theta=alpha)) for m in range(M)]
    phi = [pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)]
    z = [pm.Categorical('z_%i' % m, p = theta[m], size = N[m], value=np.random.randint(K, size=N[m])) for m in range(M)]
    w = [pm.Categorical("w_%i_%i" % (m,i),
                        p = pm.Lambda('phi_z_%i_%i' % (m,i),lambda z=z[m][i], phi=phi: phi[z]),
                        value=data[m][i],
                        verbose = 0,
                        observed=True)
                        for m in range(M) for i in range(N[m])]
    
    model = pm.Model([theta, phi, z, w])
    mcmc = pm.MCMC(model)
    mcmc.sample(training, burned)
    return mcmc

def trace_variables(mcmc,M,K,no_samples):
    print()
    print()
    print("Theta's:")
    for i in range(M):
        theta_trace = mcmc.trace('theta_%s' % i)[:]
        print(theta_trace.mean(axis = 0))

    print()
    print("Phi's:")
    for i in range(K):
        phi_trace = mcmc.trace('phi_%s' % i)[:]
        print(phi_trace.mean(axis = 0))

    print()
    print("Z's:")
    for i in range(M):
        z_trace = mcmc.trace('z_%s' % i)[:]
        print(np.round(z_trace.mean(axis = 0)))

def build_dataframe(mcmc,M,K, no_samples):
    
    data = []

    for i in range(M):
        row = mcmc.trace('theta_%s' % i)[:][no_samples - 1]
        data.append(row)


    pd.options.display.float_format = '{:,.3f}'.format
    df = pd.DataFrame(np.array(data).reshape(M,K), columns=['$\\boldsymbol\\theta_{m,0}$', 
                                     '$\\boldsymbol\\theta_{m,1}$'])
    df.index.name = 'Document'
    return df

<h1> Task 1 </h1>

In [40]:
corpus = [
    "I had a peanuts butter sandwich for breakfast.",
    "I like to eat almonds, peanuts and walnuts.",
    "My neighbor got a little dog yesterday.",
    "Cats and dogs are mortal enemies.",
    "You mustn't feed peanuts to your dog."
]

corpus_vectorized = build_word_representation(corpus, True)
K = 2
M = 5
V = 29
training = 5000
burned = 1000
mcmc = build_model(corpus_vectorized, K, M, V, training, burned)
trace_variables(mcmc,M,K,training - burned)
df = build_dataframe(mcmc,M,K, training - burned)
df

 [-----------------100%-----------------] 5000 of 5000 complete in 3.7 sec

Theta's:
[[0.78303718 0.21696282]]
[[0.79069365 0.20930635]]
[[0.16002058 0.83997942]]
[[0.91559649 0.08440351]]
[[0.83606897 0.16393103]]

Phi's:
[[0.02187799 0.00243291 0.02201912 0.04629614 0.00049322 0.00747129
  0.0758531  0.12606074 0.00459242 0.04577137 0.03913143 0.00379505
  0.02554135 0.037047   0.12656128 0.04369714 0.00935809 0.00013265
  0.00394474 0.05547928 0.05750463 0.07869561 0.00953384 0.00193986
  0.0007641  0.00622885 0.00148163 0.04430693 0.01991454 0.08207371]]
[[0.01133821 0.01582584 0.04909613 0.15106791 0.03339699 0.01395981
  0.03690806 0.07989552 0.02220419 0.03284021 0.05101098 0.01754229
  0.01381916 0.00043307 0.00250433 0.01049601 0.02174804 0.02710766
  0.04184942 0.01164135 0.0022886  0.01853047 0.00244392 0.13710293
  0.02674447 0.01437732 0.00149294 0.01912269 0.01417451 0.11903697]]

Z's:
[0 1 1 0 1 1 0 0]
[0 1 0 0 0 0 0 0]
[0 1 1 0 1 1 0]
[0 0 0 0 1 1]
[0 0 0 0 0 0 0]


Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.783,0.217
1,0.791,0.209
2,0.16,0.84
3,0.916,0.084
4,0.836,0.164


<h1> Sanity Check </h1>

In [75]:
docs = [["aaa", "bbb", "aaa"],
        ["bbb", "aaa", "bbb"],
        ["aaa", "bbb", "bbb", "aaa"],
        ["uuu", "vvv"],
        ["uuu", "vvv", "vvv"],
        ["uuu", "vvv", "vvv", "uuu"]]

docs_vectorized = build_word_representation(docs, False)
K = 2
M = 6
V = 5
training = 5000
burned = 1000
mcmc = build_model(docs_vectorized, K, M, V, training, burned)
trace_variables(mcmc,M,K,training - burned)
df = build_dataframe(mcmc,M,K, training - burned)
df

 [-----------------100%-----------------] 5000 of 5000 complete in 3.9 sec

Theta's:
[[0.09270071 0.90729929]]
[[0.18804699 0.81195301]]
[[0.55955093 0.44044907]]
[[0.74353424 0.25646576]]
[[0.94272263 0.05727737]]
[[0.82552969 0.17447031]]

Phi's:
[[0.00628777 0.09357907 0.53958113 0.23587111 0.12395798 0.00072295]]
[[0.11441991 0.11576845 0.13019259 0.03179911 0.30145869 0.30636124]]

Z's:
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1. 1.]
[0. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]


Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.093,0.907
1,0.188,0.812
2,0.56,0.44
3,0.744,0.256
4,0.943,0.057
5,0.826,0.174
