In [1]:
import pymc as pm
import re
import numpy as np

def build_word_representation(corpus, useRegex):
   
    results = [] 
    words_index = dict()
    index = 0
    regex = r"[\w'-]+"
    
    for sentence in corpus:
        result = []
        if useRegex == True:
            words = re.findall(regex, sentence)
        else:
            words = sentence
        for word in words:
            if word in words_index:
                result.append(words_index[word])
            else:
                result.append(index)
                words_index[word] = index
                index = index + 1
        results.append(result)
    return results


def build_model(data,K,M,V):
  
    alpha = np.ones(K) * 0.8
    beta = np.ones(V + 1) * 0.8
    N = [len(sent) for sent in data]
   
    theta = [pm.CompletedDirichlet("theta_%s" % m, pm.Dirichlet("ptheta_%s" % m, theta=alpha)) for m in range(M)]
    phi = [pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)]
    z = [pm.Categorical('z_%i' % m, p = theta[m], size = N[m], value=np.random.randint(K, size=N[m])) for m in range(M)]
    w = [pm.Categorical("w_%i_%i" % (m,i),
                        p = pm.Lambda('phi_z_%i_%i' % (m,i),lambda z=z[m][i], phi=phi: phi[z]),
                        value=data[m][i],
                        verbose = 0,
                        observed=True)
                        for m in range(M) for i in range(N[m])]
    
    model = pm.Model([theta, phi, z, w])
    mcmc = pm.MCMC(model)
    mcmc.sample(2000,100)
    return mcmc



<h1> Task 1 </h1>

In [35]:
corpus = [
    "I had a peanuts butter sandwich for breakfast.",
    "I like to eat almonds, peanuts and walnuts.",
    "My neighbor got a little dog yesterday.",
    "Cats and dogs are mortal enemies.",
    "You mustn't feed peanuts to your dog."
]

corpus_vectorized = build_word_representation(corpus, True)
K = 2
M = 5
V = 29
mcmc = build_model(corpus_vectorized, K, M, V)

print()
print()
print("Theta's:")
for i in range(M):
    theta_trace = mcmc.trace('theta_%s' % i)[:]
    print(theta_trace.mean(axis = 0))

print()
print("Phi's:")
for i in range(K):
    phi_trace = mcmc.trace('phi_%s' % i)[:]
    print(phi_trace.mean(axis = 0))




 [-----------------100%-----------------] 2000 of 2000 complete in 1.2 sec

Theta's:
[[0.18311101 0.81688899]]
[[0.00897481 0.99102519]]
[[0.17312074 0.82687926]]
[[0.43251347 0.56748653]]
[[0.99554655 0.00445345]]

Phi's:
[[1.59751158e-02 1.15919509e-02 5.73421893e-02 7.12831787e-02
  5.71275077e-02 4.30299938e-03 1.97930561e-04 1.71261031e-01
  3.61691114e-02 7.62104130e-03 5.02467438e-04 1.47628768e-02
  3.49580088e-03 2.52839887e-02 7.35058498e-02 9.14789298e-02
  6.19845264e-03 1.44393381e-02 1.20979253e-01 3.32797807e-03
  7.92461640e-03 2.34542201e-03 5.43869785e-02 8.93076399e-02
  3.79945613e-03 8.44718899e-03 5.87189982e-03 2.62065738e-02
  1.48453280e-02 1.79055026e-05]]
[[4.48468459e-05 6.57943952e-02 3.01193107e-02 1.02267962e-05
  1.93969139e-01 4.63862830e-04 2.23943287e-02 4.03122184e-02
  2.27185479e-02 5.37875866e-02 2.15733866e-02 4.20036754e-03
  3.37739578e-02 2.78319565e-02 3.56340503e-02 1.97730968e-02
  7.69327904e-03 2.04916582e-04 8.92904279e-02 3.95571030e-02

In [36]:
import pandas as pd
data = []

for i in range(M):
    row = np.mean(mcmc.trace('theta_%s' % i)[:], axis=0).tolist()
    data.append(row)


pd.options.display.float_format = '{:,.3f}'.format    
df = pd.DataFrame(np.array(data).reshape(5,2), columns=['$\\boldsymbol\\theta_{d,0}$', 
                                 '$\\boldsymbol\\theta_{d,1}$'])
df.index.name = 'Document'
df

Unnamed: 0_level_0,"$\boldsymbol\theta_{d,0}$","$\boldsymbol\theta_{d,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.183,0.817
1,0.009,0.991
2,0.173,0.827
3,0.433,0.567
4,0.996,0.004


<h1> Sanity Check </h1>

In [3]:
docs = [["aaa", "bbb", "aaa"],
        ["bbb", "aaa", "bbb"],
        ["aaa", "bbb", "bbb", "aaa"],
        ["uuu", "vvv"],
        ["uuu", "vvv", "vvv"],
        ["uuu", "vvv", "vvv", "uuu"]]

docs_vectorized = build_word_representation(docs, False)
K = 2
M = 6
V = 5
mcmc = build_model(docs_vectorized, K, M, V)

print()
print()
print("Theta's:")
for i in range(M):
    theta_trace = mcmc.trace('theta_%s' % i)[:]
    print(theta_trace.mean(axis = 0))

print()
print("Phi's:")
for i in range(K):
    phi_trace = mcmc.trace('phi_%s' % i)[:]
    print(phi_trace.mean(axis = 0))
    
print()
print("Z's:")
for i in range(len(docs)):
    z_trace = mcmc.trace('z_%s' % i)[:]
    print(np.round(z_trace.mean(axis = 0)))



 [-----------------100%-----------------] 2000 of 2000 complete in 1.3 sec

Theta's:
[[0.19225862 0.80774138]]
[[0.05607777 0.94392223]]
[[0.17462872 0.82537128]]
[[0.27257018 0.72742982]]
[[0.88166269 0.11833731]]
[[0.90178246 0.09821754]]

Phi's:
[[0.04461403 0.01744699 0.06397167 0.03671182 0.41123429 0.4260212 ]]
[[0.37599832 0.01688767 0.0547564  0.00696405 0.19107649 0.35431708]]

Z's:
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1. 1.]
[1. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]


In [33]:
import pandas as pd
data = []

for i in range(M):
    row = np.mean(mcmc.trace('theta_%s' % i)[:], axis=0).tolist()
    data.append(row)


pd.options.display.float_format = '{:,.3f}'.format    
df = pd.DataFrame(np.array(data).reshape(6,2), columns=['$\\boldsymbol\\theta_{d,0}$', 
                                 '$\\boldsymbol\\theta_{d,1}$'])
df.index.name = 'Document'
df

Unnamed: 0_level_0,"$\boldsymbol\theta_{d,0}$","$\boldsymbol\theta_{d,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.192,0.808
1,0.056,0.944
2,0.175,0.825
3,0.273,0.727
4,0.882,0.118
5,0.902,0.098
