In [9]:
import numpy as np
import pymc as pm
from matplotlib import pyplot as plt
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

numberOfDocs = 10
fileNames = [f"input/{x}.txt" for x in range(numberOfDocs)]

In [58]:
def read_files():
    files = [None] * numberOfDocs
    stopWords = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    for i in range(numberOfDocs):
        with open(fileNames[i], errors = 'ignore') as file:
            files[i] = file.read().replace('\n', '').lower()
        wordTokens = tokenizer.tokenize(files[i])
#         files[i] = wordTokens
        files[i] = [stemmer.stem(word) for word in wordTokens if word not in stopWords and len(word) > 1]    
    return files

In [76]:
docs = read_files()
# print(docs)
dic = {}
count = 0
for i in range(len(docs)):
    for j in range(len(docs[i])):
#         print(docs[i][j])
        if dic.get(docs[i][j]) != None:
            val = dic.get(docs[i][j])
            dic[docs[i][j]] = (int(val[0]), int(val[1]) + 1)
#             docs[i][j] = int(val[0])
        else:
            dic[docs[i][j]] = (int(count), 1)
#             docs[i][j] = int(count)
            count += 1
docs = np.array(docs)
# print(dic)
freq = []
for val in dic.values():
    freq.append(int(val[1]))
max(freq)
freq = [x for x in freq if x > 10]
print(len(dic))
newDic = {}
count = 0
for key, el in dic.items():
    if(int(el[1]) > 10):
        newDic[key] = (count, el[1])
        count += 1
print(len(newDic))
dic = newDic
for i in range(len(docs)):
    docs[i] = [dic.get(x)[0] for x in docs[i] if dic.get(x) != None]
# print(docs)

1521
64


In [80]:
numberOfTopics = 3
numberOfWords = len(dic) + 1

# numberOfTopics = 2
# numberOfWords = 4
# numberOfDocs = 6
# docs = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 1, 0], [2, 3], [2, 3, 3], [2, 3, 3, 2]])

alpha = np.ones(numberOfTopics) * 0.8
beta = np.ones(numberOfWords) * 0.8

phi = pm.Container([
    pm.CompletedDirichlet(f'phi_{k}',
                          pm.Dirichlet(f'pphi_{k}', beta)
                         ) 
    for k in range(numberOfTopics)
])

theta = pm.Container([
    pm.CompletedDirichlet(f'theta_{m}', 
                          pm.Dirichlet(f'ptheta_{m}', alpha)
                         ) 
    for m in range(numberOfDocs)
])

z = pm.Container([
    pm.Categorical(f'z_{i}', 
                    p = theta[i], 
                    size = len(docs[i]), 
                    value = np.random.randint(numberOfTopics, size = len(docs[i]))
                  )
    for i in range(numberOfDocs)
])

w = pm.Container([
    pm.Categorical(f'w_{i}_{j}', 
                   p = pm.Lambda(f'phi_z_{i}_{j}', 
                                 lambda z = z[i][j], phi = phi: phi[z]), 
                   value = docs[i][j], 
                   observed = True
                  ) 
    for i in range(numberOfDocs) for j in range(len(docs[i]))
])

mcmc = pm.MCMC([theta, phi, z, w])
mcmc.sample(2000, 100)
print()
for i in range(numberOfDocs):
    print(theta.value[i])
for i in range(numberOfDocs):
    print(z.value[i])

 [-----------------100%-----------------] 2000 of 2000 complete in 5.5 sec
[[0.34005926 0.01566262 0.64427812]]
[[0.79988294 0.17094832 0.02916874]]
[[0.12436705 0.15246895 0.723164  ]]
[[0.06557073 0.60525088 0.32917839]]
[[0.73101125 0.09969772 0.16929103]]
[[0.12985148 0.06851493 0.80163358]]
[[0.16440261 0.20812345 0.62747394]]
[[0.09478506 0.87964123 0.0255737 ]]
[[0.97151485 0.02223518 0.00624997]]
[[0.3680524  0.33989084 0.29205676]]
[1 0 2 1 2 1 2 0 0 1 2 1 1 2 1 1 1 0 0 2 2 0 2 0 2 1 1 1 1 0 2 0 0 2 0 2 1
 2 2 0 2 0 1 2 1 0 1 0 0 1 2 0 1 2 0 0 1 1 1 0 1 0 0 0 1 2 0 1 1 0 2 1 0 2
 0 2 2 1 1 0 0 2 2 1 0 0 1 0 2 1 0 1 2 1 1 2 0 1 2 0 1 1 2 2 2 0 2 1 2 2 0
 0 0 1 1 0 0 0 1 1 2 2 1 2 0 2 0 1 1 2 1 0 2 2 2 0 1 1 2 1 2 1 2 1 0 0 1 1
 2 1 0 0 2 2 1 0 0 1 0]
[2 2 0 0 2 1 1 0 1 1 0 0 0 2 2 2 1 0 0 2 2 2 1 2 0 1 0 2 1 2 0 1 0 1 1 0 2
 2 1 1 2 1 0 1 0 2 1 0 0 1 2 1 0 1 0 0 0 1 1 2 0 0 1 1 0 1 0 2 0 2 1 2 2 0
 1 0 1 2 1 0 2 2 1 2 1 2 0 0 0 0 1 0 0 0 1 2 0 2 1 0 2 2 2 1 2 1 0 1 0 0 0
 2 2 0

In [5]:
mcmc.trace('theta_0')[:]

array([[[0.42676748, 0.5210027 , 0.05222982]],

       [[0.42676748, 0.5210027 , 0.05222982]],

       [[0.42676748, 0.5210027 , 0.05222982]],

       ...,

       [[0.42676748, 0.5210027 , 0.05222982]],

       [[0.42676748, 0.5210027 , 0.05222982]],

       [[0.42676748, 0.5210027 , 0.05222982]]])

In [6]:
np.set_printoptions(threshold=np.inf)
for i in range(numberOfTopics):
    print(phi.value[i])

[[1.14942892e-03 1.73687153e-04 1.63362557e-04 8.04730779e-04
  1.05765980e-03 2.30420782e-04 6.14250453e-04 1.25274097e-03
  1.96145585e-04 9.84490679e-04 3.02967840e-04 1.06549858e-03
  5.89093727e-04 3.93257649e-04 1.00551671e-03 3.20750566e-04
  2.87505178e-05 5.18596707e-05 4.12324670e-04 1.11027452e-03
  3.50971081e-04 3.93249357e-05 6.14027991e-04 1.64506778e-04
  1.00012051e-03 2.34797528e-04 2.80375137e-04 7.74961845e-04
  5.10572126e-05 7.94643661e-04 9.27608580e-04 2.16209341e-04
  7.01445258e-05 2.01126898e-05 2.56737049e-05 1.37579757e-04
  1.59187302e-04 3.16438562e-04 4.45941276e-04 4.11934914e-04
  6.04630470e-04 2.48715224e-04 4.15097942e-05 4.61089256e-04
  2.13564352e-04 1.10327912e-03 1.20076504e-04 6.57087052e-04
  2.99862049e-04 3.01296621e-04 3.21544718e-04 4.73649702e-04
  3.15109457e-04 3.59707567e-04 8.62658234e-04 1.61652432e-04
  6.62300044e-04 8.60005549e-04 1.59627506e-04 4.14235815e-04
  1.26931964e-03 1.38945333e-04 3.05572209e-04 2.60964830e-04
  1.1178

In [7]:
for i in range(numberOfDocs):
    print(z.value[i])

[2 1 1 0 0 1 0 2 1 2 2 0 1 2 1 2 1 1 2 1 1 1 0 1 1 0 1 2 0 2 2 1 0 1 0 2 1
 2 2 1 1 1 2 1 0 0 2 1 2 1 2 0 1 0 0 2 2 0 0 2 0 1 1 1 2 1 0 2 2 2 2 2 0 0
 0 1 0 1 1 1 0 0 1 1 2 2 1 1 2 2 0 0 2 1 1 1 2 0 0 0 0 1 0 0 1 1 0 1 0 0 0
 0 1 1 2 0 0 0 2 1 1 2 1 0 2 0 2 0 0 1 0 0 1 2 2 2 0 0 1 1 0 2 1 2 2 2 2 2
 0 1 2 0 0 0 1 0 0 1 1 1 1 0 0 0 2 2 0 1 0 2 0 2 2 0 1 0 0 2 0 2 0 0 2 2 1
 0 1 2 2 0 2 0 1 1 0 0 2 1 2 1 0 0 2 0 1 2 1 1 2 2 0 1 1 1 0 2 2 1 2 1 0 0
 0 1 2 0 1 0 1 2 2 1 2 1 2 1 0 1 2 2 0 2 2 1 1 2 1 0 2 1 2 1 0 1 2 2 0 2 2
 1 1 2 2 0 2 2 0 2 2 2 2 0 0 0 2 2 2 2 1 2 2 1 0 0 2 2 1 2 0 0 2 0 0 1 2 2
 1 2 2 0 0 0 2 1 2 0 0 1 0 0 1 2 2 1 0 1 1 0 1 2 0 0 1 0 0 1 2 2 1 1 0 2 0
 1 0 2 2 2 0 1 1 0 0 0 0 0 2 1 2 1 1 0 0 1 0 1 2 2 0 0 2 2 0 1 1 1 2 2 0 0
 2 0 0 1 1 2 2 1 2 0 2 1 1 0 1 2 0 1 1 0 2 0 0 2 0 1 2 1 0 1 1 1 0 0 2 2 2
 2 2 0 2 1 0 1 1 2 0 1 2 0 2 2 0 0 2 1 2 2 0 2 0 1 1 0 0 0 2 2 1 0 1 0 2 1
 0 0 0 0 0 1 0 1 2 2 2 1 1 0 1 2 0 1 0 1 2 0 1 0 0 0 1 2 1 1 2 2 1 2 2 2 2
 1 1 0 1 0 1 0 1 0 2]
[1 