In [155]:
import pymc as pm
import re
import numpy as np
import pandas as pd

# Compute the numerical (integer-index) representation of the given corpus.
def build_word_representation(corpus, useRegex):
    """Encode every document of ``corpus`` as a list of integer word indices.

    Indices are assigned in order of first appearance, starting at 0, and the
    same vocabulary is shared by all documents.

    Parameters
    ----------
    corpus : iterable
        The documents. Each document is a string when ``useRegex`` is truthy;
        otherwise it is iterated as-is, so it should already be a sequence of
        (hashable) words.
    useRegex : bool
        When truthy, every document is split into words with a regular
        expression; when falsy, the document is used without tokenization.

    Returns
    -------
    results : list of list of int
        One list of word indices per document, in document order.
    words_index : dict
        Mapping from each distinct word to its index.
    """
    # Matches words only, including expressions such as "mustn't".
    word_pattern = re.compile(r"[\w'-]+")

    words_index = {}
    results = []
    for sentence in corpus:
        words = word_pattern.findall(sentence) if useRegex else sentence
        # setdefault returns the existing index, or stores the next free one
        # (the current vocabulary size) the first time a word is seen.
        results.append(
            [words_index.setdefault(word, len(words_index)) for word in words]
        )
    return results, words_index


def build_model(data,K,M,V,val_alpha, val_betha, training, burned):
    """Build the LDA graphical model with PyMC and run the MCMC sampler on it.

    Parameters
    ----------
    data : list of list of int
        Vectorized corpus; ``data[m][i]`` is used as the observed value of the
        i-th word of document m.
    K : int
        Number of topics (length of ``alpha``, number of ``phi_k`` variables).
    M : int
        Number of documents (number of ``theta_m`` and ``z_m`` variables).
    V : int
        Vocabulary size (length of ``beta``).
    val_alpha, val_betha : float
        Scalars used for every entry of the two Dirichlet parameter vectors.
    training, burned : int
        Forwarded positionally to ``mcmc.sample``; presumably the total number
        of iterations and the burn-in length (``iter`` and ``burn`` in the
        PyMC 2 API) -- TODO confirm against the installed PyMC version.

    Returns
    -------
    The ``pm.MCMC`` object after sampling, from which traces can be read.
    """

    alpha = np.ones(K) * val_alpha # hyperparameter of the Dirichlet prior on the topic distribution of a document
    beta = np.ones(V) * val_betha # hyperparameter of the Dirichlet prior on the word distribution of a topic
    N = [len(sent) for sent in data] # number of words in each document

    # PyMC represents Dirichlet variables of length k by the first k−1 elements;
    # since they must sum to 1, the k-th element is determined by the others.
    # CompletedDirichlet appends the k-th element to its parent.
    # theta[m]: topic distribution of document m; phi[k]: word distribution of topic k.
    theta = pm.Container([pm.CompletedDirichlet("theta_%s" % m, pm.Dirichlet("ptheta_%s" % m, theta=alpha)) for m in range(M)])
    phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)])
    # z[m]: one topic assignment per word of document m, initialised uniformly at random in [0, K).
    z = pm.Container([pm.Categorical('z_%i' % m, p = theta[m], size = N[m], value=np.random.randint(K, size=N[m])) for m in range(M)])
    # w: one observed Categorical per word; its probabilities are phi[z], i.e. the word
    # distribution of the topic currently assigned to that word. z[m][i] and phi are bound
    # as lambda default arguments, so each Lambda keeps its own (m, i) pair.
    w = pm.Container([pm.Categorical("w_%i_%i" % (m,i),
                        p = pm.Lambda('phi_z_%i_%i' % (m,i),lambda z=z[m][i], phi=phi:phi[z]),
                        value=data[m][i],
                        verbose = 0,
                        observed=True)
                        for m in range(M) for i in range(N[m])])

    # Assemble all variables into one model and sample from it.
    model = pm.Model([theta, phi, z, w])
    mcmc = pm.MCMC(model)
    mcmc.sample(training, burned)
    return mcmc

# Helper that prints the traced values of the inferred variables; used for testing purposes.
def trace_variables(mcmc, M, K, no_samples):
    """Print the mean over samples of every theta, phi and z trace.

    ``mcmc`` must expose ``trace(name)``; the traces ``theta_0..theta_{M-1}``,
    ``phi_0..phi_{K-1}`` and ``z_0..z_{M-1}`` are read in full and averaged
    along axis 0. The z means are additionally rounded. ``no_samples`` is
    accepted but not used. Nothing is returned.
    """
    # (heading, trace-name template, number of traces, post-processing of the mean)
    sections = (
        ("Theta's:", 'theta_%s', M, None),
        ("Phi's:", 'phi_%s', K, None),
        ("Z's:", 'z_%s', M, np.round),
    )

    print()
    for heading, name_template, count, postprocess in sections:
        print()
        print(heading)
        for idx in range(count):
            averaged = mcmc.trace(name_template % idx)[:].mean(axis=0)
            if postprocess is not None:
                averaged = postprocess(averaged)
            print(averaged)


def build_most_common_words_in_topics(mcmc, M, K, words, samples, N):
    """Print, for each of the K topics, the N words with the highest mean phi.

    ``words`` maps word -> index. For topic i the trace ``phi_i`` is averaged
    over axis 0, flattened to ``len(words)`` entries and sorted in descending
    order; the first N indices are translated back to words. ``M`` and
    ``samples`` are accepted but not used. Nothing is returned.
    """
    # Invert the (word -> index) mapping; safe because indices are unique.
    index_to_word = {idx: word for word, idx in words.items()}
    vocabulary_size = len(words)

    print()
    print()

    for topic in range(K):
        mean_phi = mcmc.trace('phi_%s' % topic)[:].mean(axis=0)
        mean_phi = mean_phi.reshape(vocabulary_size)
        # argsort of the negated values yields indices in descending order
        top_indexes = (-mean_phi).argsort()[:N]
        most_common_words = []
        for word_index in top_indexes:
            most_common_words.append(index_to_word.get(word_index))
        print("Topic %s most common %s words:" % (topic, N), most_common_words)

def build_dataframe(mcmc, M, K, no_samples):
    """Return an (M x K) DataFrame holding one theta sample per document.

    For every document i the sample at index ``no_samples - 1`` of the trace
    ``theta_i`` is taken as the row; columns are labelled with LaTeX theta
    symbols, one per topic, and the index is named 'Document'.

    Side effect: sets the global pandas float display format to three
    decimals.
    """
    sample_index = no_samples - 1
    rows = [mcmc.trace('theta_%s' % doc)[:][sample_index] for doc in range(M)]
    headers = ['$\\boldsymbol\\theta_{m,%s}$' % topic for topic in range(K)]

    pd.options.display.float_format = '{:,.3f}'.format

    # Each sample may carry an extra leading axis; flatten to one row per document.
    table = np.array(rows).reshape(M, K)
    frame = pd.DataFrame(table, columns=headers)
    frame.index.name = 'Document'
    return frame

<h1> Task 1 </h1>

In [185]:
# Task 1: run the model on a toy corpus of five short English sentences (one document each).
corpus = [
    "I had a peanuts butter sandwich for breakfast.",
    "I like to eat almonds, peanuts and walnuts.",
    "My neighbor got a little dog yesterday.",
    "Cats and dogs are mortal enemies.",
    "You mustn't feed peanuts to your dog."
]

# The documents are raw strings, so regex tokenization is enabled (second argument True).
corpus_vectorized,words_dict = build_word_representation(corpus, True)
K = 2 # number of topics
M = 5 # number of documents in corpus
alpha = 1 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 1 # value of every entry of the Dirichlet hyperparameter for the word distributions
V = len(words_dict) # vocabulary size
training = 5000 # forwarded by build_model to mcmc.sample as its first argument
burned = 500 # forwarded by build_model to mcmc.sample as its second argument
MCW = 4 # number of most common words printed per topic
mcmc = build_model(corpus_vectorized, K, M, V,alpha,beta,training, burned)
# training - burned is passed on as the sample count.
trace_variables(mcmc,M,K,training - burned)
build_most_common_words_in_topics(mcmc,M,K,words_dict,training - burned, MCW)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 5000 of 5000 complete in 2.6 sec

Theta's:
[[0.28804672 0.71195328]]
[[0.11875395 0.88124605]]
[[0.5005846 0.4994154]]
[[0.52711057 0.47288943]]
[[0.3003751 0.6996249]]

Phi's:
[[0.00742165 0.10671785 0.01273074 0.04146544 0.01848661 0.02634262
  0.0509882  0.11119579 0.01034079 0.02563058 0.0325814  0.03785641
  0.02821568 0.02383998 0.08555305 0.029265   0.02668905 0.03950603
  0.02566035 0.01030017 0.01577824 0.02250023 0.00182578 0.0277533
  0.01032082 0.0169013  0.06937903 0.03075675 0.05399716]]
[[0.02144694 0.00654613 0.02792847 0.00660188 0.04882269 0.01936149
  0.06578588 0.01803779 0.02842915 0.01317874 0.00076382 0.02827857
  0.01631396 0.02930387 0.03211798 0.04492087 0.10146567 0.06840131
  0.20702366 0.07034498 0.00398669 0.01327285 0.01961246 0.01607834
  0.01021376 0.00321422 0.01082388 0.02925325 0.03847072]]

Z's:
[1. 0. 1. 0. 1. 1. 1. 0.]
[1. 1. 1. 0. 1. 1. 1. 1.]
[0. 1. 1. 1. 1. 1. 1.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 1. 0. 1. 1. 

Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.288,0.712
1,0.119,0.881
2,0.501,0.499
3,0.527,0.473
4,0.3,0.7


<p> The results show that the model does not converge to the expected results. As can be observed, words such as prepositions or conjunctions are classified as core words of topic 1, which means that they have a strong negative impact on the current model.</p>

<h1> Sanity Check </h1>

In [219]:
# Sanity check: six pre-tokenized documents built from two disjoint vocabularies
# ({"aaa", "bbb"} and {"uuu", "vvv"}).
docs = [["aaa", "bbb", "aaa"],
        ["bbb", "aaa", "bbb"],
        ["aaa", "bbb", "bbb", "aaa"],
        ["uuu", "vvv"],
        ["uuu", "vvv", "vvv"],
        ["uuu", "vvv", "vvv", "uuu"]]

# The documents are already lists of words, so regex tokenization is disabled (False).
docs_vectorized, words_dict = build_word_representation(docs, False)
K = 2 # number of topics
M = 6 # number of documents in docs
V = len(words_dict) # vocabulary size
alpha = 0.5 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 0.5 # value of every entry of the Dirichlet hyperparameter for the word distributions
training = 5000 # forwarded by build_model to mcmc.sample as its first argument
burned = 1000 # forwarded by build_model to mcmc.sample as its second argument
mcmc = build_model(docs_vectorized, K, M, V,alpha,beta, training, burned)
trace_variables(mcmc,M,K,training - burned)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 5000 of 5000 complete in 3.3 sec

Theta's:
[[0.35130225 0.64869775]]
[[0.24208306 0.75791694]]
[[0.14801208 0.85198792]]
[[0.8788258 0.1211742]]
[[0.78066714 0.21933286]]
[[0.58403155 0.41596845]]

Phi's:
[[0.0107335  0.00147989 0.54574716 0.44203945]]
[[0.49698404 0.33191988 0.17055093 0.00054515]]

Z's:
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1. 1.]
[0. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]


Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.351,0.649
1,0.242,0.758
2,0.148,0.852
3,0.879,0.121
4,0.781,0.219
5,0.584,0.416


<p>From the displayed results, it can be observed that this time the model converges to the desired values. This is caused not only by the absence of prepositions and conjunctions, but also by the lack of correlation between words, since the corpus used does not make any sense in English.</p>

<h1> Test on Abc News </h1>

<p> For testing the model in real-life scenarios, I used the "A million News Headlines" dataset from Kaggle (https://www.kaggle.com/therohk/million-headlines) as the supporting corpus. The dataset contains over 100 000 samples of news headlines published over a period of fifteen years by the Australian Broadcasting Corp. In the following experiments I am going to use only 200 samples from it due to hardware limitations. </p>

<p> As previously discussed, prepositions, conjunctions and any other words which cannot be naturally included in a topic affect the performance of the LDA model. To remove the majority of these words, I consulted the list of stopwords from the nltk package, which is composed of 179 such words.</p>

In [246]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Load the first 200 headlines of the ABC news dataset.
data = pd.read_csv("abcnews-date-text.csv")["headline_text"][:200].tolist()
# English stopword list from nltk, as a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Tokenize every headline, drop the stopwords and rebuild it as a
# space-separated string.
processed_sentences = [
    ' '.join(token for token in word_tokenize(sentence) if token not in stop_words)
    for sentence in data
]
print(len(stop_words))
print(stop_words)
# Bare expression: the notebook shows the first 20 processed headlines.
processed_sentences[:20]

179
{'were', 'they', "didn't", "mustn't", 'didn', 'we', 'itself', 'herself', 'against', 'be', 'during', 'than', 'up', "you'd", 'not', 'here', 'down', "she's", "needn't", 'then', 'll', 'needn', 'ma', 'whom', 'most', "that'll", 'both', 'such', "mightn't", 'she', 'nor', 'that', 'but', 'ours', 'into', 's', 'wasn', 'was', 'hasn', "you'll", 'an', 'him', 'weren', 'hers', 'his', 'this', 'them', 'ourselves', 'too', "aren't", 'has', 'only', 'same', 'having', 'below', 'm', 'when', 'some', "weren't", 'mightn', 'yours', "haven't", 'at', 'through', 'and', 'if', "you've", 've', 'been', 'does', 'doesn', 'once', 'more', 'your', 'with', 'its', 'have', 'few', 'the', 'above', 'he', 'now', 'other', 'how', 'couldn', 'had', 'under', 'are', 'because', 'myself', "shan't", 'own', 'very', 'shouldn', 'those', 'their', 'these', 'off', "doesn't", "couldn't", 'aren', 'so', 'being', 'd', 't', 'who', 'as', 'in', "won't", 'about', 'no', 'between', 'a', 'doing', 'wouldn', 'themselves', "you're", 'there', 'y', 'which', '

['aba decides community broadcasting licence',
 'act fire witnesses must aware defamation',
 'g calls infrastructure protection summit',
 'air nz staff aust strike pay rise',
 'air nz strike affect australian travellers',
 'ambitious olsson wins triple jump',
 'antic delighted record breaking barca',
 'aussie qualifier stosur wastes four memphis match',
 'aust addresses un security council iraq',
 'australia locked war timetable opp',
 'australia contribute 10 million aid iraq',
 'barca take record robson celebrates birthday',
 'bathhouse plans move ahead',
 'big hopes launceston cycling championship',
 'big plan boost paroo water supplies',
 'blizzard buries united states bills',
 'brigadier dismisses reports troops harassed',
 'british combat troops arriving daily kuwait',
 'bryant leads lakers double overtime win',
 'bushfire victims urged see centrelink']

In [240]:
# Experiment 1, K = 15
# The processed headlines are strings, so regex tokenization is enabled (True).
docs_vectorized, words_dict = build_word_representation(processed_sentences, True)
K = 15 # number of topics
M = len(processed_sentences) # number of documents
V = len(words_dict) # vocabulary size
training = 10000 # forwarded by build_model to mcmc.sample as its first argument
burned = 1000 # forwarded by build_model to mcmc.sample as its second argument
alpha = 1 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 1 # value of every entry of the Dirichlet hyperparameter for the word distributions
MCW = 5 # number of most common words printed per topic
mcmc = build_model(docs_vectorized, K, M, V, alpha, beta, training, burned)
build_most_common_words_in_topics(mcmc,M,K,words_dict,training - burned,MCW)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 10000 of 10000 complete in 457.5 sec

Topic 0 most common 5 words: ['abs', 'landfill', 'month', 'effects', 'warriors']
Topic 1 most common 5 words: ['peace', 'critics', 'attend', 'jump', 'tick']
Topic 2 most common 5 words: ['freak', 'girl', 'loan', 'asylum', 'respite']
Topic 3 most common 5 words: ['qr', 'student', 'disgusted', 'tie', 'threat']
Topic 4 most common 5 words: ['water', '50m', 'australia', 'farmers', 'entry']
Topic 5 most common 5 words: ['forest', 'control', 'backs', 'harassment', 'shortly']
Topic 6 most common 5 words: ['crews', 'shines', 'forest', 'overseas', 'iraqi']
Topic 7 most common 5 words: ['program', 'ventures', 'ruins', 'river', 'plans']
Topic 8 most common 5 words: ['security', 'defeat', 'cycling', 'angry', 'fans']
Topic 9 most common 5 words: ['rise', 'buries', 'gm', 'drinking', 'greens']
Topic 10 most common 5 words: ['10', 'drink', 'issues', 'gas', 'domestic']
Topic 11 most common 5 words: ['surge', 'survey', 'miss

Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$","$\boldsymbol\theta_{m,2}$","$\boldsymbol\theta_{m,3}$","$\boldsymbol\theta_{m,4}$","$\boldsymbol\theta_{m,5}$","$\boldsymbol\theta_{m,6}$","$\boldsymbol\theta_{m,7}$","$\boldsymbol\theta_{m,8}$","$\boldsymbol\theta_{m,9}$","$\boldsymbol\theta_{m,10}$","$\boldsymbol\theta_{m,11}$","$\boldsymbol\theta_{m,12}$","$\boldsymbol\theta_{m,13}$","$\boldsymbol\theta_{m,14}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.162,0.065,0.011,0.111,0.007,0.066,0.016,0.100,0.035,0.053,0.111,0.106,0.023,0.027,0.106
1,0.020,0.142,0.056,0.012,0.002,0.002,0.017,0.019,0.010,0.013,0.026,0.333,0.040,0.053,0.256
2,0.017,0.130,0.060,0.053,0.039,0.054,0.013,0.010,0.201,0.162,0.057,0.068,0.018,0.027,0.088
3,0.031,0.107,0.024,0.109,0.038,0.011,0.049,0.053,0.056,0.025,0.069,0.065,0.171,0.015,0.175
4,0.010,0.066,0.014,0.140,0.001,0.109,0.015,0.172,0.054,0.030,0.160,0.114,0.002,0.096,0.016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.018,0.150,0.042,0.238,0.022,0.007,0.011,0.058,0.050,0.004,0.155,0.137,0.073,0.017,0.019
196,0.043,0.016,0.018,0.183,0.030,0.007,0.017,0.078,0.007,0.067,0.042,0.076,0.136,0.209,0.072
197,0.164,0.022,0.039,0.019,0.086,0.039,0.000,0.063,0.135,0.058,0.020,0.180,0.038,0.097,0.040
198,0.068,0.022,0.151,0.008,0.059,0.039,0.021,0.179,0.013,0.003,0.064,0.074,0.197,0.071,0.030


In [241]:
# Experiment 2, K = 12
# The processed headlines are strings, so regex tokenization is enabled (True).
docs_vectorized, words_dict = build_word_representation(processed_sentences, True)
K = 12 # number of topics
M = len(processed_sentences) # number of documents
V = len(words_dict) # vocabulary size
training = 10000 # forwarded by build_model to mcmc.sample as its first argument
burned = 1000 # forwarded by build_model to mcmc.sample as its second argument
alpha = 1 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 1 # value of every entry of the Dirichlet hyperparameter for the word distributions
MCW = 5 # number of most common words printed per topic
mcmc = build_model(docs_vectorized, K, M, V, alpha, beta, training, burned)
build_most_common_words_in_topics(mcmc,M,K,words_dict,training - burned,MCW)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 10001 of 10000 complete in 411.1 sec

Topic 0 most common 5 words: ['birthday', 'profit', 'bowling', 'nca', 'witnesses']
Topic 1 most common 5 words: ['conduct', 'new', 'rabbit', 'push', 'gets']
Topic 2 most common 5 words: ['warns', 'aba', 'police', 'go', 'art']
Topic 3 most common 5 words: ['relief', 'venezuela', 'growth', 'stay', 'witnesses']
Topic 4 most common 5 words: ['offers', 'driver', 'hopes', 'freedom', 'incursion']
Topic 5 most common 5 words: ['inspect', 'control', 'brigadier', 'line', 'abattoir']
Topic 6 most common 5 words: ['jury', 'bryant', 'urged', 'regulator', 'arsenal']
Topic 7 most common 5 words: ['mine', 'disgusted', 'second', '2500', 'contain']
Topic 8 most common 5 words: ['meet', 'sterrey', 'prepare', 'buenos', 'robson']
Topic 9 most common 5 words: ['gladstone', 'levy', 'crews', 'miss', 'triple']
Topic 10 most common 5 words: ['welcome', 'peace', 'code', 'qualifier', 'knock']
Topic 11 most common 5 words: ['deposits',

Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$","$\boldsymbol\theta_{m,2}$","$\boldsymbol\theta_{m,3}$","$\boldsymbol\theta_{m,4}$","$\boldsymbol\theta_{m,5}$","$\boldsymbol\theta_{m,6}$","$\boldsymbol\theta_{m,7}$","$\boldsymbol\theta_{m,8}$","$\boldsymbol\theta_{m,9}$","$\boldsymbol\theta_{m,10}$","$\boldsymbol\theta_{m,11}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.065,0.013,0.006,0.010,0.132,0.234,0.087,0.086,0.096,0.010,0.011,0.248
1,0.110,0.021,0.089,0.015,0.020,0.009,0.011,0.227,0.045,0.221,0.150,0.083
2,0.033,0.015,0.053,0.126,0.056,0.058,0.004,0.051,0.002,0.160,0.187,0.255
3,0.035,0.174,0.048,0.023,0.032,0.096,0.108,0.033,0.018,0.065,0.192,0.177
4,0.000,0.166,0.063,0.073,0.193,0.064,0.065,0.034,0.064,0.141,0.053,0.084
...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.018,0.171,0.048,0.053,0.017,0.020,0.036,0.010,0.410,0.184,0.024,0.008
196,0.058,0.150,0.020,0.041,0.047,0.003,0.080,0.038,0.044,0.181,0.176,0.163
197,0.035,0.005,0.035,0.085,0.061,0.186,0.086,0.133,0.100,0.152,0.003,0.118
198,0.190,0.048,0.006,0.015,0.122,0.233,0.125,0.022,0.076,0.137,0.011,0.015


In [242]:
# Experiment 3, K = 18
# The processed headlines are strings, so regex tokenization is enabled (True).
docs_vectorized, words_dict = build_word_representation(processed_sentences, True)
K = 18 # number of topics
M = len(processed_sentences) # number of documents
V = len(words_dict) # vocabulary size
training = 10000 # forwarded by build_model to mcmc.sample as its first argument
burned = 1000 # forwarded by build_model to mcmc.sample as its second argument
alpha = 1 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 1 # value of every entry of the Dirichlet hyperparameter for the word distributions
MCW = 5 # number of most common words printed per topic
mcmc = build_model(docs_vectorized, K, M, V, alpha, beta, training, burned)
build_most_common_words_in_topics(mcmc,M,K,words_dict,training - burned,MCW)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 10000 of 10000 complete in 454.3 sec

Topic 0 most common 5 words: ['continue', 'wollongong', 'memphis', 'potential', 'politics']
Topic 1 most common 5 words: ['massive', 'western', 'key', 'seat', 'shut']
Topic 2 most common 5 words: ['wine', 'changes', 'calling', 'frustrate', 'impact']
Topic 3 most common 5 words: ['house', 'united', 'nth', 'members', 'restrictions']
Topic 4 most common 5 words: ['bilby', 'protection', 'came', 'put', 'thousands']
Topic 5 most common 5 words: ['profit', 'colleague', 'deposits', 'air', 'nth']
Topic 6 most common 5 words: ['brother', 'announced', 'threat', 'staff', 'raid']
Topic 7 most common 5 words: ['men', 'division', 'continued', 'birthday', 'govt']
Topic 8 most common 5 words: ['lift', 'passengers', 'ambitious', 'project', 'onesteel']
Topic 9 most common 5 words: ['osullivan', 'resolution', 'avenges', 'urged', '11']
Topic 10 most common 5 words: ['vegetation', 'al', 'west', 'publics', 'addresses']
Topic 11 m

Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$","$\boldsymbol\theta_{m,2}$","$\boldsymbol\theta_{m,3}$","$\boldsymbol\theta_{m,4}$","$\boldsymbol\theta_{m,5}$","$\boldsymbol\theta_{m,6}$","$\boldsymbol\theta_{m,7}$","$\boldsymbol\theta_{m,8}$","$\boldsymbol\theta_{m,9}$","$\boldsymbol\theta_{m,10}$","$\boldsymbol\theta_{m,11}$","$\boldsymbol\theta_{m,12}$","$\boldsymbol\theta_{m,13}$","$\boldsymbol\theta_{m,14}$","$\boldsymbol\theta_{m,15}$","$\boldsymbol\theta_{m,16}$","$\boldsymbol\theta_{m,17}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.105,0.000,0.048,0.002,0.014,0.043,0.028,0.049,0.007,0.000,0.099,0.177,0.054,0.001,0.021,0.176,0.065,0.111
1,0.076,0.003,0.099,0.274,0.019,0.018,0.076,0.014,0.075,0.064,0.046,0.003,0.023,0.000,0.101,0.012,0.001,0.097
2,0.117,0.010,0.017,0.036,0.119,0.044,0.002,0.069,0.061,0.149,0.066,0.027,0.160,0.082,0.002,0.019,0.004,0.018
3,0.002,0.001,0.000,0.011,0.013,0.180,0.023,0.061,0.012,0.008,0.058,0.104,0.048,0.142,0.085,0.020,0.083,0.149
4,0.047,0.005,0.043,0.048,0.054,0.133,0.148,0.094,0.052,0.092,0.034,0.045,0.005,0.035,0.083,0.025,0.042,0.015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.157,0.104,0.128,0.034,0.081,0.063,0.022,0.056,0.032,0.019,0.003,0.009,0.054,0.038,0.008,0.037,0.147,0.008
196,0.012,0.040,0.004,0.017,0.206,0.204,0.061,0.031,0.089,0.022,0.005,0.007,0.043,0.046,0.004,0.081,0.118,0.012
197,0.055,0.040,0.279,0.037,0.006,0.099,0.075,0.010,0.067,0.020,0.130,0.008,0.014,0.010,0.031,0.083,0.035,0.001
198,0.002,0.013,0.022,0.046,0.005,0.035,0.131,0.049,0.003,0.155,0.000,0.113,0.005,0.100,0.201,0.019,0.007,0.093


In [247]:
# Experiment 4, K = 8
# The processed headlines are strings, so regex tokenization is enabled (True).
docs_vectorized, words_dict = build_word_representation(processed_sentences, True)
K = 8 # number of topics
M = len(processed_sentences) # number of documents
V = len(words_dict) # vocabulary size
training = 10000 # forwarded by build_model to mcmc.sample as its first argument
burned = 1000 # forwarded by build_model to mcmc.sample as its second argument
alpha = 1 # value of every entry of the Dirichlet hyperparameter for the topic distributions
beta = 1 # value of every entry of the Dirichlet hyperparameter for the word distributions
MCW = 5 # number of most common words printed per topic
mcmc = build_model(docs_vectorized, K, M, V, alpha, beta, training, burned)
build_most_common_words_in_topics(mcmc,M,K,words_dict,training - burned,MCW)
df = build_dataframe(mcmc,M,K, training - burned)
# Bare expression: the notebook renders the DataFrame as the cell output.
df

 [-----------------100%-----------------] 10000 of 10000 complete in 358.3 sec

Topic 0 most common 5 words: ['un', 'international', 'court', 'freedom', 'wins']
Topic 1 most common 5 words: ['belittling', 'wins', 'leaves', 'ventures', 'eases']
Topic 2 most common 5 words: ['afl', 'sacking', 'control', 'stand', 'wheatbelt']
Topic 3 most common 5 words: ['clean', 'council', 'bushfire', 'international', 'vegetable']
Topic 4 most common 5 words: ['hacker', 'white', 'tasmanian', 'interest', 'necessary']
Topic 5 most common 5 words: ['buenos', 'buries', 'disgusted', 'national', 'mauls']
Topic 6 most common 5 words: ['match', 'number', 'north', 'local', 'push']
Topic 7 most common 5 words: ['honoured', 'crean', 'qualifier', 'drinking', 'awards']


Unnamed: 0_level_0,"$\boldsymbol\theta_{m,0}$","$\boldsymbol\theta_{m,1}$","$\boldsymbol\theta_{m,2}$","$\boldsymbol\theta_{m,3}$","$\boldsymbol\theta_{m,4}$","$\boldsymbol\theta_{m,5}$","$\boldsymbol\theta_{m,6}$","$\boldsymbol\theta_{m,7}$"
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.124,0.203,0.147,0.147,0.155,0.027,0.152,0.045
1,0.124,0.062,0.239,0.276,0.004,0.076,0.203,0.016
2,0.021,0.144,0.150,0.099,0.107,0.336,0.030,0.113
3,0.014,0.130,0.064,0.173,0.202,0.086,0.326,0.006
4,0.004,0.051,0.127,0.389,0.010,0.092,0.140,0.187
...,...,...,...,...,...,...,...,...
195,0.134,0.018,0.051,0.144,0.081,0.061,0.034,0.479
196,0.394,0.060,0.044,0.032,0.141,0.144,0.139,0.046
197,0.091,0.180,0.071,0.173,0.097,0.015,0.050,0.323
198,0.018,0.073,0.140,0.051,0.116,0.251,0.306,0.046


<p>Since the number of topics K is not known before inference and the structure of the news headlines is not that well-defined, I conducted four different experiments to find the best value for the hyperparameter K. Even though almost every headline could be assigned to one of the following topics: sport, politics, defense, health, history, animals, I opted to test larger values of K to see whether the model can infer more specific topics. </p>
<p> In the case of K = 8 it can be observed that the model captures some semantic sense for a few of the topics. For example, in topic 0 the 5 most common words describing the topic are ['un', 'international', 'court', 'freedom', 'wins'], which are somewhat correlated. The same behaviour can be noticed in the case where K = 12 (Topic 0 most common 5 words: ['birthday', 'profit', 'bowling', 'nca', 'witnesses']).</p>
<p> From the results, it can be observed that for large values of K (15 or 18) the model does not converge to the expected results anymore. Words from separate topics can be merged into one general topic, as in the case of Topics 13 and 14 for K = 15. When K is 18, the model fails the most, being incapable of distinguishing between such a large number of topics considering that only 200 headlines are used as a corpus. </p>

<h2> Conclusion </h2>

<p> In conclusion, the current state of the model does not provide a good separation between different topics, and more experiments could be conducted to improve its performance. Increasing the amount of training data is a reliable way to obtain a more rigorous analysis of the model's performance. Also, trying more values for the hyperparameters alpha and beta could represent a new path to explore. </p>

<h1> Extras </h1>

<h2> 1) Can the topic model be used to define a topic-based similarity measure between documents? </h2>

<p> A topic-based similarity measure between documents can be modelled using the <b>JS</b> (Jensen-Shannon) distance, which is a symmetric adaptation of the <b>KL</b> (Kullback-Leibler) distance used for measuring the difference between two probability distributions. Because the topic distribution of every document is already calculated, it is easy to describe the topic-based similarity between two topic distributions <b> p </b> and <b> q</b> as follows:</p>
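<p> \begin{align}
 D_{KL}(p,q) & = \sum_{i=1}^K p_i \ln\frac{p_i}{q_i} \\
 D_{JS}(p,q) & = \frac{1}{2}\left[D_{KL}\left(p,\frac{p+q}{2}\right) + D_{KL}\left(q,\frac{p+q}{2}\right)\right]
\end{align}
where K is the number of topics, i.e. the length of the topic distributions p and q.
</p>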

<h2> 2) What about a new document? How can topics be assigned to it? </h2>

<p> One way to compute the topic distribution for a new document is to add the new document to the existing corpus and to run the inference on the whole dataset again. Unfortunately, this method has a high computational cost, especially for large datasets, and it could become infeasible. </p>
<p> Another solution could be to make use of the already calculated word distribution in a topic and topic distribution in a document. For each word w in the new document, we assign it to the topic t in which w has the highest probability of appearing. Based on these assignments we can compute the topic distribution for the new document as follows: for each topic t, the probability that t is a topic of the new document is equal to the number of words assigned to topic t divided by the total number of words in the document.</p>