# Code 36

In [None]:
from gensim.models import LdaMulticore , TfidfModel
from gensim.corpora import Dictionary
import multiprocessing

# Code 37
This code assumes that `instances` is a collection of tokenized documents, i.e. each document is a list of string tokens.

In [None]:
# Build the vocabulary from the documents, then prune it: keep only tokens
# that occur in at least 100 documents and in no more than 10% of them.
dictionary = Dictionary(instances)
dictionary.filter_extremes(no_below=100, no_above=0.1)

# Bag-of-words representation of every document, based on the pruned vocabulary.
ldacorpus = list(map(dictionary.doc2bow, instances))

# Fit TF-IDF on the bag-of-words corpus and re-weight that same corpus with it.
tfidfmodel = TfidfModel(ldacorpus)
model_corpus = tfidfmodel[ldacorpus]

# Code 38

In [None]:
num_passes = 10
num_topics = 20

# Find a chunk size that gives about 200 updates over all passes.
# gensim documents `chunksize` as an integer number of documents, so use floor
# division, and never go below 1: for a corpus with fewer than 20 documents
# the quotient would otherwise be 0.
chunk_size = max(1, len(model_corpus) * num_passes // 200)

# Use 10 workers or (number of cores - 1), whichever is smaller, but always
# at least one: cpu_count() - 1 is 0 on a single-core machine.
num_workers = max(1, min(10, multiprocessing.cpu_count() - 1))

model = LdaMulticore(model_corpus,  # corpus from above
                     id2word=dictionary,  # dictionary from above
                     num_topics=num_topics,
                     workers=num_workers,
                     passes=num_passes,
                     chunksize=chunk_size)

# Code 39

In [None]:
# Apply the trained model to the TF-IDF corpus. In gensim, indexing an LDA
# model with a corpus gives the topic distribution of each document.
topic_corpus = model[model_corpus]

# Code 40

In [None]:
# Inspect the learned topics, using gensim's default arguments for how many
# topics and how many words per topic to report.
model.print_topics()

# Code 41

In [None]:
import re

# Matches the weight prefix in front of each word of a formatted topic,
# e.g. the `0.012*` in `0.012*"word"`.
topic_sep = re.compile(r"0\.[0-9]{3}\*")

# One (topic number, [top words]) pair per topic; the words keep their quotes.
model_topics = []
for topic_no, model_topic in model.print_topics(num_topics=num_topics, num_words=5):
    top_words = topic_sep.sub('', model_topic).split(' + ')
    model_topics.append((topic_no, top_words))

# Print the topics with 1-based numbering.
for topic_no, top_words in model_topics:
    print(topic_no + 1, ", ".join(top_words[:5]))

# Short label per topic: its two top words, with the quotes removed.
descriptors = [
    ", ".join(top_words[:2]).replace('"', '')
    for _, top_words in model_topics
]

# Code 42

In [None]:
from gensim.models import CoherenceModel

# Train one LDA model per candidate number of topics (5 through 20) and
# record its UMass and C_v coherence scores.
# NOTE: the loop rebinds the module-level names `model` and `num_topics`.
model_list = []
coherence_values = []
for num_topics in range(5, 21):
    print(num_topics)
    model = LdaMulticore(corpus=sample, id2word=dictionary, num_topics=num_topics)
    model_list.append(model)

    # Both coherence measures are computed for the same model, texts and dictionary.
    shared_args = dict(model=model, texts=test_sample, dictionary=dictionary)
    coherencemodel_umass = CoherenceModel(coherence='u_mass', **shared_args)
    coherencemodel_cv = CoherenceModel(coherence='c_v', **shared_args)

    umass_score = coherencemodel_umass.get_coherence()
    cv_score = coherencemodel_cv.get_coherence()
    coherence_values.append((num_topics, umass_score, cv_score))

# Code 43

In [None]:
from collections import defaultdict

# Map each country (used as the "author") to the positions of its documents
# in `df.country`.
author2doc = defaultdict(list)
for doc_position, country_name in enumerate(df.country):
    author2doc[country_name].append(doc_position)

# Code 44

In [None]:
from gensim.models import AuthorTopicModel
from gensim.test.utils import temporary_file

# Train the author-topic model with the corpus serialized to a temporary
# file; the path is only provided inside the `with` block.
with temporary_file("serialized") as corpus_path:
    author_model = AuthorTopicModel(
        corpus=model_corpus,
        num_topics=9,
        id2word=dictionary,
        author2doc=author2doc,
        alpha=0.5,
        serialized=True,
        serialization_path=corpus_path,
    )

    # Further update of the model with the corpus and author mapping.
    author_model.update(corpus=model_corpus, author2doc=author2doc)