# Grid-search LDA models over the topic counts range(start, limit, step), computing a c_v coherence score for each model.

# In [1]:
def ldamodel(dictionary, corpus, texts, start, limit, step, doc_id):
    """
    Grid-search LDA over a range of topic counts and export the best model.

    One LDA model is trained for every value in ``range(start, limit, step)``
    and scored with c_v coherence. The model with the highest coherence is
    then used to write the document-topic table, the word-topic table and
    the pyLDAvis dashboard.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    start : First number of topics to try
    limit : Upper bound (exclusive) on the number of topics
    step : Increment between successive numbers of topics
    doc_id : Document identifiers, one per document in ``corpus``; must
        provide ``.to_numpy()`` (presumably a pandas Series - confirm
        against the caller)

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    doc_topic_matrix : DataFrame with a leading 'pubmedID' column followed by one column per topic of the best model
    corpus : The ``corpus`` argument, returned unchanged
    dictionary : The ``dictionary`` argument, returned unchanged

    Raises:
    ------
    ValueError : If ``range(start, limit, step)`` is empty or ``step`` is 0

    Side effects:
    ------------
    Creates a directory ``Models_<10 random characters>`` in the current
    working directory containing every trained model, ``document-topic.csv``,
    ``word-topic.csv``, ``Coherence.png`` and ``dashboard_LDA_Tfidf.html``.
    """
    # Fail fast, before loading heavy dependencies or creating a directory:
    # an empty grid would otherwise only surface later inside np.argmax.
    # range() itself raises ValueError when step is 0.
    topic_counts = range(start, limit, step)
    if not topic_counts:
        raise ValueError(
            "range(start, limit, step) is empty; no LDA model would be trained"
        )

    import os
    import random
    import string

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import pyLDAvis.gensim
    from gensim.models.coherencemodel import CoherenceModel
    from gensim.models.ldamodel import LdaModel

    # Plain-Python spelling of the notebook magic ``%matplotlib inline``;
    # get_ipython only exists when running under IPython/Jupyter.
    try:
        get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
    except NameError:
        pass

    # Random suffix so that repeated runs do not collide on the folder name.
    alphabet = string.ascii_letters + string.digits
    suffix = ''.join(random.choice(alphabet) for _ in range(10))
    directory = 'Models_' + suffix
    os.mkdir(directory)

    coherence_values = []
    model_list = []

    # Running a for loop as a grid search for the LDA models.
    for num_topics in topic_counts:
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
        # Saving all created models in a folder if needed later.
        modelpath = "model_" + str(num_topics) + "_topics"
        model.save(os.path.join(directory, modelpath))

    # Every export below describes the model with the highest coherence.
    best_model = model_list[int(np.argmax(coherence_values))]

    # Creating document-topic probability csv.
    # NOTE(review): model[bow] appears to omit topics below the model's
    # minimum probability, which would leave NaN cells here - confirm.
    mixture = [dict(best_model[bow]) for bow in corpus]
    doc_topic_matrix = pd.DataFrame(mixture)
    # Topic columns keep their integer labels: the previous
    # ``columns.astype(str)`` call discarded its result, so callers have
    # always received integer topic ids.
    doc_topic_matrix.insert(0, 'pubmedID', doc_id.to_numpy())
    doc_topic_matrix.to_csv(
        os.path.join(directory, "document-topic.csv"), index=False
    )

    # Creating word-topic probability csv: normalise each topic's row so
    # that it sums to 1.
    topics_terms = best_model.state.get_lambda()
    topics_terms_matrix = topics_terms / topics_terms.sum(axis=1, keepdims=True)
    words = [best_model.id2word[i] for i in range(topics_terms_matrix.shape[1])]
    pd.DataFrame(topics_terms_matrix, columns=words).to_csv(
        os.path.join(directory, "word-topic.csv")
    )

    # Creating the coherence score plot on its own figure so repeated calls
    # do not draw on top of an earlier plot.
    fig, ax = plt.subplots()
    ax.plot(topic_counts, coherence_values)
    ax.set_xlabel("Num Topics")
    ax.set_ylabel("Coherence score")
    # The label must be a 1-tuple; a bare string is consumed character by
    # character and would label the line "c".
    ax.legend(("coherence_values",), loc='best')
    fig.savefig(os.path.join(directory, 'Coherence.png'))

    # Creating LDA visual
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(best_model, corpus, dictionary)
    pyLDAvis.save_html(vis, os.path.join(directory, 'dashboard_LDA_Tfidf.html'))

    return model_list, coherence_values, doc_topic_matrix, corpus, dictionary