<a href="https://colab.research.google.com/github/drob-xx/TopicModelTuning/blob/main/TopicModelTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains code that parallels the article *Using Metrics to Determine The Right LDA Topic Model Size*. Users can run the notebook to re-create, step by step, the procedures described in the article.

To run the code presented here, follow this outline (details in the cells below):

1.   Download two csv files from the GitHub repository into a directory accessible to the notebook.
1.   Download the text DB csv file from Kaggle.
1.   Assign the global directory value to the location of the above files.
1. Install the required packages.
1. Execute the imports.
1. Run the cells containing Python function definitions used in the notebook.
1. Generate the six models used in the evaluation. This should take about 15 minutes on a standard Google Colab account. You can save the models for later use if desired.
1. Run the evaluation code.


### Download CSV Files
There are three csv files that are needed to run this notebook:

In the [GitHub repository](https://github.com/drob-xx/TopicModelTuning):
- ExcludelistDF.csv
- ModelRunMetrics.csv

On Kaggle:
- [NewsDF.csv](https://www.kaggle.com/datasets/danrobinson707/newsdf) 

***ExcludelistDF*** is a list of stop words which can be used when building models based on the sample text.

***ModelRunMetrics*** are the metrics from 90 runs of the LDA and can be used to re-create and explore the data from the article.

***NewsDF*** is a copy of the 30,000 article DB that has both the original text as well as pre-processed versions of the articles. You will need this if you want to run your own models AND if you want to explore the text that the models are built on.

It is recommended that you place all of these files in a single location that is accessible to the Colab notebook, and set the DATA_DIR variable below to that location.

In [None]:
# Directory holding NewsDF.csv, ExcludelistDF.csv and ModelRunMetrics.csv;
# the "Load Data" cell reads all three from here.
# NOTE(review): this is a relative path, so it resolves against the notebook's
# working directory — confirm it points at the downloaded files.
DATA_DIR = 'content/Projects/POSBenchmarks/data/news'

### Installs

In [None]:
# Plotting dependency for the figures built in the "Analyze" section.
!pip install plotly.express
# !pip install gensim --upgrade
# kneed provides KneeLocator, used to find the knee of each metric curve.
!pip install kneed

# install pyyaml 5.4.1 because of a compatibility problem with plotly.express
!pip install pyyaml==5.4.1

In [None]:
# kaleido is the static-image export engine plotly uses for fig.write_image().
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[K     |████████████████████████████████| 79.9 MB 104 kB/s 
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


### Imports

In [None]:
import pandas as pd
# import plotly.express as px
import plotly.graph_objects as go
from  plotly.subplots import make_subplots

import gensim.corpora as corpora
# import gensim
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import maxabs_scale

from kneed import KneeLocator




  defaults = yaml.load(f)


### Defs

In [None]:
def plotKnee(x, y) :
  """Print the knee of a concave curve (rounded to 3 places) and plot it.

  Args:
    x: x values of the curve.
    y: y values of the curve, one per x value.
  """
  k1 = KneeLocator(x, y, curve='concave')
  # KneeLocator reports `knee` as None when it cannot find one; rounding None
  # would raise TypeError, so print it unrounded in that case.
  knee = k1.knee
  print(knee if knee is None else round(knee, 3))
  k1.plot_knee()

# def PrintTopics(model, ListOfTopics) :
#   topics2show = [[topic[0] for topic in model.show_topic(topicnum, topn=20)]  for topicnum in ListOfTopics]
#   for index, line in enumerate([' '.join(topiclist) for topiclist in topics2show]) :
#     print(index, '\t', line)

def CreateID2WordAndCorpus(TextLines, stopwords=None) :
  """Build a gensim dictionary and bag-of-words corpus from raw text lines.

  Args:
    TextLines: iterable of strings; each string is one document and is
      tokenised by whitespace.
    stopwords: optional iterable of tokens to drop from every document.
      Defaults to no stop words.

  Returns:
    (id2word, corpus): the gensim Dictionary built from the filtered
    documents, and the list of doc2bow vectors, one per document.
  """
  # A set gives constant-time membership tests instead of scanning the
  # stop-word list once per token.
  excluded = frozenset(stopwords) if stopwords is not None else frozenset()
  documents = [
      [word for word in line.split() if word not in excluded]
      for line in TextLines
  ]
  id2word = corpora.Dictionary(documents)
  corpus = [id2word.doc2bow(document) for document in documents]
  return id2word, corpus
  
# def format_topics_sentences(ldamodel, corpus, texts):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row in enumerate(ldamodel[corpus]):
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
#             else:
#                 break
#     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

#     # Add original text to the end of the output
#     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
#     return(sent_topics_df)  

# def CreateTopicAndContribMatrices(Model, Corpus) :
#   pctContribution = []
#   topicNum = []
#   for _, row in enumerate(Model[Corpus]) :
#     row = sorted(row, key=lambda x: (x[1]), reverse=True)
#     tempNum = []
#     tempVal = []
#     for _, tupe in enumerate(row) :
#       tempNum.append(tupe[0])
#       tempVal.append(tupe[1])
#     topicNum.append(tempNum)
#     pctContribution.append(tempVal)
#   pctDF = pd.DataFrame(pctContribution)
#   #Set low scores to 0
#   pctDF = pctDF * (pctDF > .005)
#   topicNumDF = pd.DataFrame(topicNum)
#   topicNumDF = ((topicNumDF + 1) * (pctDF > 0)) - 1
#   return pctDF, topicNumDF

# def NumDocsPerTopic(docDF, topicNumDF, numTopics) :
#   TopicOccurence = {topic: docDF[pd.Series((topicNumDF == topic).sum(axis=1))>0].shape[0] for topic in range(numTopics)}
#   return collections.OrderedDict(sorted(TopicOccurence.items(), key=lambda item: item[1]))

# def TotalContributionByTopic(pctContributionDF, topicNumDF, numTopics) :
#   values = [((pctContributionDF * ((topicNumDF[topicNumDF == x] -(x-1)).fillna(0))).sum()).sum() for x in range(numTopics)] 
#   TopicContrib = {x : values[x] for x in range(numTopics)}
#   return collections.OrderedDict(sorted(TopicContrib.items(), key=lambda item: item[1]))

# def AverageTopicWordLength(model) :

#   twentyWords = [[topic[0] for topic in model.show_topic(topicnum, topn=20)]  for topicnum in range(len(model.get_topics()))]

#   TopicAvgWordLen = []
#   for topic in twentyWords :
#     totalLen = 0
#     for word in topic :
#       totalLen += len(word)
#     TopicAvgWordLen.append((totalLen / len(topic)))

#   AverageTopicWordLen = {topicNum: Avg  for topicNum, Avg in enumerate(TopicAvgWordLen) }
#   return collections.OrderedDict(sorted(AverageTopicWordLen.items(), key=lambda item: item[1]))

# def GetTextAndTopicsFromModel(topic_num, text_index, corpus) :
#   selectCriteria = ModelsDF[ModelsDF['NumTopics']==topic_num]['TopicDFs'].values[0][0].index==text_index
#   topicList = sorted(ModelsDF[ModelsDF['NumTopics']==topic_num]['Models'].values[0][corpus[text_index]], key=lambda x: (x[1]), reverse=True)
#   textList = textwrap.wrap(NewsDF[selectCriteria]['Content'].values[0])
#   return topicList, textList


In [None]:
def run_Topic_Metrics(DF, metricsToPlot, mode='markers', rows=2, cols=3) :
  """Plot each metric against the number of topics on a grid of subplots.

  This definition is live (not commented out) because the "Analyze" cells
  call run_Topic_Metrics(...) directly.

  Args:
    DF: DataFrame with a 'num_topics' column and one column per metric.
    metricsToPlot: column names to plot, one subplot each, filled left to
      right and then top to bottom.
    mode: plotly scatter mode (for example 'markers' or 'lines').
    rows: number of subplot rows.
    cols: number of subplot columns.

  Returns:
    The plotly Figure holding one scatter trace per metric.
  """
  fig = make_subplots(rows=rows, cols=cols)
  for position, metric in enumerate(metricsToPlot) :
    # Wrap to the next row after `cols` plots; plotly grid cells are 1-based.
    rowIndex, colIndex = divmod(position, cols)
    fig.add_trace(
        go.Scatter(x=DF['num_topics'], y=DF[metric], name=metric, mode=mode),
        row=rowIndex + 1,
        col=colIndex + 1,
    )
  return fig

# def plotKnee(x, y) :
#   k1 = KneeLocator(x, y, curve='concave' )
#   print(round(k1.knee, 3))
#   k1.plot_knee()

# def PrintTopics(model, ListOfTopics) :
#   topics2show = [[topic[0] for topic in model.show_topic(topicnum, topn=20)]  for topicnum in ListOfTopics]
#   for index, line in enumerate([' '.join(topiclist) for topiclist in topics2show]) :
#     print(index, '\t', line)

# def CreateID2WordAndCorpus(TextLines, stopwords=[]) :
#   docs = [line.split() for line in TextLines]
#   newlist = []
#   for lines in docs :
#     adoc = []
#     for word in lines :
#       if word not in stopwords :
#         adoc.append(word)
#     newlist.append(adoc)
#   id2word = corpora.Dictionary(newlist)
#   corpus = [id2word.doc2bow(text) for text in newlist] 
#   return id2word, corpus
  
# def format_topics_sentences(ldamodel, corpus, texts):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row in enumerate(ldamodel[corpus]):
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
#             else:
#                 break
#     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

#     # Add original text to the end of the output
#     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
#     return(sent_topics_df)  

# def CreateTopicAndContribMatrices(Model, Corpus) :
#   pctContribution = []
#   topicNum = []
#   for _, row in enumerate(Model[Corpus]) :
#     row = sorted(row, key=lambda x: (x[1]), reverse=True)
#     tempNum = []
#     tempVal = []
#     for _, tupe in enumerate(row) :
#       tempNum.append(tupe[0])
#       tempVal.append(tupe[1])
#     topicNum.append(tempNum)
#     pctContribution.append(tempVal)
#   pctDF = pd.DataFrame(pctContribution)
#   #Set low scores to 0
#   pctDF = pctDF * (pctDF > .005)
#   topicNumDF = pd.DataFrame(topicNum)
#   topicNumDF = ((topicNumDF + 1) * (pctDF > 0)) - 1
#   return pctDF, topicNumDF

# def NumDocsPerTopic(docDF, topicNumDF, numTopics) :
#   TopicOccurence = {topic: docDF[pd.Series((topicNumDF == topic).sum(axis=1))>0].shape[0] for topic in range(numTopics)}
#   return collections.OrderedDict(sorted(TopicOccurence.items(), key=lambda item: item[1]))

# def TotalContributionByTopic(pctContributionDF, topicNumDF, numTopics) :
#   values = [((pctContributionDF * ((topicNumDF[topicNumDF == x] -(x-1)).fillna(0))).sum()).sum() for x in range(numTopics)] 
#   TopicContrib = {x : values[x] for x in range(numTopics)}
#   return collections.OrderedDict(sorted(TopicContrib.items(), key=lambda item: item[1]))

# def AverageTopicWordLength(model) :

#   twentyWords = [[topic[0] for topic in model.show_topic(topicnum, topn=20)]  for topicnum in range(len(model.get_topics()))]

#   TopicAvgWordLen = []
#   for topic in twentyWords :
#     totalLen = 0
#     for word in topic :
#       totalLen += len(word)
#     TopicAvgWordLen.append((totalLen / len(topic)))

#   AverageTopicWordLen = {topicNum: Avg  for topicNum, Avg in enumerate(TopicAvgWordLen) }
#   return collections.OrderedDict(sorted(AverageTopicWordLen.items(), key=lambda item: item[1]))

# def GetTextAndTopicsFromModel(topic_num, text_index, corpus) :
#   selectCriteria = ModelsDF[ModelsDF['NumTopics']==topic_num]['TopicDFs'].values[0][0].index==text_index
#   topicList = sorted(ModelsDF[ModelsDF['NumTopics']==topic_num]['Models'].values[0][corpus[text_index]], key=lambda x: (x[1]), reverse=True)
#   textList = textwrap.wrap(NewsDF[selectCriteria]['Content'].values[0])
#   return topicList, textList


#### Model Monster

In [None]:
import pickle
import collections
import textwrap

class ModelMonster:
  """Train several LDA models over one corpus and expose per-topic summaries.

  Construction tokenises the corpus text, builds a gensim Dictionary and
  bag-of-words corpus, trains one LdaModel per entry of ModelsDict, and then
  derives two DataFrames per model (one row per document, columns ordered
  from the strongest to the weakest topic of that document):

    pctContribDFs[key]  topic contributions, with values at or below the
                        threshold set to 0
    topicIDDFs[key]     the matching 0-based topic ids, with -1 wherever the
                        contribution was zeroed

  The public Get*/Print* methods accept `modelKeys` as a dict (its keys are
  used), a single model name, or a list of model names; an empty list means
  "all models". Any other type prints an error and the method returns None.
  """

  # ModelsDict key = String Identifier, value = Topic Size
  def __init__(self, ModelsDict, CorpusText, StopWords=None, ContribMinThreshold=0.005):
    """Build the corpus, train every model and compute the summary frames.

    Args:
      ModelsDict: mapping of model name -> number of topics.
      CorpusText: iterable of document strings (whitespace-tokenised).
      StopWords: optional iterable of tokens to exclude. Defaults to none.
      ContribMinThreshold: contributions not greater than this are zeroed.
    """
    self.modelsdict = ModelsDict
    self.corpustext = CorpusText
    self.stopwords = [] if StopWords is None else StopWords
    self.models = {}
    self.pctContribDFs = {}
    self.topicIDDFs = {}
    self.numdocspertopic = {}
    self.corpus = None

    self.contrib_min_threshold = ContribMinThreshold
    self.num_topics_to_retrieve = 20

    self._GenerateModels()
    self._GenerateMatricies()

    print('Models Complete')

  def save(self, filepath):
    """Pickle this instance to `filepath`."""
    with open(filepath, 'wb') as f:
      pickle.dump(self, f)

  @classmethod
  def load(cls, filepath):
    """Unpickle and return an instance previously written by save().

    WARNING: unpickling can execute arbitrary code; only load files you
    created yourself or otherwise trust.
    """
    with open(filepath, 'rb') as f:
      return pickle.load(f)

  def _GenerateModels(self):
    """Build docs/id2word/corpus (once) and train one LdaModel per model key."""
    if self.corpus is None:
      # Documents are kept even when there are no stop words; previously an
      # empty stop list produced an empty corpus.
      stopwords = frozenset(self.stopwords)
      self.docs = [
          [word for word in text.split() if word not in stopwords]
          for text in self.corpustext
      ]
      print('Generating Dictionary')
      self.id2word = corpora.Dictionary(self.docs)
      print('Generating Corpus')
      self.corpus = [self.id2word.doc2bow(doc) for doc in self.docs]

    # The models are trained without an id2word mapping, so topic terms come
    # back as stringified token ids; _GetTopics maps them back to words.
    for key, numTopics in self.modelsdict.items():
      print('Generating ', key, ' LDA model')
      self.models[key] = LdaModel(self.corpus, num_topics=numTopics)

  def _GenerateMatricies(self):
    """Fill pctContribDFs and topicIDDFs for every model."""
    for key in self.modelsdict:
      topicIDs = []
      pctContribution = []
      for row in self.models[key][self.corpus]:
        # Strongest topic first, so column 0 is each document's dominant topic.
        ranked = sorted(row, key=lambda pair: pair[1], reverse=True)
        topicIDs.append([pair[0] for pair in ranked])
        pctContribution.append([pair[1] for pair in ranked])
      pctDF = pd.DataFrame(pctContribution)
      # Set low scores to 0
      self.pctContribDFs[key] = pctDF * (pctDF > self.contrib_min_threshold)
      # Topic IDs are 0 based: shift them up by 1, mask out every cell that
      #   has no remaining contribution, then shift back so that masked
      #   cells end up as -1.
      self.topicIDDFs[key] = ((pd.DataFrame(topicIDs) + 1) * (self.pctContribDFs[key] > 0)) - 1

  def _ResolveModelKeys(self, modelKeys):
    """Normalise a modelKeys argument to a list of model names.

    Returns None (after printing an error) when the argument is not a dict,
    a string, a list, or None. None and an empty list select every model.
    """
    if modelKeys is None:
      return list(self.modelsdict.keys())
    if isinstance(modelKeys, dict):
      return list(modelKeys.keys())
    if isinstance(modelKeys, str):
      return [modelKeys]
    if isinstance(modelKeys, list):
      # An empty list means "run all models".
      return modelKeys if modelKeys else list(self.modelsdict.keys())
    print('ERROR - keys must be dict(keys), a single string, or list of strings resolving to a valid model name')
    return None

  # Returns a dict where each key is the Model's name and the data is an
  #    OrderedDict of TopicID -> number of documents in which that topic
  #    appears with an above-threshold contribution, largest count first.
  def GetNumDocsPerTopic(self, modelKeys=None):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    return {modelKey: self._GetNumDocsPerTopic(modelKey) for modelKey in modelKeys}

  # returns Ordered Dict of number of documents a topicID contributes to
  def _GetNumDocsPerTopic(self, modelKey):
    topicIDDF = self.topicIDDFs[modelKey]
    # For each topic, flag the rows (documents) that contain the topicID in
    #   any column and count them.
    TopicOccurence = {
        topicID: int((topicIDDF == topicID).any(axis=1).sum())
        for topicID in range(self.modelsdict[modelKey])
    }
    return collections.OrderedDict(sorted(TopicOccurence.items(), key=lambda item: item[1], reverse=True))

  # Returns a dict where each key is the Model's name and the data is an
  #    OrderedDict of TopicID -> sum of that topic's above-threshold
  #    contributions over all documents, largest total first (potentially
  #    useful when comparing the weight of each topic to its documents)
  def GetTotContribPerTopic(self, modelKeys=None):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    return {modelKey: self._GetTotContribPerTopic(modelKey) for modelKey in modelKeys}

  def _GetTotContribPerTopic(self, modelKey):
    topicIDDF = self.topicIDDFs[modelKey]
    pctContribDF = self.pctContribDFs[modelKey]
    # Masked cells hold -1 and therefore never match a real topic id.
    pctContrib = {
        topicID: ((pctContribDF * (topicIDDF == topicID)).sum()).sum()
        for topicID in range(self.modelsdict[modelKey])
    }
    return collections.OrderedDict(sorted(pctContrib.items(), key=lambda item: item[1], reverse=True))

  # Returns a dict where each key is the Model's name and the data is an
  #   OrderedDict of TopicID -> average string length of that topic's
  #   keywords, sorted from the shortest to the longest average.
  def GetAverageTopicWordLength(self, modelKeys=None):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    return {modelKey: self._GetAverageTopicWordLength(modelKey) for modelKey in modelKeys}

  def _GetAverageTopicWordLength(self, modelKey):
    model = self.models[modelKey]

    # Go through _GetTopics so lengths are measured on the actual words;
    # model.show_topic alone yields stringified token ids for these models.
    topicWords = self._GetTopics(model, range(len(model.get_topics())), self.num_topics_to_retrieve)
    avgTopicWordLengths = {
        topicNum: sum(len(word) for word in words) / len(words)
        for topicNum, words in enumerate(topicWords)
    }
    return collections.OrderedDict(sorted(avgTopicWordLengths.items(), key=lambda item: item[1]))

  # Returns a dict where each key is the Model's name and the data is a list
  #   (indexed by TopicID) of the topN words that make up each topic
  def GetTopics(self, modelKeys, topN=10):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    returnDict = {}
    for modelKey in modelKeys:
      returnDict[modelKey] = self._GetTopics(self.models[modelKey], [*range(self.modelsdict[modelKey])], topN)

    return returnDict

  def _GetTopics(self, model, listOfTopics, topN=10):
    # show_topic yields (token id as string, weight); convert the ids back to
    # ints and look the words up in the dictionary built for the corpus.
    vals = [[int(term[0]) for term in model.show_topic(topicnum, topn=topN)] for topicnum in listOfTopics]
    return [[self.id2word[tokenID] for tokenID in tokenIDs] for tokenIDs in vals]

  # Prints a descending sorted list of the most to least represented topic.
  #   Includes number of documents the topic is dominant for, the topic id
  #   and the words that comprise that topic
  def PrintTopicsCount(self, modelKeys):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    for modelKey in modelKeys:
      self._PrintTopicsCount(modelKey)

  def _PrintTopicsCount(self, modelkey):
    # Column 0 holds each document's dominant topic; most_common() orders the
    # tallies from the largest count down.
    dominantCounts = collections.Counter(self.topicIDDFs[modelkey][0]).most_common()
    print('')
    print('==============================  ' + modelkey + '  ===================================')
    numHD, topicHD, txtHD, underscore = 'Count', 'TopicID', 'Text', '-------'
    print(f'{numHD:^8} {topicHD:^10} {txtHD}')
    print(f'{underscore:^8} {underscore:^10} {underscore}')
    allTopics = self.GetTopics(modelkey)[modelkey]
    for topicID, count in dominantCounts:
      topiclst = ' '.join(allTopics[topicID])
      print(f'{count: ^ 8} {topicID: ^ 10} {topiclst:^}')

  # Prints n randomly selected sample text for a given model where the dominant
  #   topic for that text is within a range of likely presence in that Document
  #   Prints the model name, the document's id, the dominant topic, the percent
  #   likelihood that the topic is present in the document, the topic key word
  #   list and n lines of the sample text.
  def PrintSampleText(self, modelKeys, newsDF, MinRelevance=0.2, MaxRelevance=0.6, SampleSize=5, LinesToPrint=5):
    modelKeys = self._ResolveModelKeys(modelKeys)
    if modelKeys is None:
      return

    for modelKey in modelKeys:
      self._PrintSampleText(modelKey, newsDF, MinRelevance, MaxRelevance, SampleSize, LinesToPrint)

  def _PrintSampleText(self, modelkey, newsDF, MinRelevance, MaxRelevance, SampleSize, LinesToPrint):
    textSamples = pd.DataFrame()
    textSamples['pctContribution'] = self.pctContribDFs[modelkey][0]
    textSamples['topics'] = self.topicIDDFs[modelkey][0]
    textSamples['text'] = newsDF['Content']

    inRange = (textSamples['pctContribution'] >= MinRelevance) & (textSamples['pctContribution'] <= MaxRelevance)
    candidates = textSamples[inRange]
    if candidates.empty:
      # Sampling from an empty frame raises; report the empty range instead.
      print('No documents for model', modelkey, 'with a dominant topic contribution between', MinRelevance, 'and', MaxRelevance)
      return
    # Sampling is with replacement, so the same document can appear twice.
    samplesDF = candidates.sample(n=SampleSize, replace=True)

    topicslist = self.GetTopics(modelkey)[modelkey]

    for docID, sample in samplesDF.iterrows():
      print('')
      print('************************')
      print('Model: ', modelkey)
      print('Document ID: ', docID)
      print('Topic: ', sample['topics'])
      print('Contribution: ', sample['pctContribution'])
      print('')
      print(' '.join(topicslist[sample['topics']]))
      print('')
      for line in textwrap.wrap(sample['text'])[:LinesToPrint]:
        print(line)
    print('')
    print('')

### Load Data 

In [None]:
# Read the article corpus, the stop-word list and the pre-computed run metrics
# from DATA_DIR, in that order.
NewsDF, StopWordsDF, ModelRunMetricsDF = (
    pd.read_csv(f'{DATA_DIR}/{csvName}')
    for csvName in ('NewsDF.csv', 'ExcludelistDF.csv', 'ModelRunMetrics.csv')
)

# Model name -> number of topics for the six models under evaluation.
ModelsToEval = dict(Five=5, Ten=10, Twenty=20, Thirtyfive=35, Fifty=50, Eighty=80)

### Analyze

In [None]:
# Metric column names; the cells below index ModelRunMetricsDF and its
# per-topic-count summary with these names for plotting and knee detection.
metrics = ['KLBac', 'KLDiv',  'KLUni', 'KLVac', 'JaccardSim', 'InvertedRBO', 'Topic_Diversity', 'NPMI', 'C_V']

In [None]:
# Figure 1: every metric from the raw run data plotted against num_topics,
# one subplot per metric on a 3-row grid, then saved as a JPEG.
fig = run_Topic_Metrics(ModelRunMetricsDF, metrics, rows=3)
fig.show()
fig.write_image('Fig1.jpeg')



In [None]:
# Average every metric over the runs that share the same number of topics.
ninetyRunsSummaryDF = ModelRunMetricsDF.groupby(['num_topics']).mean()
# groupby moves num_topics into the index; copy it back into a column because
# the plotting and knee code below reads DF['num_topics'].
ninetyRunsSummaryDF['num_topics'] = ninetyRunsSummaryDF.index

In [None]:
# Figure 2: the same grid of plots, using the per-topic-count averages.
fig = run_Topic_Metrics(ninetyRunsSummaryDF, metrics, rows=3)
fig.show()
fig.write_image('Fig2.jpeg')


In [None]:
# Print the knee (in num_topics) that KneeLocator finds on each averaged
# metric curve, treating every curve as concave.
for metric in metrics :
  k1 = KneeLocator(ninetyRunsSummaryDF['num_topics'], ninetyRunsSummaryDF[metric], curve='concave' )
  print(metric+':', k1.knee)

KLBac: 50
KLDiv: 50
KLUni: 80
KLVac: 50
JaccardSim: 5
InvertedRBO: 35
Topic_Diversity: 10
NPMI: 20
C_V: 20


In [None]:
# Print and plot the knee of each averaged metric curve.
for metric in metrics :
  plotKnee(ninetyRunsSummaryDF['num_topics'], ninetyRunsSummaryDF[metric])

In [None]:
# Figure 3: one histogram per model of the dominant topic of each document,
# laid out on a 2 x 3 grid.
# NOTE(review): MM is expected to be a ModelMonster instance; its construction
# is not shown in this part of the notebook — confirm it is created first.
fig = make_subplots(rows=2, cols=3)
traces = []
# 1-based grid position of the next subplot (xval = column, yval = row).
xval = 1
yval = 1
for modelkey in ModelsToEval.keys() :
  # Column 0 of topicIDDFs is the strongest topic of each document.
  xvals = MM.topicIDDFs[modelkey][0]
  traces.append(go.Histogram(x=xvals, name=modelkey))
for index, trace in enumerate(traces) :
  fig.append_trace(trace, yval, xval)
  xval += 1
  # Wrap to the next row after the third column.
  if xval > 3 :
    xval = 1
    yval += 1
fig.update_layout(title='Histogram - Count of Dominant Topics')
# Treat topic ids as categories and order the bars by descending count.
fig.update_xaxes(type='category', categoryorder='total descending')
fig.show()
fig.write_image('Fig3.png')


In [None]:
# Figure 4: for each model, a box plot of the dominant-topic contribution of
# every document, grouped by dominant topic id; one model per row.
# NOTE(review): MM is expected to be a ModelMonster instance; its construction
# is not shown in this part of the notebook — confirm it is created first.
fig = make_subplots(rows=6, cols=1)
traces = []
# Single-column grid: xval stays at 1 and yval advances one row per model.
xval = 1
yval = 1
for modelname in ModelsToEval.keys() :
  pctevalDF = MM.pctContribDFs[modelname]
  topicIDDF = MM.topicIDDFs[modelname]
  # Column 0 of both frames refers to each document's strongest topic.
  fig.append_trace(go.Box(y=pctevalDF[0], x=topicIDDF[0], name=modelname), yval, xval)
  yval += 1

fig.update_layout(width=1000, height=1800)
# Same y scale on every row so the models can be compared directly.
fig.update_yaxes(range=[0.0, 1.1], tickvals=[0, .25, .5, .75, 1])

fig.show()
fig.write_image('Fig4.png')
