<a href="https://colab.research.google.com/github/drob-xx/Is_LDA_Dead/blob/main/Is_LDA_Dead.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

### Installs

In [1]:
!pip install -U gensim==4.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.2.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 8.2 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [2]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.8 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.3 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.3 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 37.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 37.2 MB/s 
Collecting se

### Imports

In [99]:
# import sqlite3
import pandas as pd
import pickle
import numpy as np
from copy import copy
from tqdm.notebook import tqdm

from collections import OrderedDict, Counter

from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ensemblelda import EnsembleLda

from bertopic import BERTopic
from hdbscan import HDBSCAN

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

import plotly.express as px

### Defs

In [4]:
def load(filepath):
      with open(filepath, 'rb') as fp:
          return pickle.load(fp)

def save(var, filepath):
      with open(filepath, 'wb') as fp:
          return pickle.dump(var, fp)

In [5]:
def CreateID2WordAndCorpus(TextLines, StopWords=[], id2word=None) :
  docs = [line.split() for line in TextLines]
  newlist = []
  for lines in docs :
    adoc = []
    for word in lines :
      if word not in StopWords :
        adoc.append(word)
    newlist.append(adoc)
  if id2word == None :
    id2word = corpora.Dictionary(newlist)
  corpus = [id2word.doc2bow(text) for text in newlist] 
  return id2word, corpus

In [6]:
def PrintTopics(lda_model, topic_dict) :
  topicWords = [[topic[0] for topic in lda_model.show_topic(topicnum, topn=10)] for topicnum in [*range(len(lda_model.get_topics()))]]
  numHD, topicHD, txtHD, underscore = 'Count', 'TopicID', 'Text', '-------'
  print(f'{topicHD:^10} {numHD:^8} {txtHD}')
  print(f'{underscore:^10} {underscore:^8} {underscore}')
  for idx, val in topic_dict.items() :
      vocab = ' '.join(topicWords[idx])
      print(f'{idx: ^10} {val: ^8} {vocab:^}')

In [7]:
def ProcessModel(model, corpus) :
  topic_ids = []
  pct_contribution = []
  for row in tqdm(model[corpus]) :
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    tempNum = []
    tempVal = []
    for tupe in row :
      tempNum.append(tupe[0])
      tempVal.append(tupe[1])
    topic_ids.append(tempNum)
    pct_contribution.append(tempVal)
  dominant_topics = [x[0] for x in topic_ids]
  topic_id_dict = pd.Series(dominant_topics).value_counts().to_dict()
  return pct_contribution, topic_ids, topic_id_dict


In [9]:
def CompareModelAssignments(a_model_assignments, b_model_assignments, idx_offset=0) :
  a_model_s = pd.Series(a_model_assignments)
  b_model_s = pd.Series(b_model_assignments)
  set_a = set(a_model_s.unique())
  set_b = set(b_model_s.unique())
  compare_matrix = np.zeros((len(set_a), len(set_b)))
  for a_row in tqdm(set_a) :
    for b_col in set_b :
        v_dict = b_model_s[a_model_s==a_row].value_counts().to_dict() 
        divisor = b_model_s[a_model_s==a_row].shape[0]
        for key in v_dict.keys() :
          compare_matrix[a_row+idx_offset, key+idx_offset] = v_dict[key] / divisor
  compDF = pd.DataFrame(compare_matrix)
  compDF.columns=[str(val) for val in set_b]
  compDF.index=[str(val) for val in set_a]
  return compDF

### Change Directory

In [10]:
cd /content/drive/MyDrive/Projects/IsLDADead/

/content/drive/MyDrive/Projects/IsLDADead


## Create Dictionary and Corpus

In [11]:
NewsDF = pd.read_csv('./NewsDF.csv')

In [12]:
# Add a universal ID number because later the data will be split in two parts
#  and the DocID will keep things in sync when the data is rejoined
if 'DocID' not in NewsDF.columns :
  NewsDF['DocID'] = [*range(NewsDF.shape[0])]
  NewsDF.to_csv('./NewsDF.csv', index=False)

In [7]:
ExcludeListDF = pd.read_csv('./ExcludelistDF.csv')

In [8]:
text = NewsDF['Clean_Text']
exclude = ExcludeListDF['word']

In [9]:
id2word, corpus = CreateID2WordAndCorpus(text, exclude)
save(id2word, './id2word')
save(corpus, './corpus')
id2word.filter_extremes(keep_n=20000)
id2word, corpus2 = CreateID2WordAndCorpus(text, exclude, id2word)


In [10]:
save(id2word, './id2word20K')
save(corpus2, './corpus20K')

In [14]:
id2word = load('./id2word20K')
corpus = load('./corpus20K')

## Run LDA

Assuming symmetric Dirichlet distributions (for simplicity), a low alpha value places more weight on having each document composed of only a few dominant topics (whereas a high value will return many more relatively dominant topics). Similarly, a low beta value places more weight on having each topic composed of only a few dominant words.

In [140]:
distance_workers = 2
ensemble_workers = 2
topic_model_class = LdaMulticore
passes = 15
num_topics=20
num_models=16
kwargs = {'alpha': 0.05, 'eta': 0.5}

LDAmodel1 = EnsembleLda(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            passes=passes,
            ensemble_workers=ensemble_workers,
            distance_workers=distance_workers,
            num_models=num_models,
            topic_model_class=topic_model_class,
            **kwargs,
          )
save(LDAmodel1, './EnsembleLDAmodel1_16models_20topics_15pass')

LDAmodel2 = EnsembleLda(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            passes=passes,
            ensemble_workers=ensemble_workers,
            distance_workers=distance_workers,
            num_models=num_models,
            topic_model_class=topic_model_class,
            **kwargs,
          )
save(LDAmodel2, './EnsembleLDAmodel2_16models_20topics_15pass')

[0.082343236, 0.20219533, 0.16497013, 0.23994662, 0.3098793]

In [119]:
LDAmodel1 = load('./EnsembleLDAmodel1_16models_20topics_15pass')
LDAmodel2 = load('./EnsembleLDAmodel2_16models_20topics_15pass')

In [13]:
LDAmodel.generate_gensim_representation()

In [93]:
PctDF, TopicIDs, TopicIDDict = ProcessModel(LDAmodel1.generate_gensim_representation(), corpus)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [17]:
PctDF2, TopicIDs2, TopicIDDict2 = ProcessModel(LDAmodel2.generate_gensim_representation(), corpus)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [18]:
comparisonDF = pd.DataFrame(CompareModelAssignments([aval[0] for aval in TopicIDs], [bval[0] for bval in TopicIDs2]))
comparisonDF.columns = [str(col) for col in comparisonDF.columns]
comparisonDF.index = [str(idx) for idx in comparisonDF.index]

  0%|          | 0/8 [00:00<?, ?it/s]

In [19]:
import plotly.express as px
fig = px.imshow(comparisonDF, color_continuous_scale='Portland')
fig.show()

In [21]:
PrintTopics(LDAmodel1.generate_gensim_representation(), TopicIDDict)

 TopicID    Count   Text
 -------   -------  -------
    7       10021   get know think life want people tell day look family
    0        4741   game player club play win team season first goal last
    3        4600   attack military force government group kill country people security official
    6        2825   mr government party uk people labour leader minister country election
    4        2348   company pay business money bank price cost cent market new
    5        2137   health study people patient use find dr woman risk drug
    1        1850   president obama state trump republican campaign vote american election white_house
    2        1478   use user facebook company online post new people phone apple


In [22]:
PrintTopics(LDAmodel2.generate_gensim_representation(), TopicIDDict2)

 TopicID    Count   Text
 -------   -------  -------
    6        4562   police officer man car tell find kill old report arrest
    8        3978   show get film know star people think see want write
    10       3630   family child mr tell mother old home day leave life
    3        3114   government attack military force country group people kill official security
    0        2645   club player game play season team goal win last first
    11       2528   mr government uk party people council labour bbc last work
    1        2050   game team win world first race sport play second last
    9        2035   company pay business new money bank cost price work market
    7        1953   health patient study people use doctor dr find risk drug
    5        1543   use user facebook company new online people post phone apple
    4        1516   president obama state trump republican vote campaign american election white_house
    2        446    win right goal half minute free_kick leave 

### Gen TNSE from LDA:

In [53]:
LDAModelMatrix = np.zeros((len(docreps), len(LDAmodel1.get_topics())))
for idx, row in enumerate(docreps): 
  for tupe in row :
    LDAModelMatrix[idx, tupe[0]] = tupe[1]


In [120]:
LDAModelMatrix2 = np.zeros((len(docreps), len(LDAmodel2.get_topics())))
for idx, row in enumerate(docreps): 
  for tupe in row :
    LDAModelMatrix2[idx, tupe[0]] = tupe[1]

In [None]:
tsne_LDA_model = TSNE(init='pca', metric='cosine', verbose=2)
tsne_LDA_embedding = tsne_LDA_model.fit_transform(LDAModelMatrix)

In [62]:
save(tsne_LDA_embedding, './tsne_LDA_embedding')

In [None]:
docreps = [doc for doc in LDAmodel1[corpus]]
docreps2 = [doc for doc in LDAmodel2[corpus]]

In [127]:
NewsVizDF = pd.DataFrame()
NewsVizDF['lda_x'] = tsne_LDA_embedding[:,0]
NewsVizDF['lda_y'] = tsne_LDA_embedding[:,1]

topics = np.full((len(docreps),), -1)
weights = np.zeros((len(docreps),))
for idx, row in enumerate(docreps) :
  tops = [tupe[0] for tupe in row]
  pcts = [tupe[1] for tupe in row]
  topics[idx] = tops[np.argmax(pcts)]
  weights[idx] = np.max(pcts)

NewsVizDF['lda1_topics'] = [str(top) for top in topics]
NewsVizDF['lda1_weights'] = weights

topics2 = np.full((len(docreps2),), -1)
weights2 = np.zeros((len(docreps2),))
for idx, row in enumerate(docreps2) :
  tops = [tupe[0] for tupe in row]
  pcts = [tupe[1] for tupe in row]
  topics2[idx] = tops[np.argmax(pcts)]
  weights2[idx] = np.max(pcts)

NewsVizDF['lda2_topics'] = [str(top) for top in topics2]
NewsVizDF['lda2_weights'] = weights2

TSNE_BERT = load('./TSNE_BERT_2D_embeddings')
NewsVizDF['bert_x'] = TSNE_BERT[:,0]
NewsVizDF['bert_y'] = TSNE_BERT[:,1]

In [131]:
fig = px.scatter(
    NewsVizDF,
    x='lda_x', y='lda_y',
    color='lda1_topics',
    width=1400,
    height=1000,
    hover_data = {'lda_x' : False,
                  'lda_y' : False,
                  'lda1_weights' : True}
    
)
fig.show()

In [132]:
fig = px.scatter(
    NewsVizDF,
    x='lda_x', y='lda_y',
    color='lda2_topics',
    width=1400,
    height=1000,
    hover_data = {'lda_x' : False,
                  'lda_y' : False,
                  'lda1_weights' : True}
    
)
fig.show()

## Run BERTopic

In [None]:
stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'say', 'says', 'year', 'years', 'new', 'mr'])
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=stop_words)

BERT_ALL_2 = BERTopic(
                  vectorizer_model=vectorizer_model,
                  calculate_probabilities=False,
                  verbose=True,
                  low_memory=True,
                  min_topic_size=150
                  )

# BERT_ALL.get_params()
# BERT_ALL_1.hdbscan_model.min_cluster_size=156
# BERT_ALL_1.hdbscan_model.min_sample_size=int(156 * .25)

BERT_ALL_2.fit_transform(NewsDF['Content'])


In [15]:
BERT_ALL_1.save('./BERT_ALL_1')
BERT_ALL_2.save('./BERT_ALL_2')

In [13]:
BERT_ALL_1 = BERTopic.load('./BERT_ALL_1')

In [154]:
LDATopics = [aval[0] for aval in TopicIDs]
BERTopics = [bval+1 for bval in BERT_ALL_1.hdbscan_model.labels_]
comparisonDF = CompareModelAssignments(BERTopics, LDATopics)
comparisonDF.sum(axis=1)

  0%|          | 0/7 [00:00<?, ?it/s]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
dtype: float64

In [157]:
comparisonDF.index = [str(int(idx)-1) for idx in comparisonDF.index]

In [158]:

fig = px.imshow(comparisonDF, color_continuous_scale='Portland')
fig.update_layout(
    xaxis={'side': 'top'}, 
)
fig.show()

In [149]:
b_model_s = pd.Series(LDATopics)
a_model_s = pd.Series(BERTopics)
idx_offset = 0

set_a = set(a_model_s.unique())
set_b = set(b_model_s.unique())
compare_matrix = np.zeros((len(set_a), len(set_b)))
for a_row in tqdm(set_a) :
  for b_col in set_b :
      v_dict = b_model_s[a_model_s==a_row].value_counts().to_dict() 
      divisor = b_model_s[a_model_s==a_row].shape[0]
      for key in v_dict.keys() :
        compare_matrix[a_row+idx_offset, key+idx_offset] = v_dict[key] / divisor
compDF = pd.DataFrame(compare_matrix)
compDF.columns=[str(val) for val in set_b]
compDF.index=[str(val) for val in set_a]
compDF.sum(axis=1)

  0%|          | 0/7 [00:00<?, ?it/s]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
dtype: float64

### Tuning BERTopic

In [15]:
topic_labels = BERT_ALL_1._map_predictions(BERT_ALL_1.hdbscan_model.labels_)

In [44]:
News1DF = NewsDF[pd.Series(topic_labels)==0].copy()
News1DF.reset_index(inplace=True, drop=True)
News2DF = NewsDF[pd.Series(topic_labels)!=0].copy()
News2DF.reset_index(inplace=True, drop=True)
News1DF.shape, News2DF.shape

((24939, 9), (5061, 9))

In [45]:
stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'say', 'says', 'year', 'years', 'new', 'mr'])
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=stop_words)


BERT_1 = BERTopic(
                  vectorizer_model=vectorizer_model,
                  calculate_probabilities=False,
                  verbose=True,
                  low_memory=True,                  
                  )

BERT_1.hdbscan_model.min_samples=165
BERT_1.hdbscan_model.min_cluster_size=330

# Fit the model

BERT_1_topics, _ = BERT_1.fit_transform(News1DF['Content'])

Batches:   0%|          | 0/780 [00:00<?, ?it/s]

2022-06-27 22:54:32,216 - BERTopic - Transformed documents to Embeddings
2022-06-27 22:54:52,697 - BERTopic - Reduced dimensionality
2022-06-27 22:54:56,005 - BERTopic - Clustered reduced embeddings


In [46]:
BERT_1.save('./BERT_SPLIT_A')

In [None]:
BERT_1.get_topic_info()

In [47]:
stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'say', 'says', 'year', 'years', 'new', 'mr'])
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=stop_words)
hdbscan_model = HDBSCAN(min_samples=40,  
                        min_cluster_size=80)

BERT_2 = BERTopic(
                  vectorizer_model=vectorizer_model,
                  calculate_probabilities=False,
                  verbose=True,
                  low_memory=True,                  
                  )

# Set UMAPs random state so that UMAP output will be consistent across runs

BERT_2.hdbscan_model.min_samples=40
BERT_2.hdbscan_model.min_cluster_size=80


# Fit the model

BERT_2_Topics, _ = BERT_2.fit_transform(News2DF['Content'])



Batches:   0%|          | 0/159 [00:00<?, ?it/s]

2022-06-27 23:00:05,443 - BERTopic - Transformed documents to Embeddings
2022-06-27 23:00:16,426 - BERTopic - Reduced dimensionality
2022-06-27 23:00:16,656 - BERTopic - Clustered reduced embeddings


In [63]:
BERTopicsDF1 = News1DF[['DocID']].copy()
BERTopicsDF1['NewTopic'] = BERT_1._map_predictions(BERT_1.hdbscan_model.labels_)

BERTopicsDF2 = News2DF[['DocID']].copy()
BERTopicsDF2['NewTopic'] = BERT_2._map_predictions(BERT_2.hdbscan_model.labels_)

topic_offset = len(BERTopicsDF1['NewTopic'].unique()) - 1
BERTopicsDF2['NewTopic'] = [top + topic_offset if top != -1 else -1 for top in BERTopicsDF2['NewTopic']]

NewTopicsDF = pd.concat([BERTopicsDF1, BERTopicsDF2], axis=0)
NewTopicsDF.sort_values(by=['DocID'], inplace=True)
NewTopicsDF.reset_index(inplace=True, drop=True)