<a href="https://colab.research.google.com/github/drob-xx/Is_LDA_Dead/blob/main/Is_LDA_Dead.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

### Installs

In [1]:
!pip install -U gensim==4.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.2.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.2 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [2]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.8 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.2 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 19.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.5 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 48.1 MB/s 
Collecting se

### Imports

In [3]:
# import sqlite3
import pandas as pd
import pickle
import numpy as np
from copy import copy
from tqdm.notebook import tqdm

from collections import OrderedDict, Counter

from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ensemblelda import EnsembleLda

from bertopic import BERTopic

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

### Defs

In [4]:
def load(filepath):
      with open(filepath, 'rb') as fp:
          return pickle.load(fp)

def save(var, filepath):
      with open(filepath, 'wb') as fp:
          return pickle.dump(var, fp)

In [5]:
def CreateID2WordAndCorpus(TextLines, StopWords=[], id2word=None) :
  docs = [line.split() for line in TextLines]
  newlist = []
  for lines in docs :
    adoc = []
    for word in lines :
      if word not in StopWords :
        adoc.append(word)
    newlist.append(adoc)
  if id2word == None :
    id2word = corpora.Dictionary(newlist)
  corpus = [id2word.doc2bow(text) for text in newlist] 
  return id2word, corpus

In [6]:
def PrintTopics(lda_model, topic_dict) :
  topicWords = [[topic[0] for topic in lda_model.show_topic(topicnum, topn=10)] for topicnum in [*range(len(lda_model.get_topics()))]]
  numHD, topicHD, txtHD, underscore = 'Count', 'TopicID', 'Text', '-------'
  print(f'{topicHD:^10} {numHD:^8} {txtHD}')
  print(f'{underscore:^10} {underscore:^8} {underscore}')
  for idx, val in topic_dict.items() :
      vocab = ' '.join(topicWords[idx])
      print(f'{idx: ^10} {val: ^8} {vocab:^}')

In [7]:
def ProcessModel(model, corpus) :
  topic_ids = []
  pct_contribution = []
  for row in tqdm(model[corpus]) :
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    tempNum = []
    tempVal = []
    for tupe in row :
      tempNum.append(tupe[0])
      tempVal.append(tupe[1])
    topic_ids.append(tempNum)
    pct_contribution.append(tempVal)
  dominant_topics = [x[0] for x in topic_ids]
  topic_id_dict = pd.Series(dominant_topics).value_counts().to_dict()
  return pct_contribution, topic_ids, topic_id_dict


In [8]:
# def CompareModelAssignments(a_model_assignments, b_model_assignments) :
#   a_model_s = pd.Series(a_model_assignments)
#   b_model_s = pd.Series(b_model_assignments)
#   set_a = set(a_model_s.unique())
#   set_b = set(b_model_s.unique())
#   compare_matrix = np.empty((len(set_a), len(set_b)))
#   for a_row in tqdm(set_a) :
#     for b_col in set_b :
#       for key, val in b_model_s[a_model_s==a_row].value_counts().to_dict().items() :
#         compare_matrix[a_row, key] = val / a_model_s[a_model_s==a_row].shape[0]
#   return compare_matrix

In [150]:
def CompareModelAssignments(a_model_assignments, b_model_assignments, idx_offset=0) :
  a_model_s = pd.Series(a_model_assignments)
  b_model_s = pd.Series(b_model_assignments)
  set_a = set(a_model_s.unique())
  set_b = set(b_model_s.unique())
  compare_matrix = np.zeros((len(set_a), len(set_b)))
  for a_row in tqdm(set_a) :
    for b_col in set_b :
        v_dict = b_model_s[a_model_s==a_row].value_counts().to_dict() 
        divisor = b_model_s[a_model_s==a_row].shape[0]
        for key in v_dict.keys() :
          compare_matrix[a_row+idx_offset, key+idx_offset] = v_dict[key] / divisor
  compDF = pd.DataFrame(compare_matrix)
  compDF.columns=[str(val) for val in set_b]
  compDF.index=[str(val) for val in set_a]
  return compDF

### Change Directory

In [9]:
cd /content/drive/MyDrive/Projects/IsLDADead/

/content/drive/MyDrive/Projects/IsLDADead


## Create Dictionary and Corpus

In [10]:
NewsDF = pd.read_csv('./NewsDF.csv')

In [7]:
ExcludeListDF = pd.read_csv('./ExcludelistDF.csv')

In [8]:
text = NewsDF['Clean_Text']
exclude = ExcludeListDF['word']

In [9]:
id2word, corpus = CreateID2WordAndCorpus(text, exclude)
save(id2word, './id2word')
save(corpus, './corpus')
id2word.filter_extremes(keep_n=20000)
id2word, corpus2 = CreateID2WordAndCorpus(text, exclude, id2word)


In [10]:
save(id2word, './id2word20K')
save(corpus2, './corpus20K')

In [92]:
id2word = load('./id2word20K')
corpus = load('./corpus20K')

## Run LDA

Assuming symmetric Dirichlet distributions (for simplicity), a low alpha value places more weight on having each document composed of only a few dominant topics (whereas a high value will return many more relatively dominant topics). Similarly, a low beta value places more weight on having each topic composed of only a few dominant words.

In [140]:
distance_workers = 2
ensemble_workers = 2
topic_model_class = LdaMulticore
passes = 15
num_topics=20
num_models=16
kwargs = {'alpha': 0.05, 'eta': 0.5}

LDAmodel1 = EnsembleLda(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            passes=passes,
            ensemble_workers=ensemble_workers,
            distance_workers=distance_workers,
            num_models=num_models,
            topic_model_class=topic_model_class,
            **kwargs,
          )
save(LDAmodel1, './EnsembleLDAmodel1_16models_20topics_15pass')

LDAmodel2 = EnsembleLda(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            passes=passes,
            ensemble_workers=ensemble_workers,
            distance_workers=distance_workers,
            num_models=num_models,
            topic_model_class=topic_model_class,
            **kwargs,
          )
save(LDAmodel2, './EnsembleLDAmodel2_16models_20topics_15pass')

In [90]:
LDAmodel1 = load('./EnsembleLDAmodel1_16models_20topics_15pass')
LDAmodel2 = load('./EnsembleLDAmodel2_16models_20topics_15pass')

In [13]:
LDAmodel.generate_gensim_representation()

In [93]:
PctDF, TopicIDs, TopicIDDict = ProcessModel(LDAmodel1.generate_gensim_representation(), corpus)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [17]:
PctDF2, TopicIDs2, TopicIDDict2 = ProcessModel(LDAmodel2.generate_gensim_representation(), corpus)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [18]:
comparisonDF = pd.DataFrame(CompareModelAssignments([aval[0] for aval in TopicIDs], [bval[0] for bval in TopicIDs2]))
comparisonDF.columns = [str(col) for col in comparisonDF.columns]
comparisonDF.index = [str(idx) for idx in comparisonDF.index]

  0%|          | 0/8 [00:00<?, ?it/s]

In [19]:
import plotly.express as px
fig = px.imshow(comparisonDF, color_continuous_scale='Portland')
fig.show()

In [21]:
PrintTopics(LDAmodel1.generate_gensim_representation(), TopicIDDict)

 TopicID    Count   Text
 -------   -------  -------
    7       10021   get know think life want people tell day look family
    0        4741   game player club play win team season first goal last
    3        4600   attack military force government group kill country people security official
    6        2825   mr government party uk people labour leader minister country election
    4        2348   company pay business money bank price cost cent market new
    5        2137   health study people patient use find dr woman risk drug
    1        1850   president obama state trump republican campaign vote american election white_house
    2        1478   use user facebook company online post new people phone apple


In [22]:
PrintTopics(LDAmodel2.generate_gensim_representation(), TopicIDDict2)

 TopicID    Count   Text
 -------   -------  -------
    6        4562   police officer man car tell find kill old report arrest
    8        3978   show get film know star people think see want write
    10       3630   family child mr tell mother old home day leave life
    3        3114   government attack military force country group people kill official security
    0        2645   club player game play season team goal win last first
    11       2528   mr government uk party people council labour bbc last work
    1        2050   game team win world first race sport play second last
    9        2035   company pay business new money bank cost price work market
    7        1953   health patient study people use doctor dr find risk drug
    5        1543   use user facebook company new online people post phone apple
    4        1516   president obama state trump republican vote campaign american election white_house
    2        446    win right goal half minute free_kick leave 

## Run BERTopic

In [None]:
stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'say', 'says', 'year', 'years', 'new', 'mr'])
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=stop_words)

BERT_ALL_2 = BERTopic(
                  vectorizer_model=vectorizer_model,
                  calculate_probabilities=False,
                  verbose=True,
                  low_memory=True,
                  min_topic_size=150
                  )

# BERT_ALL.get_params()
# BERT_ALL_1.hdbscan_model.min_cluster_size=156
# BERT_ALL_1.hdbscan_model.min_sample_size=int(156 * .25)

BERT_ALL_2.fit_transform(NewsDF['Content'])


In [15]:
BERT_ALL_1.save('./BERT_ALL_1')

In [17]:
BERT_ALL_2.save('./BERT_ALL_2')

In [19]:
BERT_ALL_1.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,24939,0_people_police_told_time
1,1,3631,1_league_club_season_game
2,-1,427,-1_yn_fight_pistorius_mayweather
3,2,393,2_gold_olympic_world_games
4,3,275,3_murray_tennis_open_wimbledon
5,4,178,4_hamilton_race_rosberg_mercedes
6,5,157,5_golf_mcilroy_open_woods


In [18]:
BERT_ALL_2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,24947,0_people_police_told_time
1,1,3683,1_league_club_game_season
2,2,385,2_gold_olympic_world_games
3,-1,378,-1_yn_fight_pistorius_mayweather
4,3,273,3_murray_tennis_wimbledon_open
5,4,176,4_hamilton_race_rosberg_mercedes
6,5,158,5_golf_mcilroy_open_woods


In [97]:
pd.Series([aval[0] for aval in TopicIDs]).value_counts()

7    10022
0     4741
3     4599
6     2825
4     2348
5     2137
1     1850
2     1478
dtype: int64

In [154]:
LDATopics = [aval[0] for aval in TopicIDs]
BERTopics = [bval+1 for bval in BERT_ALL_1.hdbscan_model.labels_]
comparisonDF = CompareModelAssignments(BERTopics, LDATopics)
comparisonDF.sum(axis=1)

  0%|          | 0/7 [00:00<?, ?it/s]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
dtype: float64

In [157]:
comparisonDF.index = [str(int(idx)-1) for idx in comparisonDF.index]

In [158]:
import plotly.express as px
fig = px.imshow(comparisonDF, color_continuous_scale='Portland')
fig.update_layout(
    xaxis={'side': 'top'}, 
)
fig.show()

In [149]:
b_model_s = pd.Series(LDATopics)
a_model_s = pd.Series(BERTopics)
idx_offset = 0

set_a = set(a_model_s.unique())
set_b = set(b_model_s.unique())
compare_matrix = np.zeros((len(set_a), len(set_b)))
for a_row in tqdm(set_a) :
  for b_col in set_b :
      v_dict = b_model_s[a_model_s==a_row].value_counts().to_dict() 
      divisor = b_model_s[a_model_s==a_row].shape[0]
      for key in v_dict.keys() :
        compare_matrix[a_row+idx_offset, key+idx_offset] = v_dict[key] / divisor
compDF = pd.DataFrame(compare_matrix)
compDF.columns=[str(val) for val in set_b]
compDF.index=[str(val) for val in set_a]
compDF.sum(axis=1)

  0%|          | 0/7 [00:00<?, ?it/s]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
dtype: float64

In [147]:
compDF

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.58548,0.002342,0.021077,0.023419,0.009368,0.002342,0.131148,0.224824
1,0.011388,0.07374,0.058423,0.183568,0.092225,0.085088,0.107703,0.387866
2,0.814607,0.011236,0.016854,0.000677,0.02809,0.022472,0.033708,0.073034
3,0.941338,0.001102,0.001102,0.002479,0.01019,0.001102,0.015698,0.02699
4,0.671756,0.007634,0.010178,0.005089,0.002545,0.015267,0.045802,0.24173
5,0.840764,0.006369,0.006369,0.950796,0.002124,0.020177,0.006369,0.140127
6,0.901818,0.009579,0.965177,0.001297,0.003636,0.009479,0.003636,0.090909


In [142]:
compDF.sum(axis=1)

0    1.000000
1    1.000000
2    1.000677
3    1.000000
4    1.000000
5    1.973097
6    1.985532
dtype: float64

In [148]:
a_row = 5
v_dict = b_model_s[a_model_s==a_row].value_counts().to_dict() 
divisor = b_model_s[a_model_s==a_row].shape[0]
print(a_row)
tot = 0
for key in v_dict.keys() :
  print(key, v_dict[key] / divisor)


5
0 0.8407643312101911
7 0.14012738853503184
6 0.006369426751592357
2 0.006369426751592357
1 0.006369426751592357


In [144]:
a_row=5
a_model_s[a_model_s==a_row]

328      5
344      5
405      5
477      5
532      5
        ..
29263    5
29623    5
29660    5
29959    5
29962    5
Length: 157, dtype: int64