In [1]:
import re, pickle, os, torch, csv
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from bertopic import BERTopic
import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import statsmodels.api as sm
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics.pairwise import cosine_similarity
from scipy.interpolate import interp1d
from sklearn.utils.extmath import safe_sparse_dot

from cuml.preprocessing import Normalizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder that holds every artifact written by the topic-modeling run;
# the fitted model is loaded from this location further down.
output_dir = Path(
    '/mnt/scratch/ande2472/model_output/topic_modeling/0_to_264'
)

In [6]:
# Read the pre-cleaned documents back from their pickle file.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for this
# exact path -- confirm the file name/location before relying on clean_docs.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
clean_path = Path('/mnt/scratch/ande2472/data/0_topjournals/0to264_topjournals_cleans.pickle')
with clean_path.open("rb") as f:
    clean_docs = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/scratch/ande2472/data/0_topjournals/0to264_topjournals_cleans.pickle'

In [3]:
# Restore the BERTopic model stored at <output_dir>/generated_model.
topic_model = BERTopic.load(output_dir.joinpath('generated_model'))

In [4]:
# Summary table of the loaded model's topics, one row per topic. The recorded output
# shows the columns Topic, Count, Name, Representation, KeyBERT, MMR and
# Representative_Docs.
# Author's note: 667 topics (presumably counting the Topic == -1 row -- TODO confirm).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,622745,-1_cells_cell_patients_protein,"[cells, cell, patients, protein, expression, c...","[significant, survival, group, therapy, functi...","[cell, protein, cancer, gene, human, disease, ...","[phase ii study su5416 , small molecule vascul..."
1,0,13194,0_catalyzed_enantioselective_chiral_reaction,"[catalyzed, enantioselective, chiral, reaction...","[alkynes, olefins, cyclization, heterocycles, ...","[enantioselective, aryl, catalyst, reactions, ...",[highly regio- enantioselective synthesis n-su...
2,1,6220,1_et al_al_et_issue,"[et al, al, et, issue, elsevier inc, inc, inc ...","[molecular cell, developmental cell, metabolis...","[et al, elsevier inc, inc, rights reserved, ri...",[leave sister behind .. recent work published ...
3,2,5362,2_rabbits_virus_serum_animals,"[rabbits, virus, serum, animals, pneumococcus,...","[guinea pigs, guinea pig, agglutination, pneum...","[rabbits, virus, pneumococcus, per cent, bacil...",[degree dispersion bacillus factor infection r...
4,3,5312,3_visual_cortex_neurons_neural,"[visual, cortex, neurons, neural, sensory, cor...","[visual cortex, visual system, motor cortex, n...","[visual, cortex, neurons, cortical, task, visu...",[frontal cortex selects representations talker...
...,...,...,...,...,...,...,...
662,661,101,661_catalyst_ni_pd_catalyzed,"[catalyst, ni, pd, catalyzed, reaction, bond, ...","[dinuclear pd, alkoxide, hydrogenation, hydrid...","[catalyst, pd, reaction, catalytic, reductive ...",[mechanistic study improved ni precatalyst suz...
663,662,101,662_methylation_demethylation_chromatin_dna me...,"[methylation, demethylation, chromatin, dna me...","[dna methylome, dna methylation, methylation r...","[methylation, demethylation, chromatin, dna me...",[single-cell multi-omics sequencing mouse earl...
664,663,101,663_b7_cd28_ctla_nk,"[b7, cd28, ctla, nk, costimulatory, nk cells, ...","[costimulatory molecules, costimulatory molecu...","[cd28, costimulatory, nk cells, b7 b7, nkg2d, ...",[comparative analysis b7-1 b7-2 costimulatory ...
665,664,101,664_stroke_carotid_ischaemic_aspirin,"[stroke, carotid, ischaemic, aspirin, endarter...","[randomisation, non inferiority, revascularisa...","[stroke, stenting, randomised, ischaemic strok...",[carotid artery stenting compared endarterecto...


In [5]:
# Preview the per-topic (term, weight) lists for the first few topics only: dumping
# the full mapping prints hundreds of topics and buries the rest of the notebook.
# Use topic_model.get_topic(topic_id) to inspect any single topic in full.
N_PREVIEW_TOPICS = 5
dict(list(topic_model.get_topics().items())[:N_PREVIEW_TOPICS])

{-1: [('cells', 0.0013110234864890337),
  ('cell', 0.0012768745991260271),
  ('patients', 0.0012661012948828006),
  ('protein', 0.0010723240554863852),
  ('expression', 0.0010627801124749882),
  ('cancer', 0.0010184291737252943),
  ('gene', 0.0010087033712112123),
  ('human', 0.0009994589747171153),
  ('disease', 0.000989220981428524),
  ('dna', 0.0009832646830480881),
  ('associated', 0.0009781498238568948),
  ('study', 0.0009605585692257644),
  ('may', 0.0009429543580897462),
  ('two', 0.00094070572131812),
  ('also', 0.0009406947935524081),
  ('high', 0.0009266219571658358),
  ('specific', 0.0009198589647384121),
  ('activity', 0.0009193339968228444),
  ('using', 0.0009155131138680808),
  ('treatment', 0.0009148358332969366)],
 0: [('catalyzed', 0.016259045640393596),
  ('enantioselective', 0.010643851453741809),
  ('chiral', 0.010069455037776828),
  ('reaction', 0.009617634674689167),
  ('asymmetric', 0.008488189980715315),
  ('aryl', 0.007978158032115833),
  ('catalyst', 0.0071875