In [None]:
#import packages
import os
import re

import contractions
import hdbscan
import nltk
import numpy as np
import pandas as pd
import umap
from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

from utils import *

# ElasticSearch connection settings (where articles get stored).
# Each value can be overridden with an environment variable of the same name,
# so the notebook can target another cluster without editing code; the
# defaults are the host and port that were previously hard-coded here.
ESHOST = os.environ.get("ESHOST", "ec2-54-90-163-248.compute-1.amazonaws.com")
ESPORT = int(os.environ.get("ESPORT", "9200"))  # environment values are strings

# Open the connection through the project helper (star-imported from utils).
client = ESConnect(host=ESHOST, port=ESPORT)

# get all titles from the ES dataset
titles = getTitles(client)


In [None]:
#load data

# Fetch one summary per title. Each entry of `titles` is indexed by 'label';
# getSummary presumably returns the summary text as a str -- TODO confirm it
# never returns None, because the regex steps below require a string.
sum_list = [getSummary(client, title['label']) for title in titles]
df = pd.DataFrame(sum_list, columns=['text'])

# Build the shared resources once instead of once per word/document.
stop_words = stopwords.words('english')
stop_word_set = frozenset(stop_words)  # constant-time membership tests
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalize one summary into a space-separated string of lemmas.

    Steps, in order:
      1. remove protocols (anything matching ``http\\S+``)
      2. expand contractions
      3. convert to lowercase
      4. remove non-letters
      5. remove stopwords and words shorter than 3 characters
      6. lemmatize

    Contractions are expanded BEFORE non-letters are removed: stripping the
    apostrophe first turns "don't" into "don t", which contractions.fix can
    no longer recognize.
    """
    text = re.sub(r"http\S+", "", text)
    text = contractions.fix(text)
    # Lowercase after expansion so any capitalized replacement is normalized too.
    text = text.lower()
    words = re.sub("[^a-zA-Z]+", " ", text).split()
    kept = [w for w in words if w not in stop_word_set and len(w) >= 3]
    return ' '.join(lemmatizer.lemmatize(w) for w in kept)


df.text = df.text.apply(clean_text)

docs = df.text.to_list()
# Show only a preview; `docs` itself still holds every cleaned document.
docs[:5]

In [None]:
# create model

# Fixed seed for UMAP. Without it the dimensionality reduction is stochastic,
# so clusters and topic ids change from run to run and any hard-coded topic id
# used later in the notebook would point at a different topic each time.
# NOTE: a fixed random_state may make UMAP slower because it can limit
# parallelism.
UMAP_RANDOM_STATE = 42

# Load sentence transformer model
sentence_model = SentenceTransformer("all-distilroberta-v1")

# Create documents embeddings
embeddings = sentence_model.encode(docs, show_progress_bar=True)

# Define UMAP model to reduce embeddings dimension
umap_model = umap.UMAP(n_neighbors=8,
                       n_components=5,
                       min_dist=0.0,
                       metric='cosine',
                       low_memory=False,
                       random_state=UMAP_RANDOM_STATE)

# Define HDBSCAN model to perform documents clustering
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=3,
                                min_samples=1,
                                metric='euclidean',
                                cluster_selection_method='eom',
                                prediction_data=True)

# Create BERTopic model.
# embedding_model is the same model that produced `embeddings`, so any later
# call that has to embed text itself uses a matching model rather than
# BERTopic's default one.
topic_model = BERTopic(embedding_model=sentence_model,
                       top_n_words=8,
                       n_gram_range=(1, 3),
                       calculate_probabilities=True,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       verbose=True)

# Train model, extract topics and probabilities.
# The precomputed embeddings are passed in so the documents are not re-encoded.
topics, probabilities = topic_model.fit_transform(docs, embeddings)



In [None]:
# Build the topic visualization for the fitted model and show it as the cell output.
topics_figure = topic_model.visualize_topics()
topics_figure

In [None]:
# Build the bar-chart visualization (top_n_topics=30) and show it as the cell output.
barchart_figure = topic_model.visualize_barchart(top_n_topics=30)
barchart_figure


In [None]:
# get docs from topic

# Pair every document with the topic id that fit_transform assigned to it.
df = pd.DataFrame(dict(topic=topics, document=docs))

# Keep only the rows assigned to topic 18 and display them.
docs_in_topic = df.loc[df["topic"] == 18]
docs_in_topic

In [None]:
# Fetch the fitted model's topic info and show it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info