In [11]:
import numpy as np
import pandas as pd

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garethsmith/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Directory that holds the cleaned dataset and its pre-computed embeddings.
READ_DIR = "../data/processed"

# Base name (without extension) shared by the cleaned CSV and the embedding files.
FILENAME = "twitter_airline_sentiment_cleaned_emoji_urls_html_symbols@#_quotes_currency_whitespace"

# Embedding files are named "<FILENAME>_<embedding model>.npy"; deriving them from
# FILENAME keeps the three names in sync when the cleaning pipeline is renamed.
EMBEDDING_MPNET = f"{FILENAME}_all-mpnet-base-v2.npy"

EMBEDDING_TWHINBERT = f"{FILENAME}_twhin-bert-base.npy"

In [8]:
# Read the two pre-computed embedding arrays (.npy files) from READ_DIR.
mpnet_path = f"{READ_DIR}/{EMBEDDING_MPNET}"
twhinbert_path = f"{READ_DIR}/{EMBEDDING_TWHINBERT}"

embeddings_mpnet = np.load(mpnet_path)
embeddings_twhinbert = np.load(twhinbert_path)

In [9]:
# Read the cleaned tweets CSV that sits next to the embedding files.
csv_path = f"{READ_DIR}/{FILENAME}.csv"
df = pd.read_csv(csv_path)

In [12]:
# ---- Hyper-parameters -------------------------------------------------------
# UMAP (dimensionality reduction)
n_neighbors = 15
n_components = 5
min_dist = 0.1
metric_umap = "cosine"
random_state = 0  # fixed seed handed to UMAP so the reduction is repeatable

# HDBSCAN (clustering of the reduced embeddings)
min_cluster_size = 15
min_samples = 5
metric_hdbscan = "euclidean"
cluster_selection_method = "eom"

# ---- Models -----------------------------------------------------------------
# UMAP model
reducer = UMAP(
    n_neighbors=n_neighbors,
    n_components=n_components,
    min_dist=min_dist,
    metric=metric_umap,
    random_state=random_state,
)

# HDBSCAN model; prediction_data=True is requested explicitly at construction.
clusterer = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric=metric_hdbscan,
    cluster_selection_method=cluster_selection_method,
    prediction_data=True,
)

In [None]:
# English stop words from NLTK and a tweet-aware tokenizer for the vectorizer.
stop_words = stopwords.words("english")
tokenizer = TweetTokenizer().tokenize

# The lower and upper boundary of the range of n-values for the word n-grams
# to be extracted (here: unigrams and bigrams).
ngram_range = (1, 2)

# Convert a collection of text documents to a matrix of token counts.
# token_pattern=None because a custom tokenizer is supplied instead of the
# default regex pattern.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    tokenizer=tokenizer,
    token_pattern=None,
    ngram_range=ngram_range,
)

# Configure BERTopic with the custom vectorizer, UMAP reducer and HDBSCAN clusterer.
topic_model = BERTopic(
    vectorizer_model=vectorizer, umap_model=reducer, hdbscan_model=clusterer
)

# Fit on the pre-computed all-mpnet-base-v2 embeddings loaded earlier in the
# notebook. The name `embeddings` is never defined, so it must not be used here.
topics, probs = topic_model.fit_transform(
    list(df.clean_text), embeddings=embeddings_mpnet
)

In [None]:
def get_bertmodel(ngram_range, umap_model, hdbscan_model):
    """Build an unfitted BERTopic model with a tweet-aware count vectorizer.

    Args:
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        umap_model: Model passed to BERTopic as ``umap_model``.
        hdbscan_model: Model passed to BERTopic as ``hdbscan_model``.

    Returns:
        A ``BERTopic`` instance; no fitting is performed here.
    """
    # Token-count vectorizer: NLTK English stop words, NLTK TweetTokenizer,
    # and no regex token pattern since a custom tokenizer is supplied.
    count_vectorizer = CountVectorizer(
        stop_words=stopwords.words("english"),
        tokenizer=TweetTokenizer().tokenize,
        token_pattern=None,
        ngram_range=ngram_range,
    )

    return BERTopic(
        vectorizer_model=count_vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
    )

def get_topics_probs(model, docs, embeddings):
    """Fit ``model`` on ``docs`` and return its ``(topics, probs)`` pair.

    Args:
        model: Object exposing ``fit_transform(docs, embeddings=...)`` that
            returns exactly two values.
        docs: Documents handed to ``fit_transform`` as the first argument.
        embeddings: Value forwarded as the ``embeddings`` keyword argument.

    Returns:
        A 2-tuple ``(topics, probs)`` as produced by ``model.fit_transform``.
    """
    assigned_topics, topic_probs = model.fit_transform(docs, embeddings=embeddings)
    return assigned_topics, topic_probs

In [None]:
# Build a BERTopic model on unigrams + bigrams using the UMAP reducer and
# HDBSCAN clusterer configured earlier in the notebook.
bertmodel = get_bertmodel(
    ngram_range=(1, 2),
    umap_model=reducer,
    hdbscan_model=clusterer,
)

# Pass the loaded embedding array (embeddings_mpnet), not EMBEDDING_MPNET,
# which is only the .npy file name. The result is captured in variables
# instead of being dumped in full as the cell output.
topics_mpnet, probs_mpnet = get_topics_probs(
    model=bertmodel,
    docs=list(df.clean_text),
    embeddings=embeddings_mpnet,
)