In [11]:
import numpy as np
import pandas as pd

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garethsmith/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
# Directory holding the cleaned tweets and their pre-computed embeddings.
READ_DIR = "../data/processed"

# Base name (no extension) of the cleaned dataset; the embedding files share
# this stem, so they are derived from it rather than repeated by hand.
FILENAME = "twitter_airline_sentiment_cleaned_emoji_urls_html_symbols@#_quotes_currency_whitespace"

# Embedding arrays saved per sentence-embedding model: "<FILENAME>_<model>.npy".
EMBEDDING_MPNET = f"{FILENAME}_all-mpnet-base-v2.npy"

EMBEDDING_TWHINBERT = f"{FILENAME}_twhin-bert-base.npy"

In [20]:
# Read the two saved embedding arrays (one .npy file per embedding model)
# from READ_DIR in a single pass.
embeddings_mpnet, embeddings_twhinbert = (
    np.load(f"{READ_DIR}/{npy_name}")
    for npy_name in (EMBEDDING_MPNET, EMBEDDING_TWHINBERT)
)

In [9]:
# Read the cleaned dataset ("<READ_DIR>/<FILENAME>.csv") into a DataFrame.
df = pd.read_csv("{}/{}.csv".format(READ_DIR, FILENAME))

In [13]:
# --- UMAP (dimensionality reduction) settings ---
n_neighbors, n_components, min_dist = 15, 5, 0.1
metric_umap = "cosine"
random_state = 0  # fixed seed handed to UMAP

# Reducer built from the settings above
reducer = UMAP(
    metric=metric_umap,
    n_neighbors=n_neighbors,
    n_components=n_components,
    min_dist=min_dist,
    random_state=random_state,
)

# --- HDBSCAN (clustering) settings ---
min_cluster_size, min_samples = 15, 5
metric_hdbscan = "euclidean"
cluster_selection_method = "eom"

# Clusterer built from the settings above; prediction_data=True asks HDBSCAN
# to keep the extra data it needs for predicting on new points (see hdbscan docs).
clusterer = HDBSCAN(
    metric=metric_hdbscan,
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    cluster_selection_method=cluster_selection_method,
    prediction_data=True,
)

In [14]:
def get_bertmodel(ngram_range, umap_model, hdbscan_model):
    """Construct a BERTopic model with a tweet-aware count vectorizer.

    The model is only constructed here; no fitting is performed.

    Args:
        ngram_range: (min_n, max_n) tuple forwarded to ``CountVectorizer``.
        umap_model: Dimensionality-reduction model forwarded to ``BERTopic``.
        hdbscan_model: Clustering model forwarded to ``BERTopic``.

    Returns:
        The configured ``BERTopic`` instance.
    """
    english_stop_words = stopwords.words("english")
    tweet_tokenizer = TweetTokenizer()

    # Bag-of-words vectorizer used for the topic representations. Tokenising
    # is delegated to NLTK's TweetTokenizer, so the default regex token
    # pattern is switched off explicitly.
    count_vectorizer = CountVectorizer(
        ngram_range=ngram_range,
        stop_words=english_stop_words,
        tokenizer=tweet_tokenizer.tokenize,
        token_pattern=None,
    )

    return BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=count_vectorizer,
    )

def get_topics_probs(model, docs, embeddings):
    """Fit ``model`` on ``docs`` with pre-computed ``embeddings``.

    Args:
        model: Object exposing ``fit_transform(docs, embeddings=...)``.
        docs: Documents passed positionally to ``fit_transform``.
        embeddings: Passed to ``fit_transform`` as the ``embeddings`` keyword.

    Returns:
        A ``(topics, probs)`` tuple unpacked from the ``fit_transform`` result.
    """
    assigned_topics, topic_probs = model.fit_transform(docs, embeddings=embeddings)
    return (assigned_topics, topic_probs)

In [21]:
# (min_n, max_n) bounds of the n-gram sizes extracted by the vectorizer;
# (1, 2) means unigrams and bigrams.
ngram_range = (1, 2)

# Build the topic model on top of the shared UMAP reducer and HDBSCAN clusterer.
bertmodel = get_bertmodel(
    ngram_range=ngram_range,
    umap_model=reducer,
    hdbscan_model=clusterer,
)

# Fit on the cleaned tweets using the pre-computed mpnet embeddings.
topics_mpnet, probs_mpnet = get_topics_probs(
    model=bertmodel,
    docs=list(df["clean_text"]),
    embeddings=embeddings_mpnet,
)

In [28]:
def fit_bert(docs, embeddings, ngram_range, umap_model, hdbscan_model):
    """Build a BERTopic model and fit it on ``docs`` in one step.

    Args:
        docs: Documents to fit on.
        embeddings: Pre-computed document embeddings forwarded to the fit.
        ngram_range: (min_n, max_n) tuple forwarded to ``get_bertmodel``.
        umap_model: Dimensionality-reduction model to use for this fit.
        hdbscan_model: Clustering model to use for this fit.

    Returns:
        A ``(bertmodel, topics, probs)`` tuple: the fitted model together with
        the values returned by ``get_topics_probs``.
    """
    # Use the estimators supplied by the caller. The function previously
    # ignored these parameters and read the notebook-level `reducer` /
    # `clusterer` globals instead.
    # NOTE(review): the estimator objects are handed over as-is (not cloned);
    # confirm whether reusing one instance across several fits is intended.
    bertmodel = get_bertmodel(
        ngram_range=ngram_range,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
    )

    topics, probs = get_topics_probs(
        model=bertmodel,
        docs=docs,
        embeddings=embeddings,
    )

    return bertmodel, topics, probs


In [None]:
# Fit on the mpnet embeddings through fit_bert, which also returns the model.
bertmodel_mpnet, topics_mpnet, probs_mpnet = fit_bert(
    docs=list(df["clean_text"]),
    embeddings=embeddings_mpnet,
    ngram_range=ngram_range,
    umap_model=reducer,
    hdbscan_model=clusterer,
)

In [27]:
# One row per topic from the fitted model. The row count printed below
# includes the Topic == -1 row whenever the table contains one.
df_bertopics = bertmodel.get_topic_info()
print("Number of topics: {}".format(df_bertopics.shape[0]))

# Each topic's share of all documents, as a percentage rounded to 2 decimals.
df_bertopics["Percentage"] = (
    df_bertopics["Count"].div(df_bertopics["Count"].sum()).mul(100).round(2)
)
df_bertopics


Number of topics: 125


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Percentage
0,-1,7039,-1_flight_get_cancelled_thanks,"[flight, get, cancelled, thanks, plane, im, us...","[about time! Thank you!, too Late Flight now, ...",48.79
1,0,1061,0_bag_luggage_bags_baggage,"[bag, luggage, bags, baggage, lost, claim, lef...",[Really... .you charge me $25 to check a bag a...,7.35
2,1,362,1_hold_call_phone_minutes,"[hold, call, phone, minutes, hours, hung, hour...",[Was put on hold for 5.5 hrs then got a call b...,2.51
3,2,274,2_great_crew_great flight_attendant,"[great, crew, great flight, attendant, thanks,...","[Great flight, as always! Thank you to the gre...",1.90
4,3,184,3_hold_cancelled_cancelled flightled_flightled,"[hold, cancelled, cancelled flightled, flightl...",[been on hold for an hour. I need to rebook my...,1.28
...,...,...,...,...,...,...
120,119,15,119_apply_team job_job opening_recruiting,"[apply, team job, job opening, recruiting, isi...",[i hope i get the opportunity to join the team...,0.10
121,120,15,120_connection_phl_help delayed_miss,"[connection, phl, help delayed, miss, layover,...",[can you help me with a delayed flight as im g...,0.10
122,121,15,121_credit_purchased_refunded_receipt,"[credit, purchased, refunded, receipt, yr, dat...",[- He Cancelled Flightled a flight & was given...,0.10
123,122,15,122_worst experience_experience ever_worst_eve...,"[worst experience, experience ever, worst, eve...",[this has to be the absolute WORST EXPERIENCE ...,0.10


In [None]:
# Bar chart of the top terms for the 10 most frequent topics.
# Plots `bertmodel`, the fitted model whose topic table is shown above; the
# name used here before (`topic_model_bad`) is never defined in this notebook,
# so the cell failed with a NameError on a fresh kernel.
fig2 = bertmodel.visualize_barchart(top_n_topics=10)
fig2.show()