In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from tqdm.auto import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.decomposition import PCA

# Minimal Cleaning


In [2]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\topic_modeling.parquet")

In [3]:
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r"@user", "", text)

    # Remove URLs
    text = re.sub(r"http", "", text)

    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    # Remove leading whitespaces
    text = re.sub(r"^\s+", "", text)

    # Replace &amp; with and
    text = re.sub(r"&amp;", "and", text)

    return text

In [4]:
df["clean"] = df["cleanedContent"].apply(clean_tweet)

In [7]:
df = df[["id", "category", "clean"]]

In [8]:
df.to_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

# Precompute Embeddings


In [2]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

In [3]:
text = df["clean"].to_list()

In [4]:
sentence_model = SentenceTransformer("all-MiniLM-L12-v2", device="cuda")

In [5]:
embeddings = sentence_model.encode(text, show_progress_bar=True)

Batches:   0%|          | 0/146440 [00:00<?, ?it/s]

In [6]:
np.save("embeddings.npy", embeddings)

# Rescaling embeddings with PCA


In [2]:
embeddings = np.load(r"C:\Users\fashaikh\Desktop\Thesis_main\embeddings.npy")

In [3]:
def rescale(x, inplace=False):
    """Rescale an embedding so optimization will not have convergence issues."""
    if not inplace:
        x = np.array(x, copy=True)

    x /= np.std(x[:, 0]) * 10000

    return x

In [4]:
# Initialize and rescale PCA embeddings
pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))

In [5]:
np.save("pca_embeddings_5.npy", pca_embeddings)

# Topic Modeling


In [2]:
# pca_embeddings = np.load("pca_embeddings.npy")
embeddings = np.load(r"C:\Users\fashaikh\Desktop\Thesis_main\embeddings.npy")

Base Topic Model


In [7]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=50)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

# Start UMAP from PCA embeddings
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    unique=True,
    init=pca_embeddings,
    random_state=42,
)

topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    low_memory=True,
    verbose=True,
    calculate_probabilities=False,
)

In [8]:
topics, _ = topic_model.fit_transform(df["clean"])

Batches:   0%|          | 0/146440 [00:00<?, ?it/s]

2023-07-09 14:03:39,897 - BERTopic - Transformed documents to Embeddings


In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_document_info(text)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.save(
    "path/to/my/model_dir",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=sentence_model,
)

# Manual Topic Modeling

In [28]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

In [29]:
category_mapping = {
    0: "Big Cities",
    1: "Exurbs",
    2: "Rural Middle America",
    3: "Evangelical Hubs",
    4: "Graying America",
    5: "Middle Suburbs",
    6: "College Towns",
    7: "Urban Suburbs",
    8: "Hispanic Centers",
    9: "Working Class Country",
    10: "African American South",
    11: "Aging Farmlands",
    12: "Military Posts",
    13: "Native American Lands",
    14: "LDS Enclaves",
}
df['target'] = df['category'].map({v: k for k, v in category_mapping.items()})

In [30]:
docs = df['clean'].to_list()
y = df['target'].to_list()

In [31]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

# Prepare our empty sub-models and reduce frequent words while we are at it.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model
)
topics, probs = topic_model.fit_transform(docs, y=y)


2023-07-09 16:50:57,049 - BERTopic - Transformed documents to Embeddings
2023-07-09 16:50:57,050 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-07-09 16:50:57,050 - BERTopic - Reduced dimensionality
2023-07-09 16:50:57,989 - BERTopic - Clustered reduced embeddings


In [32]:
# Assign original classes to our topics
df = topic_model.get_topic_info()
df["Class"] = df.Topic.map(category_mapping)
df


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Class
0,0,3004707,0_blacklivesmatter_blm_black_people,"[blacklivesmatter, blm, black, people, police,...",[hey District reassigned a middle school teach...,Big Cities
1,1,688950,1_blacklivesmatter_people_black_don,"[blacklivesmatter, people, black, don, justice...",[SAY THEIR NAMES! george floyd trayvon martin ...,Exurbs
2,2,275735,2_oh_blm_antifa_ny,"[oh, blm, antifa, ny, like, people, blm antifa...",[Hi could you please sign and share this petit...,Rural Middle America
3,3,140394,3_ok_al_blm_antifa,"[ok, al, blm, antifa, ok blm, ar, sharpton, al...",[Right. You people!?🤦🏽‍♀️ saying you don’t sup...,Evangelical Hubs
4,4,128348,4_ca_blm_id_antifa,"[ca, blm, id, antifa, like, people, just, don,...","[I don't vote R and I support equality, but yo...",Graying America
5,5,104128,5_blm_antifa_like_people,"[blm, antifa, like, people, don, tell, just, w...",[I support the protestors (NOT the rioters) wh...,Middle Suburbs
6,6,81498,6_ny_ar_blm_ms,"[ny, ar, blm, ms, antifa, ar 15, 15, tx, ky, a...",[Another little trump supporter with an AR 15 ...,College Towns
7,7,65176,7_hi_blm_tx_antifa,"[hi, blm, tx, antifa, blacklivesmatter hi, ant...",[Totalitarianism in all forms is Evil! All Fai...,Urban Suburbs
8,8,60391,8_ar_ar 15_15_blm,"[ar, ar 15, 15, blm, antifa, tx, antifa blm, w...",[Best if viewed on Desktop or to your home the...,Hispanic Centers
9,9,54507,9_oh_ar_ar 15_15,"[oh, ar, ar 15, 15, blm oh, blm, tn, oh right,...","[ATTN: suburban and urban women, children, and...",Working Class Country


# Topic Modeling per class

In [9]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")
df = df[df['category'] == 'African American South']
df = df.drop_duplicates(subset=['clean'])

In [10]:
docs = df['clean'].to_list()

In [11]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False, random_state=42)

hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', prediction_data=True)

topic_model = BERTopic(nr_topics='auto',
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True,
)

In [12]:
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/1166 [00:00<?, ?it/s]

2023-07-21 18:13:37,287 - BERTopic - Transformed documents to Embeddings
2023-07-21 18:14:16,143 - BERTopic - Reduced dimensionality
2023-07-21 18:14:22,057 - BERTopic - Clustered reduced embeddings
2023-07-21 18:14:38,549 - BERTopic - Reduced number of topics from 70 to 59


In [13]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19102,-1_blm protests_black lives_blm antifa_antifa blm,"[blm protests, black lives, blm antifa, antifa...",[African Americans Black Hollywood celebrities...
1,0,5190,0_ar15 blm_ar blm_blm protesters_protest ar15,"[ar15 blm, ar blm, blm protesters, protest ar1...",[#KyleRittenhouse just 15 days before he gunne...
2,1,2772,1_blm antifa_antifa blm_blm protests_blm marxist,"[blm antifa, antifa blm, blm protests, blm mar...",[BLM R DOMESTIC TERRORISTS!! THEY MUST b TREAT...
3,2,1889,2_blacklivesmatter protest_blacklivesmatter im...,"[blacklivesmatter protest, blacklivesmatter im...",[thread) been thinking about the amazing #Blac...
4,3,958,3_floyds funeral_floyd funeral_floyd blacklive...,"[floyds funeral, floyd funeral, floyd blackliv...","[We don't want no favors (America), just get y..."
5,4,947,4_blm shes_support blm_blm movement_blm ms,"[blm shes, support blm, blm movement, blm ms, ...","[Hello , pls think: 1. posted a serious commen..."
6,5,620,5_outrage blm_black trump_al blm_sharpton racist,"[outrage blm, black trump, al blm, sharpton ra...",[Where is BLM now that Bernell Trammell was mu...
7,6,569,6_blm protests_protest tn_protests tn_nashvill...,"[blm protests, protest tn, protests tn, nashvi...","[1) 1 The residents of Millersville, TN showed..."
8,7,551,7_oppression black_black assaulted_lie blacks_...,"[oppression black, black assaulted, lie blacks...",[Pretty rich of you $Ms lecturing NFL owner $B...
9,8,316,8_blm donations_donations blm_donated blm_dona...,"[blm donations, donations blm, donated blm, do...",[Go to Black Lives Matter website and click on...


In [57]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(docs, topics)

100%|██████████| 25/25 [00:06<00:00,  3.72it/s]


In [34]:
topic_model.get_topic(-1)

[('blm protest', 0.50741845),
 ('blm protests', 0.4843492),
 ('blm movement', 0.39731598),
 ('black lives', 0.39607164),
 ('blm antifa', 0.3852024),
 ('antifa blm', 0.3806836),
 ('blm', 0.35702345),
 ('blacks', 0.30573237),
 ('racism', 0.29835266),
 ('racist', 0.29817376)]

In [15]:
topic_model.visualize_barchart(topics=[3,7,5], title="Topics - African American South", width=350)


In [8]:
df_topic = topic_model.get_document_info(docs)

In [40]:
selected_df = df_topic[(df_topic['Topic'] == -1)]

In [41]:
docs = selected_df['Document'].to_list()

In [65]:
topic_model.visualize_topics()

In [20]:
hierarchical_topics = topic_model.hierarchical_topics(docs)


100%|██████████| 16/16 [00:01<00:00, 13.82it/s]


In [21]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)


In [94]:
tree = topic_model.get_topic_tree(hierarchical_topics)

In [95]:
print(tree)

.
├─racial gerrymandering_racial gerrymandering dont_racial gerrymandering dont deserve_voter suppressio
│    ├─■──suppression andor racial gerrymandering_states voter suppression_states voter suppression andor_raci ── Topic: 50
│    └─■──voter suppression states_voter suppression states dont_dont deserve athletes color_states dont deser ── Topic: 26
└─blm movement_blm ut_ut blm_blm_support blm
     ├─ut athletics_ut athletes_ut austin students_black students_black jerseys
     │    ├─black students_black student_racial justice_racism_blacklivesmatter ut
     │    │    ├─racistsexist government blm_black student_racistsexist government_black students_diversity community
     │    │    │    ├─■──raise awareness race racism_recently shared thoughts blacklivesmatter_received streams donated black ── Topic: 18
     │    │    │    └─■──project blm doing social_project blm doing_project blm_protests university supported black_black stu ── Topic: 20
     │    │    └─racial justice_austin ackn

In [37]:
topics_to_merge = [-1,0]
topic_model.merge_topics(docs, topics_to_merge)