In [2]:
import pandas as pd
import numpy as np
import re
import pickle
from tqdm.auto import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.decomposition import PCA

# Minimal Cleaning


In [2]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\topic_modeling.parquet")

In [3]:
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r"@user", "", text)

    # Remove URLs
    text = re.sub(r"http", "", text)

    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    # Remove leading whitespaces
    text = re.sub(r"^\s+", "", text)

    # Replace &amp; with and
    text = re.sub(r"&amp;", "and", text)

    return text

In [4]:
df["clean"] = df["cleanedContent"].apply(clean_tweet)

In [7]:
df = df[["id", "category", "clean"]]

In [8]:
df.to_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

# Precompute Embeddings


In [2]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

In [3]:
df = df[df['category'] == 'Big Cities']
df = df.drop_duplicates(subset=['clean'])

In [6]:
docs = df['clean'].to_list()

In [12]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

In [13]:
embeddings = sentence_model.encode(df, show_progress_bar=True, convert_to_numpy=True)

Batches:   0%|          | 0/5699 [00:00<?, ?it/s]

In [14]:
np.save("exurbs_embeddings.npy", embeddings)

# Rescaling embeddings with PCA


In [2]:
embeddings = np.load(r"C:\Users\fashaikh\Desktop\Thesis_main\thesis\topic_modeling\big_cities_embeddings.npy")

In [3]:
def rescale(x, inplace=False):
    """Rescale an embedding so optimization will not have convergence issues."""
    if not inplace:
        x = np.array(x, copy=True)

    x /= np.std(x[:, 0]) * 10000

    return x

In [4]:
# Initialize and rescale PCA embeddings
pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))

In [5]:
np.save("pca_embeddings_big_cities.npy", pca_embeddings)

# Topic Modeling


In [2]:
# pca_embeddings = np.load("pca_embeddings.npy")
embeddings = np.load(r"C:\Users\fashaikh\Desktop\Thesis_main\embeddings.npy")

Base Topic Model


In [7]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=50)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

# Start UMAP from PCA embeddings
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    unique=True,
    init=pca_embeddings,
    random_state=42,
)

topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    low_memory=True,
    verbose=True,
    calculate_probabilities=False,
)

In [8]:
topics, _ = topic_model.fit_transform(df["clean"])

Batches:   0%|          | 0/146440 [00:00<?, ?it/s]

2023-07-09 14:03:39,897 - BERTopic - Transformed documents to Embeddings


In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_document_info(text)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.save(
    "path/to/my/model_dir",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=sentence_model,
)

# Manual Topic Modeling

In [28]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")

In [29]:
category_mapping = {
    0: "Big Cities",
    1: "Exurbs",
    2: "Rural Middle America",
    3: "Evangelical Hubs",
    4: "Graying America",
    5: "Middle Suburbs",
    6: "College Towns",
    7: "Urban Suburbs",
    8: "Hispanic Centers",
    9: "Working Class Country",
    10: "African American South",
    11: "Aging Farmlands",
    12: "Military Posts",
    13: "Native American Lands",
    14: "LDS Enclaves",
}
df['target'] = df['category'].map({v: k for k, v in category_mapping.items()})

In [30]:
docs = df['clean'].to_list()
y = df['target'].to_list()

In [31]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

# Prepare our empty sub-models and reduce frequent words while we are at it.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model
)
topics, probs = topic_model.fit_transform(docs, y=y)


2023-07-09 16:50:57,049 - BERTopic - Transformed documents to Embeddings
2023-07-09 16:50:57,050 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-07-09 16:50:57,050 - BERTopic - Reduced dimensionality
2023-07-09 16:50:57,989 - BERTopic - Clustered reduced embeddings


In [32]:
# Assign original classes to our topics
df = topic_model.get_topic_info()
df["Class"] = df.Topic.map(category_mapping)
df


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Class
0,0,3004707,0_blacklivesmatter_blm_black_people,"[blacklivesmatter, blm, black, people, police,...",[hey District reassigned a middle school teach...,Big Cities
1,1,688950,1_blacklivesmatter_people_black_don,"[blacklivesmatter, people, black, don, justice...",[SAY THEIR NAMES! george floyd trayvon martin ...,Exurbs
2,2,275735,2_oh_blm_antifa_ny,"[oh, blm, antifa, ny, like, people, blm antifa...",[Hi could you please sign and share this petit...,Rural Middle America
3,3,140394,3_ok_al_blm_antifa,"[ok, al, blm, antifa, ok blm, ar, sharpton, al...",[Right. You people!?🤦🏽‍♀️ saying you don’t sup...,Evangelical Hubs
4,4,128348,4_ca_blm_id_antifa,"[ca, blm, id, antifa, like, people, just, don,...","[I don't vote R and I support equality, but yo...",Graying America
5,5,104128,5_blm_antifa_like_people,"[blm, antifa, like, people, don, tell, just, w...",[I support the protestors (NOT the rioters) wh...,Middle Suburbs
6,6,81498,6_ny_ar_blm_ms,"[ny, ar, blm, ms, antifa, ar 15, 15, tx, ky, a...",[Another little trump supporter with an AR 15 ...,College Towns
7,7,65176,7_hi_blm_tx_antifa,"[hi, blm, tx, antifa, blacklivesmatter hi, ant...",[Totalitarianism in all forms is Evil! All Fai...,Urban Suburbs
8,8,60391,8_ar_ar 15_15_blm,"[ar, ar 15, 15, blm, antifa, tx, antifa blm, w...",[Best if viewed on Desktop or to your home the...,Hispanic Centers
9,9,54507,9_oh_ar_ar 15_15,"[oh, ar, ar 15, 15, blm oh, blm, tn, oh right,...","[ATTN: suburban and urban women, children, and...",Working Class Country


# Topic Modeling per class

In [165]:
df = pd.read_parquet(r"C:\Users\fashaikh\Desktop\Thesis_main\minimal_cleaned.parquet")
df.drop_duplicates(subset=['id'], inplace=True)
df = df[df['category'] == 'Exurbs']
df.drop_duplicates(subset=['clean'], inplace=True)

In [166]:
df = df['clean'].to_list()

In [167]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', random_state=42)

hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', prediction_data=True)

topic_model = BERTopic(nr_topics='auto',
                        top_n_words=5,
                        umap_model=umap_model,
                        hdbscan_model=hdbscan_model,
                        vectorizer_model=vectorizer_model,
                        ctfidf_model=ctfidf_model,
                        representation_model=representation_model,
                        verbose=True,
)

In [168]:
topics, probs = topic_model.fit_transform(df)

Batches:   0%|          | 0/5699 [00:00<?, ?it/s]

2023-08-13 18:58:38,994 - BERTopic - Transformed documents to Embeddings
2023-08-13 19:02:39,746 - BERTopic - Reduced dimensionality
2023-08-13 19:03:11,198 - BERTopic - Clustered reduced embeddings
2023-08-13 19:19:15,848 - BERTopic - Reduced number of topics from 2052 to 1515


In [169]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,100995,-1_blm protesters_blacklivesmatter blm_blm bla...,"[blm protesters, blacklivesmatter blm, blm bla...",[I’ve just been so fucking uncomfortable these...
1,0,3129,0_unfollow blm_unfollowed blm_blm unfollow_unf...,"[unfollow blm, unfollowed blm, blm unfollow, u...",[if u are unfollowing me bc i’m retweeting blm...
2,1,2132,1_floyd justice_floyd blacklivesmatter_floyd m...,"[floyd justice, floyd blacklivesmatter, floyd ...",[an innocent black cnn reporter gets arrested ...
3,2,1987,2_shes blm_blm shes_posted blm_shes protesting,"[shes blm, blm shes, posted blm, shes protesti...",[shes trying for the ‘i dnt have enough info r...
4,3,1967,3_tweeting blacklivesmatter_blacklivesmatter t...,"[tweeting blacklivesmatter, blacklivesmatter t...",[IMPORTANT‼️ Instead of tweeting #BlackLivesMa...
...,...,...,...,...,...
1510,1509,5,1509_petition blm_floyd justice_blm justice_fl...,"[petition blm, floyd justice, blm justice, flo...",[PLEASE SIGN THIS. GEORGE FLOYD AND HIS FAMILY...
1511,1510,5,1510_blacklivesmatter kids_black kids_youth bl...,"[blacklivesmatter kids, black kids, youth blac...","[These kids are supposed to enjoy childhood, c..."
1512,1511,5,1511_blm protest_protest blm_blm la_downtown p...,"[blm protest, protest blm, blm la, downtown pr...",[plz do NOT go protest for BLM in coral gables...
1513,1512,5,1512_blacklivesmatter protest_fontana blackliv...,"[blacklivesmatter protest, fontana blacklivesm...",[i just looked thru the comments i guess after...


In [11]:
topic_model.get_topic(28)

[('defund blm', 0.6660235),
 ('blm defund', 0.65281016),
 ('blm defunding', 0.63957137),
 ('defunded police', 0.6384733),
 ('police blm', 0.6242396)]

In [170]:
df_topic = topic_model.get_document_info(df)

In [46]:
topics_to_merge = [[64, 27]]
topic_model.merge_topics(df, topics_to_merge)

In [224]:
topic_model.set_topic_labels({89:'Living in Fear', 402:'Advocating for Change', 31:'Looting', 1022:'Callout for Peace', 1279:'Antipathy', 1471:'Trump Violence Claims', 1149:'White Privilege', 15:'Support for Victims', 917:'Opression'})

In [227]:
topic_model.visualize_barchart(topics=[15, 1429, 1471, 94, 1149, 1279, 917, 1022], title="Topics - Big Cities", width=450, custom_labels=True) # # 0,1,20,22,76, 78, 101 

In [81]:
topic_model.get_topics()

{-1: [('blm protests', 0.6739483),
  ('blm riots', 0.6669688),
  ('blm antifa', 0.620182),
  ('antifa blm', 0.60959405),
  ('support blm', 0.5876468)],
 0: [('blm riots', 0.69320124),
  ('blm protests', 0.67473924),
  ('antifa riots', 0.6060972),
  ('protests', 0.49380526),
  ('riots', 0.4884339)],
 1: [('shes blm', 0.59543407),
  ('blm shes', 0.5317078),
  ('blm stuff', 0.38399163),
  ('shes just', 0.37789968),
  ('elizabeth', 0.37305197)],
 2: [('maga', 0.5582672),
  ('maga republicans', 0.5346434),
  ('maga supporter', 0.51691717),
  ('tell maga', 0.5087316),
  ('ultra maga', 0.5019552)],
 3: [('blacklivesmatter freedom', 0.6173841),
  ('blacklivesmatter im', 0.61154586),
  ('blacklivesmatter people', 0.5839177),
  ('people blacklivesmatter', 0.56466174),
  ('blacklivesmatter hate', 0.56335133)],
 4: [('healer', 0.4842138),
  ('healers', 0.47501907),
  ('raids', 0.3960973),
  ('dungeons', 0.37069124),
  ('blm main', 0.37063685)],
 5: [('kanye blm', 0.65855724),
  ('blm kanye', 0.654

In [127]:
topic_model.visualize_topics()

In [77]:
hierarchical_topics = topic_model.hierarchical_topics(df)


100%|██████████| 156/156 [00:38<00:00,  4.04it/s]


In [None]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)


In [79]:
tree = topic_model.get_topic_tree(hierarchical_topics)

In [80]:
print(tree)

.
├─maga supporter_blm gay_blm lgbtq_maga_right wing
│    ├─■──attacker republican_attacked berkeley_republicans inferring_republican truth_sound republican ── Topic: 134
│    └─maga supporter_maga_like maga_blm gay_sound maga
│         ├─blm gay_blm lgbtq_lgbtq flag_flag blm_flags blm
│         │    ├─■──david depape_depape illegal_depape lives_depape_depape actually ── Topic: 32
│         │    └─maga supporter_maga_like maga_ultra maga_sound maga
│         │         ├─■──maga_maga republicans_maga supporter_tell maga_ultra maga ── Topic: 2
│         │         └─■──pelosis attacker_attacked pelosi_pelosi attacker_pelosi attack_pelosi attacked ── Topic: 6
│         └─■──paul attacked_pauls attacker_paul cheated_paul blm_paul lgbt ── Topic: 34
└─blm riots_black lives_riots_racism_blacklivesmatter
     ├─black privilege_white racist_theory racist_racist nazi_black crime
     │    ├─black privilege_white racist_black crime_privilege black_theory racist
     │    │    ├─black privilege_the