import random
from faker import Faker

# Shared Faker generator instance, created once at module level.
fake = Faker()

In [68]:
# Sample posts, grouped by topic so the expected clusters are easy to see.

# Near-paraphrases about COVID-19 / SARS-CoV-2 (two carry @mentions / #hashtags).
covid_texts = [
    "@testuser COVID-19, caused by the SARS-CoV-2 virus, is a highly contagious respiratory illness. #covid19",
    "The Coronavirus, or COVID-19, is a rapidly spreading respiratory disease caused by the SARS-CoV-2 virus.",
    "SARS-CoV-2, responsible for COVID-19, is a contagious respiratory infection with global impact.",
    "@lelele The SARS-CoV-2 virus is the causative agent of COVID-19, a highly contagious respiratory illness.",
    "COVID-19, caused by the SARS-CoV-2 virus, is a widespread respiratory disease affecting populations globally.",
]

# Near-paraphrases of one fact about the 45th US President.
presidency_texts = [
    "Donald Trump served as the 45th President of the United States from 2017 to 2021.",
    "In the years 2017 to 2021, Donald Trump held the position of the 45th President of the United States.",
    "During the term from 2017 to 2021, Donald Trump assumed the role of the 45th President of the United States.",
    "From 2017 to 2021, Donald Trump functioned as the 45th President of the United States.",
    "Donald Trump, having served from 2017 to 2021, was the 45th President of the United States.",
]

# Politics-themed sentences that share a topic but not a specific fact.
politics_texts = [
    "The political debate heated up as candidates clashed over policies and ideologies.",
    "In the corridors of power, politicians strategized to gain support for their proposed legislation.",
    "Amidst political turmoil, citizens voiced their concerns about the direction of the government.",
    "A diplomatic summit aimed to address global challenges and foster international cooperation.",
    "Campaign rallies echoed with passionate speeches as candidates rallied their supporters.",
]

# Unrelated one-off sentences.
unrelated_texts = [
    "The chef skillfully prepared a gourmet dish with a delicate balance of flavors and textures.",
    "The artist meticulously crafted a masterpiece, capturing the essence of emotion on canvas.",
    "The programmer wrote efficient and bug-free code, ensuring the seamless operation of the software.",
]

# Flat list in the same order as the groups above.
texts = [
    *covid_texts,
    *presidency_texts,
    *politics_texts,
    *unrelated_texts,
]


In [69]:

def clean_post_text(text: str) -> str:
    """Remove @mentions and #hashtags from a post.

    The text is split on any run of whitespace (spaces, tabs, newlines), so a
    mention or hashtag is dropped even when it follows a line break. Every
    token that starts with ``#`` or ``@`` is removed and the remaining tokens
    are re-joined with single spaces, which also trims leading/trailing
    whitespace and collapses repeated whitespace.

    Args:
        text: Raw post text.

    Returns:
        The post with mention/hashtag tokens removed. An empty or
        whitespace-only input yields an empty string.
    """
    return " ".join(
        word for word in text.split() if not word.startswith(("#", "@"))
    )

# Strip @mentions and #hashtags from every post, keeping the original order.
cleaned_texts = list(map(clean_post_text, texts))

print(cleaned_texts)

['COVID-19, caused by the SARS-CoV-2 virus, is a highly contagious respiratory illness.', 'The Coronavirus, or COVID-19, is a rapidly spreading respiratory disease caused by the SARS-CoV-2 virus.', 'SARS-CoV-2, responsible for COVID-19, is a contagious respiratory infection with global impact.', 'The SARS-CoV-2 virus is the causative agent of COVID-19, a highly contagious respiratory illness.', 'COVID-19, caused by the SARS-CoV-2 virus, is a widespread respiratory disease affecting populations globally.', 'Donald Trump served as the 45th President of the United States from 2017 to 2021.', 'In the years 2017 to 2021, Donald Trump held the position of the 45th President of the United States.', 'During the term from 2017 to 2021, Donald Trump assumed the role of the 45th President of the United States.', 'From 2017 to 2021, Donald Trump functioned as the 45th President of the United States.', 'Donald Trump, having served from 2017 to 2021, was the 45th President of the United States.', 'T

In [70]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-transformers checkpoint to load.
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Encode the cleaned posts; the printed result below is a 2-D float array.
embeddings = model.encode(cleaned_texts)

print(embeddings)

[[-0.02865948 -0.03838501  0.01128075 ... -0.0220021   0.01910506
  -0.04755442]
 [-0.03130977 -0.07609831  0.00800171 ... -0.0065812   0.00458079
  -0.03720687]
 [-0.00278808 -0.01850671  0.01206771 ... -0.0227583  -0.00080379
  -0.06140418]
 ...
 [ 0.00150428 -0.07479999 -0.02893125 ...  0.00681199  0.01476072
  -0.04364553]
 [-0.0031171  -0.00495308 -0.02398848 ...  0.04534915 -0.00230241
  -0.04557031]
 [-0.03499632 -0.02626318 -0.02109579 ... -0.01396483  0.06009208
  -0.047226  ]]


In [71]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.manifold import TSNE

# t-SNE is stochastic: pin the seed so the 2-D layout is reproducible across
# kernel restarts instead of changing on every run.
TSNE_SEED = 42

# Project the embeddings to two dimensions for plotting. The perplexity is
# kept very small because there are only a handful of posts.
xembeddings = TSNE(
    n_components=2, perplexity=2, random_state=TSNE_SEED
).fit_transform(embeddings)

# One row per post: 2-D coordinates plus the cleaned text for the hover label.
df_embeddings = pd.DataFrame(xembeddings, columns=['x', 'y'])
df_embeddings['text'] = cleaned_texts

# Points are coloured by their x coordinate (purely cosmetic); hovering a point
# shows the post it represents so neighbouring points can be inspected.
fig = px.scatter(
    df_embeddings, x='x', y='y',
    color='x',
    hover_name='text',
    title='Embedding visualization')
fig.show()

In [72]:
from scipy.cluster import hierarchy
import plotly.figure_factory as ff


#clusters = hierarchy.linkage(embeddings, "average", metric="cosine")
#flat_clusters = hierarchy.fcluster(clusters, threshold, criterion="distance")

from scipy.spatial.distance import pdist

# create_dendrogram first calls distfun on the observations and then passes the
# resulting pairwise distances to linkagefun. Supplying both explicitly keeps
# each callback working on its own argument (rather than reaching for the
# global `embeddings`) and avoids computing an unused default distance matrix.
# The tree is average linkage over cosine distances.
fig = ff.create_dendrogram(
    embeddings,
    distfun=lambda observations: pdist(observations, metric="cosine"),
    linkagefun=lambda distances: hierarchy.linkage(distances, "average"),
)

fig.update_layout(title = 'Hierarchical Clustering', xaxis_title='Texts',
                   yaxis_title='Cosine', width=700, height=700)

fig.show()


In [73]:
import json

# Distance cut-off at which the hierarchy is flattened into clusters.
threshold = 0.2

# Average-linkage tree over cosine distances, then cut at `threshold` to get
# one integer cluster label per text.
clusters = hierarchy.linkage(embeddings, method="average", metric="cosine")
flat_clusters = hierarchy.fcluster(clusters, t=threshold, criterion="distance")

num_of_clusters = len(set(flat_clusters))

# Collect the original (uncleaned) posts per cluster. Labels are treated as
# 1-based, so the group at list index i holds the posts labelled i + 1.
groups: list[list[str]] = [
    [post for post, label in zip(texts, flat_clusters) if label == cluster_id]
    for cluster_id in range(1, num_of_clusters + 1)
]

print(json.dumps(groups, indent=4))

[
    [
        "The chef skillfully prepared a gourmet dish with a delicate balance of flavors and textures."
    ],
    [
        "The artist meticulously crafted a masterpiece, capturing the essence of emotion on canvas."
    ],
    [
        "The programmer wrote efficient and bug-free code, ensuring the seamless operation of the software."
    ],
    [
        "@testuser COVID-19, caused by the SARS-CoV-2 virus, is a highly contagious respiratory illness. #covid19",
        "The Coronavirus, or COVID-19, is a rapidly spreading respiratory disease caused by the SARS-CoV-2 virus.",
        "SARS-CoV-2, responsible for COVID-19, is a contagious respiratory infection with global impact.",
        "@lelele The SARS-CoV-2 virus is the causative agent of COVID-19, a highly contagious respiratory illness.",
        "COVID-19, caused by the SARS-CoV-2 virus, is a widespread respiratory disease affecting populations globally."
    ],
    [
        "Donald Trump served as the 45th President 