In [None]:
# import libraries
# import transforms
# from transforms.api import Input, Output, transform
from bertopic import BERTopic
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
import ast
# from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType, DoubleType, FloatType
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
import spacy
from bertopic.vectorizers import ClassTfidfTransformer
# import en_core_web_sm
from sentence_transformers import SentenceTransformer
# from transforms.external.systems import use_external_systems, EgressPolicy, Credential, ExportControl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def getSentimentGroup(compound):
    """Map a compound sentiment score to a coarse sentiment label.

    Parameters
    ----------
    compound : float
        Compound polarity score, expected in the closed interval [-1, 1].

    Returns
    -------
    str or None
        One of 'Very Good', 'Good', 'Neutral Postive', 'Neutral Negative',
        'Bad' or 'Very Bad'. ``None`` is returned when ``compound`` is NaN or
        lies outside [-1, 1].
    """
    # NaN fails every comparison, so it is rejected here together with
    # out-of-range values.
    if not -1 <= compound <= 1:
        return None

    # The upper bound is inclusive: a score of exactly 1 used to match no
    # branch and silently produced None, while -1 was already handled.
    if compound >= 0.66:
        return 'Very Good'
    if compound >= 0.33:
        return 'Good'
    if compound >= 0:
        # Label spelling is kept unchanged; it is data that consumers may match on.
        return 'Neutral Postive'
    if compound >= -0.33:
        return 'Neutral Negative'
    if compound >= -0.66:
        return 'Bad'
    return 'Very Bad'
       
 
def getSentimentDict(sentence):
    """Return the polarity scores computed for ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    The object returned by ``SentimentIntensityAnalyzer.polarity_scores``
    (the full score mapping, not just the compound value).
    """
    # Build the analyzer once and reuse it: this function is applied to every
    # row, and constructing a new analyzer per call repeats the same setup.
    analyzer = getattr(getSentimentDict, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        getSentimentDict._analyzer = analyzer
    return analyzer.polarity_scores(sentence)


# params
# params = params.dataframe()
# params = params.collect()[0][:-1]
# [n_neighbors, _, min_cluster_size, _, _] = params
# n_neighbors = 15
# min_cluster_size = 15

# Hyperparameters for the pipeline components built further down.
n_neighbors = 15        # UMAP neighbourhood size
min_cluster_size = 15   # HDBSCAN minimum cluster size
min_topic_size = 15     # only referenced by the commented-out BERTopic(...) call
top_n_words = 10        # number of words for the PartOfSpeech representation
diversity = 0.9         # MaximalMarginalRelevance diversity

# Dimensionality reduction (seeded so repeated runs use the same random state).
umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=42,
)

# Clustering.
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Tokenisation and class-based TF-IDF weighting.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# NOTE(review): `nlp` is not referenced again in the visible cells; confirm
# whether this load is only meant as a check that the spaCy model is installed.
nlp = spacy.load("en_core_web_sm")

# Topic representation models, registered under the aspect names below.
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity)
pos = PartOfSpeech(top_n_words=top_n_words)

representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "pos": pos,
}

#Train the model
# model = BERTopic(
#   # Pipeline models
#   embedding_model = embedding_model,
#   umap_model=umap_model,
#   hdbscan_model=hdbscan_model,
#   vectorizer_model=vectorizer_model,
#   representation_model=representation_model,
#   # Hyperparameters
#   top_n_words = top_n_words,
#   min_topic_size = min_topic_size,
#   #nr_topics=75,
#   ctfidf_model=ctfidf_model,
#   verbose=True
# )


# Sentence encoder, pinned to the CPU.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")

# NOTE(review): vectorizer_model, ctfidf_model, top_n_words and min_topic_size
# are defined earlier but are not passed to this constructor (only the
# commented-out variant uses them) - confirm this is intentional.
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
)


# source_df = source_df.dataframe()
# # source_df = source_df.limit(2000)
# df = source_df.toPandas()

# Load the pre-embedded sentences; rows without an embedding are discarded.
# Change the filename to your .csv file name and location.
df = pd.read_csv('input/sentence_embedded.csv').dropna(subset=['embedding'])

# Embeddings are stored as stringified Python literals; parse them back.
df['embedding'] = df['embedding'].apply(ast.literal_eval)

docs = df.text.to_list()

# Stack the per-row vectors into a single float32 array.
embeddings = np.array(
    [np.array(vector).astype(np.float32) for vector in df['embedding']]
)

# Fit the topic model on the precomputed embeddings and keep the assignments.
topics, probs = model.fit_transform(docs, embeddings)
df["topic"] = topics
df["probability"] = probs

# Sentiment: full score mapping -> compound value -> coarse label.
df['sentiment_analysis'] = df['text'].apply(getSentimentDict)
df['sentiment_compound'] = df['sentiment_analysis'].apply(lambda scores: scores['compound'])
df['sentiment'] = df['sentiment_compound'].apply(getSentimentGroup)

In [None]:
from scipy.cluster import hierarchy as sch

def linkage_function(x):
    """Single-linkage clustering of ``x`` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Compute the hierarchy once, with the custom linkage. The earlier
# default-linkage call was overwritten immediately, so it only cost run time.
hierarchical_topics = model.hierarchical_topics(docs, linkage_function=linkage_function)

# Visualize the topic hierarchy
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
import os

# Default-linkage hierarchy; the CSV written here is the input that the
# categorize_topics* helpers read back.
hier_topics = model.hierarchical_topics(docs)

# to_csv does not create missing directories, so make sure the target exists
# (a fresh checkout may not have it yet).
os.makedirs("temp_files", exist_ok=True)
hier_topics.to_csv("temp_files/hier_topics.csv")

In [None]:
hier_topics = model.hierarchical_topics(docs)

# One output row per element of each row's `Topics` list. `explode` repeats
# the remaining columns for every element, replacing the manual loop that
# copied the whole row once per topic.
expanded_df = hier_topics.explode('Topics')

# Save the expanded DataFrame to a new CSV file
expanded_df.to_csv('temp_files/expanded_output.csv', index=False)


In [None]:
# Recompute the hierarchy and flatten it so that every element of a row's
# `Topics` value gets its own row.
hier_topics = model.hierarchical_topics(docs).explode('Topics')

# Persist the flattened frame without the index column.
hier_topics.to_csv('temp_files/expanded_output.csv', index=False)


In [None]:
# Build the topic tree from `hierarchical_topics` (the hierarchy computed in
# the visualisation cell) and print it.
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
def categorize_topics(file_path, output_path):
    """Flag hierarchy rows whose two children are both non-parents.

    Reads the CSV at ``file_path``, adds a ``category`` column and writes the
    result to ``output_path`` (without the index).

    ``category`` is ``'yes'`` when neither ``Child_Left_ID`` nor
    ``Child_Right_ID`` appears anywhere in the ``Parent_ID`` column, and
    ``'no'`` otherwise.

    Parameters
    ----------
    file_path : str
        CSV with ``Parent_ID``, ``Child_Left_ID`` and ``Child_Right_ID`` columns.
    output_path : str
        Destination CSV path.
    """
    df = pd.read_csv(file_path)

    # Missing IDs are dropped so that NaN can never count as a match.
    parent_ids = set(df['Parent_ID'].dropna())

    # Vectorised membership test instead of a Python-level pass over each row.
    child_is_parent = (
        df['Child_Left_ID'].isin(parent_ids)
        | df['Child_Right_ID'].isin(parent_ids)
    )
    df['category'] = (~child_is_parent).map({True: 'yes', False: 'no'})

    df.to_csv(output_path, index=False)
    print(f"File saved to: {output_path}")

# Input and output locations; adjust these to your own file paths.
categorize_topics(
    file_path="temp_files/hier_topics.csv",
    output_path="temp_files/hier_topics_with_categories.csv",
)


In [None]:
def categorize_topics_with_levels(file_path, output_path):
    """Flag three levels of the topic hierarchy, working upwards.

    Reads the CSV at ``file_path``, adds three ``'yes'``/``'no'`` columns and
    writes the result to ``output_path`` (without the index):

    * ``category``      - neither ``Child_Left_ID`` nor ``Child_Right_ID``
      appears in the ``Parent_ID`` column.
    * ``category_lvl2`` - one of the row's children is the ``Parent_ID`` of a
      ``category`` row.
    * ``category_lvl3`` - one of the row's children is the ``Parent_ID`` of a
      ``category_lvl2`` row.

    Parameters
    ----------
    file_path : str
        CSV with ``Parent_ID``, ``Child_Left_ID`` and ``Child_Right_ID`` columns.
    output_path : str
        Destination CSV path.
    """
    df = pd.read_csv(file_path)

    def has_child_in(ids):
        """Boolean mask of rows having at least one child ID in ``ids``."""
        return df['Child_Left_ID'].isin(ids) | df['Child_Right_ID'].isin(ids)

    def parents_of(mask):
        """Set of ``Parent_ID`` values of the rows selected by ``mask``."""
        # Missing IDs are dropped so that NaN can never count as a match.
        return set(df.loc[mask, 'Parent_ID'].dropna())

    def as_flag(mask):
        """Convert a boolean mask to the 'yes'/'no' labels used in the CSV."""
        return mask.map({True: 'yes', False: 'no'})

    # Each level is one vectorised membership test, replacing the row-wise
    # apply and the iterrows loops that rescanned the frame for every row.
    is_category = ~has_child_in(set(df['Parent_ID'].dropna()))
    is_lvl2 = has_child_in(parents_of(is_category))
    is_lvl3 = has_child_in(parents_of(is_lvl2))

    df['category'] = as_flag(is_category)
    df['category_lvl2'] = as_flag(is_lvl2)
    df['category_lvl3'] = as_flag(is_lvl3)

    df.to_csv(output_path, index=False)
    print(f"File saved to: {output_path}")

# Input and output locations; adjust these to your own file paths.
categorize_topics_with_levels(
    file_path='temp_files/hier_topics.csv',
    output_path='temp_files/hier_topics_with_categories3.csv',
)
