# Attempting to assign categories to topics using zero-shot classification

In [None]:
# import libraries
# import transforms
# from transforms.api import Input, Output, transform
from bertopic import BERTopic
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
import ast
# from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType, DoubleType, FloatType
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
import spacy
from bertopic.vectorizers import ClassTfidfTransformer
# import en_core_web_sm
from sentence_transformers import SentenceTransformer
# from transforms.external.systems import use_external_systems, EgressPolicy, Credential, ExportControl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
from bertopic.representation import ZeroShotClassification
from transformers import pipeline

In [None]:
def getSentimentGroup(compound):
    """Map a compound sentiment score to a coarse sentiment label.

    The score is expected to lie in the closed interval [-1, 1]. Each bucket
    is one third of a half-range wide and includes its lower bound; the top
    bucket also includes the upper bound 1 so that a maximal score is
    labelled instead of falling through.

    Args:
        compound: Numeric compound sentiment score.

    Returns:
        One of 'Very Good', 'Good', 'Neutral Postive', 'Neutral Negative',
        'Bad' or 'Very Bad'; ``None`` when the score is outside [-1, 1]
        (including NaN).
    """
    # Upper bound is inclusive: a score of exactly 1 is a valid maximum.
    if 0.66 <= compound <= 1:
        return 'Very Good'
    if 0.33 <= compound < 0.66:
        return 'Good'
    if 0 <= compound < 0.33:
        # NOTE(review): label spelling ('Postive') is kept unchanged because
        # existing consumers may match on this exact string.
        return 'Neutral Postive'
    if -0.33 <= compound < 0:
        return 'Neutral Negative'
    if -0.66 <= compound < -0.33:
        return 'Bad'
    if -1 <= compound < -0.66:
        return 'Very Bad'
    # Out-of-range or NaN input: no label.
    return None
       
 
# Lazily-created analyzer shared by every getSentimentDict call, so the
# analyzer is constructed once instead of once per sentence.
_SENTIMENT_ANALYZER = None


def getSentimentDict(sentence):
    """Return the VADER polarity scores for ``sentence``.

    Args:
        sentence: Text to score.

    Returns:
        The full mapping returned by
        ``SentimentIntensityAnalyzer.polarity_scores`` (not only the compound
        value); callers read the ``'compound'`` entry from it.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(sentence)


# ---- Hyperparameters --------------------------------------------------------
# These used to be read from an external params dataset; they are fixed here.
n_neighbors = 15
min_topic_size = 15
min_cluster_size = 15
top_n_words = 10
diversity = 0.9

# ---- Dimensionality reduction -----------------------------------------------
# random_state is pinned so repeated runs give the same projection.
umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=42,
)

# ---- Clustering and tokenisation --------------------------------------------
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

# ---- Topic representation models --------------------------------------------
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=diversity)

# Loading the spaCy pipeline here fails fast if "en_core_web_sm" is missing.
nlp = spacy.load("en_core_web_sm")
pos = PartOfSpeech(top_n_words=top_n_words)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Every representation is computed side by side under its own key.
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "pos": pos,
}

# Build the topic model.

# Sentence-embedding model handed to BERTopic. device=None lets
# sentence-transformers pick a GPU when one is available and fall back to CPU
# otherwise, instead of failing on hosts without CUDA.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=None)

# Alternative (disabled): label topics directly with a zero-shot
# representation, e.g.
#   representation_model = ZeroShotClassification(
#       ["hotel", "staff", "location"], model="facebook/bart-large-mnli")

# NOTE(review): vectorizer_model, ctfidf_model, top_n_words and min_topic_size
# are defined earlier but are not passed to BERTopic here, so BERTopic uses its
# defaults for them. An earlier version of this call passed them - confirm
# whether dropping them was intentional.
model = BERTopic(
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    embedding_model=embedding_model,
)


# Load the sentences together with their precomputed embeddings.
df = pd.read_csv('input/sentence_embedded.csv') # change the filename to your .csv file name and location

# A row is only usable when both fields are present: a missing text would be
# handed to the topic model and the sentiment analyzer as a float NaN.
df = df.dropna(subset=['embedding', 'text'])

# The embedding column is parsed from its literal text form (literal_eval only
# evaluates literals, it does not execute code).
df['embedding'] = df['embedding'].apply(ast.literal_eval)

docs = df['text'].to_list()
# One float32 row per document, in the same order as docs.
embeddings = np.asarray(df['embedding'].to_list(), dtype=np.float32)

# Embeddings are supplied explicitly, so they are not recomputed during fit.
topics, probs = model.fit_transform(docs, embeddings)
df["topic"] = topics
df["probability"] = probs

# Sentiment: full score dict -> compound score -> coarse label.
df['sentiment_analysis'] = df['text'].apply(getSentimentDict)
df['sentiment_compound'] = df['sentiment_analysis'].apply(lambda scores: scores['compound'])
df['sentiment'] = df['sentiment_compound'].apply(getSentimentGroup)

MODEL: facebook/bart-large-mnli

In [None]:
# Table whose Child_Left_Name / Child_Right_Name columns will be categorised.
df = pd.read_csv('temp_files/hier_topics.csv') # change the filename to your .csv file name and location

# Candidate labels offered to the zero-shot classifier; change these to the
# desired categories. Spelling matters: the label text itself is what the
# model compares each document against, and it is written to the output file.
candidate_topics = [
    "Parking and Transportation",
    "Special Occasions and Events",
    "Value and Price",
    "Room and Amenities",
    "Cleanliness and Maintenance",
    "Facilities and Activities",
    "Technology and Internet",
    "Location",
    "Service and Staff",
    "Food and Beverage",
]

zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify_document(text):
    """Return the top candidate topic chosen by the zero-shot classifier.

    Args:
        text: Text to classify against ``candidate_topics``.

    Returns:
        The first entry of the classifier result's ``'labels'`` list.
    """
    prediction = zero_shot_classifier(text, candidate_topics)
    ranked_labels = prediction['labels']
    return ranked_labels[0]

# Classify the left and right child names; adjust the column names here if the
# input table uses different ones.
for name_col, category_col in (
    ('Child_Left_Name', 'Child_Left_Category'),
    ('Child_Right_Name', 'Child_Right_Category'),
):
    df[category_col] = df[name_col].apply(classify_document)

# Persist the table with the two new category columns (no index column).
df.to_csv("temp_files/clusters.csv", index=False)

print("Updated CSV file saved to temp_files/clusters.csv")

MODEL: tasksource/deberta-small-long-nli

In [None]:
# Table whose Child_Left_Name / Child_Right_Name columns will be categorised.
df = pd.read_csv('temp_files/hier_topics.csv') # change the filename to your .csv file name and location

# Candidate labels offered to the zero-shot classifier; change these to the
# desired categories. Spelling matters: the label text itself is what the
# model compares each document against, and it is written to the output file.
candidate_topics = [
    "Parking and Transportation",
    "Special Occasions and Events",
    "Value and Price",
    "Room and Amenities",
    "Cleanliness and Maintenance",
    "Facilities and Activities",
    "Technology and Internet",
    "Location",
    "Service and Staff",
    "Food and Beverage",
]

zero_shot_classifier = pipeline("zero-shot-classification", model="tasksource/deberta-small-long-nli")

def classify_document(text):
    """Return the top candidate topic chosen by the zero-shot classifier.

    Args:
        text: Text to classify against ``candidate_topics``.

    Returns:
        The first entry of the classifier result's ``'labels'`` list.
    """
    prediction = zero_shot_classifier(text, candidate_topics)
    ranked_labels = prediction['labels']
    return ranked_labels[0]

# Classify the left and right child names; adjust the column names here if the
# input table uses different ones.
for name_col, category_col in (
    ('Child_Left_Name', 'Child_Left_Category'),
    ('Child_Right_Name', 'Child_Right_Category'),
):
    df[category_col] = df[name_col].apply(classify_document)

# Persist the table with the two new category columns (no index column).
df.to_csv("temp_files/clusters2.csv", index=False)

print("Updated CSV file saved to temp_files/clusters2.csv")