**In this notebook, we perform topic modeling on a dataset of English hotel reviews from New York City. The process involves several key steps:**

**Text Preprocessing: We begin by cleaning and preprocessing the text data, including tokenization, stopword removal, and lemmatization, to prepare it for analysis.**

**Topic Modeling: Using the BERTopic framework, we apply dimensionality reduction, clustering, and topic extraction techniques to identify distinct topics within the reviews. The model identifies themes such as overall hotel experience, service issues, room cleanliness, noise levels, and specific amenities.**

**Visualization: The notebook includes visualizations to explore the distribution of topics, the relationships between them, and the most representative words for each topic.**

**Topic Mapping and Analysis: We map the identified topics to human-readable labels for easier interpretation and analyze how these topics are distributed across the dataset. The labeled topics are then used to explore trends and patterns within the reviews, including filtering for reviews with negative sentiment.**

**The results are saved for subsequent analysis and interpretation, providing valuable insights into customer experiences and perceptions of hotels in New York City.**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [None]:
import torch

# Ask PyTorch's CUDA caching allocator to release any unused cached memory.
torch.cuda.empty_cache()

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Environment diagnostics, printed one per line:
#   1. CUDA version PyTorch is using
#   2. whether cuDNN is enabled
#   3. whether CUDA is available (True when everything is set up correctly)
for diagnostic in (
    torch.version.cuda,
    torch.backends.cudnn.enabled,
    torch.cuda.is_available(),
):
    print(diagnostic)


In [None]:
import warnings

# Suppress specific warnings. Each entry is (warning category, module regex);
# the filters are registered in the order listed.
for noisy_category, noisy_module in (
    (UserWarning, 'tqdm'),
    (FutureWarning, 'huggingface_hub'),
):
    warnings.filterwarnings('ignore', category=noisy_category, module=noisy_module)

In [None]:
import sys
# Print the interpreter version string (useful when recording the environment of a run).
print("Python version:", sys.version)


In [None]:
# Print the installed PyTorch version; relies on `torch` having been imported by an earlier cell.
print("PyTorch version:", torch.__version__)

## Preprocess the Text Data:

In [None]:
# Load the review dataset. Later cells read its 'name', 'cleaned_text' and
# 'sentiment' columns.
df = pd.read_csv('./data/eng_reviews.csv')

In [None]:
# Make sure every NLTK resource used by this notebook is installed.
# Each resource is checked on its own, so only the missing ones are downloaded.
#   - punkt / punkt_tab: tokenizer data for word_tokenize (recent NLTK
#     releases look up 'punkt_tab', older ones look up 'punkt')
#   - stopwords: the English stop-word list
#   - wordnet: the dictionary used by WordNetLemmatizer
required_nltk_resources = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}
for package_name, resource_path in required_nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

In [None]:
# Unique hotel names from the 'name' column.
names = df['name'].unique()

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Domain-specific stop words: location/brand terms and generic sentiment
# adjectives. Every entry needs its own trailing comma -- adjacent string
# literals are silently concatenated, which previously turned 'stay' and
# 'amazing' into the single useless entry 'stayamazing'.
custom_stop_words = {
    'hotel', 'hotels', 'stay', 'stayed', 'night', 'day',
    'timesquare', 'times', 'square', 'hilton', 'new', 'york', 'nyc',
    'just', 'really',
    'rooms', 'room',
    'amazing', 'awesome', 'best', 'better', 'excellent', 'fantastic',
    'horrible', 'great', 'bad', 'perfect', 'poor', 'terrible', 'wonderful',
    'awful', 'nice', 'okay', 'mediocre', 'good', 'superb', 'lousy',
    'disappointing', 'satisfactory', 'decent', 'pleasant', 'unpleasant',
    'memorable', 'forgettable', 'unremarkable',
}

# Stop words are compared against single lowercase tokens, so a full hotel
# name (mixed case, usually several words) would never match. Break each name
# into lowercase alphanumeric words instead; missing names (NaN) are skipped.
# NOTE(review): this also removes ordinary words that occur in hotel names --
# confirm that is acceptable for the topics of interest.
name_tokens = set()
for hotel_name in names:
    if isinstance(hotel_name, str):
        name_tokens.update(
            token for token in word_tokenize(hotel_name.lower()) if token.isalnum()
        )

# Frozen set for O(1) membership tests inside preprocess_text.
_stop_word_lookup = frozenset(
    set(ENGLISH_STOP_WORDS) | stop_words | custom_stop_words | name_tokens
)

# List form (sorted for a reproducible order) for consumers that expect a
# list of stop words, such as CountVectorizer.
# NOTE: changing the stop words changes the fitted topics, so the hand-written
# topic labels further down must be re-checked after re-running the notebook.
domain_stop_words = sorted(_stop_word_lookup)


def preprocess_text(text):
    """Return *text* lowercased, tokenized, stop-word filtered and lemmatized.

    Tokens that are not purely alphanumeric are dropped. A token is also
    dropped when either the token itself or its lemma is a stop word, so
    inflected forms (e.g. a plural of a stop word) are filtered as well.

    Args:
        text: The review text. Non-string values (e.g. NaN for a missing
            review) are treated as empty text.

    Returns:
        The remaining lemmas joined by single spaces; '' when nothing remains.
    """
    if not isinstance(text, str):
        return ''
    kept_lemmas = []
    for word in word_tokenize(text.lower()):
        if not word.isalnum() or word in _stop_word_lookup:
            continue
        lemma = lemmatizer.lemmatize(word)
        if lemma not in _stop_word_lookup:
            kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)


In [None]:
# Run every entry of the 'cleaned_text' column through preprocess_text and
# store the result in a new 'processed_text' column.
df['processed_text'] = df['cleaned_text'].map(preprocess_text)

# Show the original and processed text side by side for the first few rows.
preview_columns = ['cleaned_text', 'processed_text']
print(df[preview_columns].head())

## Apply Topic Modeling:

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to pre-calculate the document embeddings,
# placed on the device selected earlier in the notebook.
# 'all-MiniLM-L6-v2' does not ship custom modelling code, so it is loaded
# WITHOUT trust_remote_code: that flag allows code from the model repository
# to run locally and should only be enabled for models that require it
# (such as the alternative below).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device))
# Alternative model (requires trust_remote_code=True):
# SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)


In [None]:
# Encode the sentences.
# The column is converted to a plain list of strings first: encode() indexes
# its input by integer position, which on a pandas Series is a label lookup
# and only works while the index happens to be 0..n-1.
embeddings = embedding_model.encode(
    df['processed_text'].tolist(),
    show_progress_bar=True,
    device=device,
)

In [None]:
from umap import UMAP

# Dimensionality-reduction model handed to BERTopic. The random seed is fixed
# (random_state=42).
umap_settings = {
    'n_neighbors': 20,
    'n_components': 3,
    'min_dist': 0.0,
    'metric': 'cosine',
    'random_state': 42,
}
umap_model = UMAP(**umap_settings)

In [None]:
from hdbscan import HDBSCAN

# Clustering model handed to BERTopic.
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer handed to BERTopic: counts unigrams and bigrams,
# skips the domain stop words built earlier, and uses min_df=10.
vectorizer_model = CountVectorizer(
    stop_words=domain_stop_words,
    ngram_range=(1, 2),
    min_df=10,
)


In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# All representation models, keyed by the name each one is registered under.
representation_model = {}

# KeyBERT
keybert_model = KeyBERTInspired()
representation_model["KeyBERT"] = keybert_model

# An "OpenAI" entry (an openai_model) could be registered here if OpenAI is used.

# MMR, with diversity set to 0.3
mmr_model = MaximalMarginalRelevance(diversity=0.3)
representation_model["MMR"] = mmr_model

In [None]:
from bertopic import BERTopic

In [None]:
# Build the BERTopic model from the components configured in the cells above
# and fit it on the preprocessed reviews.
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  nr_topics=20,
  verbose=True

)

# Train model.
# Documents are passed as a plain list of strings (the documented input type)
# so nothing depends on the DataFrame's index; the pre-computed embeddings are
# passed by keyword so they are reused instead of being recalculated.
topics, probs = topic_model.fit_transform(
    df['processed_text'].tolist(),
    embeddings=embeddings,
)


In [None]:
# Display the topics
# Kept as the last expression of the cell so the notebook renders the returned object.
topic_model.get_topic_info()

In [None]:
# Visualize UMAP projection and HDBSCAN clustering.
# The embeddings computed earlier are passed in so the documents are not
# encoded a second time just for this plot. Kept as the last expression of
# the cell so the notebook renders the figure.
topic_model.visualize_documents(
    df['processed_text'].tolist(),
    embeddings=embeddings,
)

## Visualize Topics:

In [None]:
# Plot the topics; kept as the last expression of the cell so the notebook renders the figure.
topic_model.visualize_topics()


In [None]:
# Plot the topic hierarchy; kept as the last expression of the cell so the notebook renders the figure.
topic_model.visualize_hierarchy()


## Analyze Topic Distribution:

In [None]:
# Add the topic results to the DataFrame
# (`topics` is the first value returned by fit_transform; one entry per row of df).
df['topic'] = topics

In [None]:
# Get the distribution of topics: number of reviews assigned to each topic id.
topic_distribution = df['topic'].value_counts()
print(topic_distribution)

In [None]:
import numpy as np

# visualize_distribution() plots the topic probabilities of ONE document, so
# it needs a 1-D vector with one entry per topic. The `probs` returned by
# fit_transform only has that shape per document when the model was built
# with calculate_probabilities=True; otherwise it holds a single value per
# document and must not be passed in directly.
doc_index = 0  # position of the document whose topic distribution is plotted

doc_probs = np.asarray(probs)
if doc_probs.ndim == 2:
    # Full document-by-topic matrix is available: take this document's row.
    single_doc_distribution = doc_probs[doc_index]
else:
    # Otherwise estimate the document's distribution over all topics.
    topic_distr, _ = topic_model.approximate_distribution(
        [df['processed_text'].iloc[doc_index]]
    )
    single_doc_distribution = topic_distr[0]

# Kept as the last expression of the cell so the notebook renders the figure.
topic_model.visualize_distribution(single_doc_distribution)

## Examine Top Words per Topic:

In [None]:
# Extract topic information; the 'Topic' column of the result is iterated in the next cell.
topic_info = topic_model.get_topic_info()

In [None]:
# Print the top words of every topic, skipping -1 (the outlier/noise topic).
for topic_id in topic_info['Topic']:
    if topic_id == -1:
        continue
    top_words = topic_model.get_topic(topic_id)
    print(f"Topic {topic_id}: {top_words}")

In [None]:
# Reassign documents from the outlier topic (-1) to regular topics where possible.
# The result is a new topic assignment per document; `topics` itself is left unchanged.
new_topics = topic_model.reduce_outliers(df['processed_text'], topics)

In [None]:
# Store the outlier-reduced topic assignment next to the original 'topic' column.
df['new_topics'] = new_topics

## Topic Mapping:

In [None]:
# Define the labels based on the topics.
# The position of a label in this sequence is the topic id it describes
# (0 through 18).
_ordered_topic_labels = (
    "Overall Hotel Experience",
    "Front Desk and Service Issues",
    "Positive Experience: Location and Staff",
    "Comfort and Cleanliness of Rooms",
    "Noise and Sleep Quality",
    "Specific Hotel Chains and Suites",
    "Breakfast and Additional Amenities",
    "Hotel Names and Wine & Cheese Offerings",
    "WiFi and Internet Services",
    "Natural Disasters and Staff Response",
    "Hotel Names and Room Conditions",
    "Evening Offerings: Wine, Cheese, and Breakfast",
    "Hotel Pennsylvania and Room Conditions",
    "Bed Bug Issues",
    "Hotel Muse and Kimpton Properties",
    "Elevator Issues",
    "Service and Small Room Size",
    "Overall Hotel Atmosphere",
    "Room Amenities: Microwave and Fridge",
)

# Mapping of topic id -> label, consumed when labelling the DataFrame.
topic_labels = dict(enumerate(_ordered_topic_labels))

In [None]:
# Apply the labels to a new column in the DataFrame
# NOTE(review): topic ids without an entry in topic_labels (e.g. a remaining -1)
# presumably end up as NaN in 'label' -- confirm this is intended.
df['label'] = df['new_topics'].map(topic_labels)

In [None]:
# Filter the DataFrame to only include rows where the sentiment is negative
# (rows whose 'sentiment' column equals the exact string 'NEGATIVE').
# NOTE(review): this frame is not used again in the visible part of the notebook.
negative_sentiment_df = df[df['sentiment'] == 'NEGATIVE']

In [None]:
# Save the full DataFrame (all rows, including the topic and label columns added above)
# to the working directory, without writing the index.
df.to_csv('df_with_topics.csv', index= False)