In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

filtered_reviews = pd.read_csv('clean_filtered_reviews.csv')

filtered_reviews = filtered_reviews.dropna(subset=['all_text'])

# Use TF-IDF Vectorizer with bigrams for richer context
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english', ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_reviews['all_text'])

# Display feature names to verify the refined vocabulary
print("Sample feature names after TF-IDF vectorization:")
print(tfidf_vectorizer.get_feature_names_out()[:10])

# Fit LDA model
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf_matrix)

# Display topics
def display_topics(model, feature_names, num_top_words):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx + 1}:")
        print([feature_names[i] for i in topic.argsort()[-num_top_words:]])

# Display top 10 words for each topic
display_topics(lda, tfidf_vectorizer.get_feature_names_out(), 10)

Sample feature names after TF-IDF vectorization:
['accessible' 'accessible food' 'accessible location' 'accommodate'
 'accommodate chicken' 'accommodate clean' 'accommodate delicious'
 'accommodate family' 'accommodate food' 'accommodate gluten']
Topic 1:
['best', 'staff', 'minutes', 'table', 'love', 'pho', 'time', 'good', 'service', 'food']
Topic 2:
['shrimp', 'fried', 'sauce', 'service', 'time', 'rice', 'sushi', 'chicken', 'good', 'food']
Topic 3:
['new', 'food delicious', 'best', 'recommend', 'amazing', 'service', 'friendly', 'delicious', 'pizza', 'food']
Topic 4:
['location', 'pork', 'chicken', 'pho', 'beef', 'food', 'sushi', 'soup', 'ramen', 'good']
Topic 5:
['price', 'chicken', 'portion', 'amazing', 'recommend', 'love', 'burger', 'food', 'vegan', 'delicious']


In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download VADER lexicon if not already done
#nltk.download('vader_lexicon')

# Instantiate Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores using VADER
def calculate_sentiment(text):
    score = sia.polarity_scores(text)
    return score['compound']  # Return the compound score which reflects the overall sentiment

# Apply the function to calculate sentiment score for each review
filtered_reviews['sentiment_score'] = filtered_reviews['all_text'].apply(calculate_sentiment)

# Verify that the sentiment scores have been added
print("Sample of sentiment scores added to DataFrame:")
print(filtered_reviews[['all_text', 'sentiment_score']].head())

Sample of sentiment scores added to DataFrame:
                                            all_text  sentiment_score
0  love kind time good last time staff friendly long           0.9287
1  disappointed chicken chicken chicken fried ric...          -0.0516
2                    good flavour salad service come           0.4404
3                        minutes interior asked back           0.0000
4                tasty good area friendly food quick           0.7269


In [5]:
# Assuming you have sentiment scores already computed for each review
for idx, topic in enumerate(lda.components_):
    # Identify relevant reviews based on words in the topic
    relevant_words = [tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
    relevant_reviews = filtered_reviews[filtered_reviews['all_text'].apply(lambda x: any(word in x for word in relevant_words))]
    
    # Calculate the average sentiment score for the relevant reviews
    avg_sentiment = relevant_reviews['sentiment_score'].mean()
    print(f"Average Sentiment for Topic {idx + 1}: {avg_sentiment}")

Average Sentiment for Topic 1: 0.58713499413069
Average Sentiment for Topic 2: 0.5716405899781013
Average Sentiment for Topic 3: 0.598892210015694
Average Sentiment for Topic 4: 0.5818306917572224
Average Sentiment for Topic 5: 0.5976915769566482
