In [10]:
import pandas as pd

pd.set_option('display.max_colwidth', None)
df = pd.read_csv('processed_zillow_reviews.csv') 

#I'll sample proportionally based on existing data distribution
target_size = 10000

sampled_df = df.groupby('Rating').apply(
    lambda x: x.sample(frac=target_size / len(df), random_state=42)
).reset_index(drop=True)

data = sampled_df['Cleaned_Description'].dropna().astype(str).tolist()



  sampled_df = df.groupby('Rating').apply(


In [11]:
#tokenize pre-processed data (split into word)
tokenized_data = [text.split() for text in data]

In [12]:
#Preparing the data. LDA requires Corpus and Dictionary
from gensim.corpora.dictionary import Dictionary

# Create dictionary, each word will have a unique ID
id2word = Dictionary(tokenized_data)

# Filter extreme cases (optional, improves quality)  Remove words that appear in fewer than 10 documents and more than 50% of documents
id2word.filter_extremes(no_below=10, no_above=0.5)

# Create bag-of-words corpus
corpus = [id2word.doc2bow(text) for text in tokenized_data]

corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 2),
  (30, 2),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1)],
 [(4, 1),
  (10, 1),
  (30, 2),
  (42, 1),
  (44, 2),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1)],
 [(0, 1),
  (15, 2),
  (23, 1),
  (34, 1),
  (55, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1)

In [13]:
from gensim.models import LdaModel

# Train LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,  # change to 6, 10, etc. to experiment
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [14]:
# Print the keywords in each topic
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}\n")


Topic 1: 0.045*"help" + 0.044*"work" + 0.043*"process" + 0.032*"time" + 0.026*"find" + 0.026*"buy" + 0.024*"purchase" + 0.022*"great" + 0.020*"highly" + 0.018*"experience"

Topic 2: 0.061*"provide" + 0.057*"service" + 0.049*"keep" + 0.047*"hard" + 0.039*"excellent" + 0.031*"advice" + 0.031*"local" + 0.019*"assist" + 0.018*"information" + 0.017*"attentive"

Topic 3: 0.047*"agent" + 0.035*"know" + 0.035*"real" + 0.033*"estate" + 0.023*"good" + 0.020*"like" + 0.020*"need" + 0.017*"family" + 0.015*"friend" + 0.015*"deal"

Topic 4: 0.065*"house" + 0.031*"offer" + 0.031*"property" + 0.025*"want" + 0.022*"get" + 0.018*"area" + 0.015*"look" + 0.015*"seller" + 0.015*"find" + 0.014*"day"

Topic 5: 0.076*"market" + 0.054*"knowledge" + 0.026*"expertise" + 0.022*"negotiation" + 0.021*"ensure" + 0.019*"professionalism" + 0.018*"skill" + 0.017*"san" + 0.016*"competitive" + 0.014*"navigate"



In [15]:
import pyLDAvis.gensim_models
import pyLDAvis

# Visualize topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

# Ask prof if these topics makes sense? what is her input on this topic modelling?

#top 6 words in Topic 0 - help, work, process, time, find, buy
#top 6 words in Topic 1 - house, offer, property, want, get, area
#top 6 words in Topic 2 - agent, know, real, estate, good, like
#top 6 words in Topic 3 - market, knowledge, expertise, negotiation, ensure, professionalism
#top 6 words in Topic 4 - provide, service, keep, hard, excellent, advice

In [16]:
pyLDAvis.save_html(vis, 'lda_visualization_5_topics.html')


In [11]:
'''
1)  Each circle represents a topic.
    Size of circle = how prevalent (dominant) that topic is in your dataset.
    Distance between circles = how different the topics are in terms of vocabulary.

2)  Saliency (blue bars):
    How much this word contributes to distinguishing topics.
    High-saliency words are useful for identifying topics.
    
    Red bars (when a topic is selected):
    How frequent this word is within the selected topic.
    You’ll see these when you click on a topic bubble (like Topic 1, 2, etc.).

3)  
'''

UsageError: Cell magic `%%htmlEach` not found.


In [17]:
#assigning topics to ratings
def get_dominant_topic(bow):
    topics = lda_model.get_document_topics(bow)
    topics = sorted(topics, key=lambda x: x[1], reverse=True)
    return topics[0][0] if topics else None  # topic number

# Apply to your reviews
sampled_df['Dominant_Topic'] = [get_dominant_topic(doc) for doc in corpus]


In [29]:
# Check a few rows
sampled_df['Dominant_Topic'].unique()


array([2, 3, 0, 4])

In [31]:
# Count of reviews per (Rating, Topic_Label) pair
topic_rating_counts = sampled_df.groupby(['Rating', 'Dominant_Topic']).size().reset_index(name='Count')

# Get the most frequent topic for each rating
dominant_topic_per_rating = topic_rating_counts.loc[
    topic_rating_counts.groupby('Rating')['Count'].idxmax()
].reset_index(drop=True)


#This shows the the topic that was mostly assigned to a Rating
print(dominant_topic_per_rating)


   Rating  Dominant_Topic  Count
0       1               2      8
1       2               2      1
2       3               0      3
3       4               0     44
4       5               0   9017
