# Zillow Word Clouds
### Or, how Zillow listing language changes in different LA neighborhoods

In [4]:
# Install required dependencies
# Only need to run this cell once

!pip install --quiet playwright pandas matplotlib seaborn wordcloud nltk
!playwright install  # This installs browser binaries needed by Playwright

# If you're running in Google Colab, you might need:
# !apt-get update
# !apt-get install -y xvfb
# !pip install playwright pandas matplotlib seaborn wordcloud nltk
# !playwright install

print("Dependencies installed successfully!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Dependencies installed successfully!


In [5]:
# Imports of various libraries

import os
import random
import time
import asyncio
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from playwright.async_api import async_playwright, expect

In [6]:
# Set up data directory
os.makedirs('data/', exist_ok=True)

# Download NLTK resources.
# Newer NLTK releases load the 'punkt_tab' data for word_tokenize, while older
# ones load 'punkt'; fetch both so tokenizing works whichever version the
# unpinned install above pulled in.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ryanbrooks/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanbrooks/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Listing boilerplate that shows up in nearly every description; filtered out
# so the neighborhood-specific vocabulary is what remains.
real_estate_stopwords = {
    # generic listing vocabulary
    'home', 'property', 'house', 'listing', 'features', 'includes',
    'located', 'offers',
    # sales and contact phrases
    'contact', 'information', 'price', 'sale', 'call', 'today', 'agent',
    'new', 'view', 'tour',
    # room counts, sizes and age
    'bedroom', 'bathroom', 'bath', 'bed', 'sq', 'ft', 'square', 'feet',
    'year', 'built',
}

# Los Angeles neighborhoods to compare
neighborhoods = [
    "Beverly Hills", "Boyle Heights", "Leimert Park", "Sherman Oaks", "Koreatown",
]

In [8]:
# Starts an automated browser and opens a new window

async def open_browser(headless=False):
    """
    Start Playwright, launch Chromium and open a blank page.

    Args:
        headless: Passed straight to ``chromium.launch``. The default
            ``False`` shows the browser window.

    Returns:
        A ``(browser, page, playwright)`` tuple. The caller owns all three and
        must call ``browser.close()`` and ``playwright.stop()`` when done.

    Raises:
        Whatever Playwright raises while launching or creating the context or
        page. Anything already started is shut down before re-raising.
    """
    playwright = await async_playwright().start()

    # Fixed desktop Chrome user agent (not randomized) sent with every request
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'

    browser = None
    try:
        # slow_mo delays each Playwright operation by 50 ms
        browser = await playwright.chromium.launch(
            headless=headless,
            slow_mo=50
        )

        # Desktop-sized, non-mobile context using the user agent above
        context = await browser.new_context(
            user_agent=user_agent,
            viewport={'width': 1280, 'height': 800},
            device_scale_factor=1,
            is_mobile=False
        )

        # Create a new page
        page = await context.new_page()
    except BaseException:
        # Nothing is returned on failure, so the caller has no handle to clean
        # up with; release what was started here before propagating.
        if browser is not None:
            await browser.close()
        await playwright.stop()
        raise

    return browser, page, playwright

In [9]:
# Search for a specific neighborhood on Zillow

async def search_neighborhood(page, neighborhood):
    """
    Open Zillow and search for ``neighborhood`` in Los Angeles.

    Fills the search textbox with "<neighborhood>, Los Angeles, CA", clicks
    the first suggestion, and clicks a "For Sale" button if one is visible.
    Fixed sleeps separate the steps.
    """
    await page.goto('https://zillow.com')
    await asyncio.sleep(2)

    # Type the query into the textbox whose accessible name is "Search"
    query = f"{neighborhood}, Los Angeles, CA"
    await page.get_by_role('textbox', name='Search').fill(query)
    await asyncio.sleep(1)

    # Pick the first autocomplete suggestion once it has appeared
    first_suggestion = page.get_by_role('option').first
    await expect(first_suggestion).to_be_visible()
    await first_suggestion.click()
    await asyncio.sleep(2)

    # The "For Sale" prompt is optional; nothing more to do when it is absent
    sale_button = page.get_by_role('button', name='For Sale')
    if not await sale_button.is_visible():
        return
    await sale_button.click()
    await asyncio.sleep(2)

In [10]:
# Extract all listing descriptions from the current page

async def get_descriptions(page):
    """
    Collect price, address and description for every property card on the page.

    First scrolls to the last card repeatedly until the number of cards stops
    growing. Then, for each card, reads its price and address, clicks it,
    reads the text of ``.ds-overview-section`` and navigates back.

    Returns:
        Dict with parallel lists under 'prices', 'addresses' and
        'descriptions'. A missing price or address is "N/A"; a missing
        description is "".
    """
    card_selector = '[data-test="property-card"]'

    async def read_text(locator, fallback):
        # Only read the text when the locator matches something
        return await locator.text_content() if await locator.count() > 0 else fallback

    # Scroll until a pass finds no additional cards
    previous_total = 0
    while True:
        loaded = await page.locator(card_selector).all()
        if not loaded:
            break

        await loaded[-1].scroll_into_view_if_needed()

        if len(loaded) == previous_total:
            break
        previous_total = len(loaded)
        await asyncio.sleep(2)

    # One (price, address, description) record per card
    records = []
    for card in await page.locator(card_selector).all():
        price = await read_text(card.locator('[data-test="property-card-price"]'), "N/A")
        address = await read_text(card.locator('[data-test="property-card-addr"]'), "N/A")

        # Open the listing to reach its description
        await card.click()
        await asyncio.sleep(2)
        description = await read_text(page.locator('.ds-overview-section'), "")

        # Return to the results list before moving on to the next card
        await page.go_back()
        await asyncio.sleep(1)

        records.append((price, address, description))

    return {
        'prices': [record[0] for record in records],
        'addresses': [record[1] for record in records],
        'descriptions': [record[2] for record in records]
    }

In [11]:
async def scrape_neighborhood_data(neighborhood):
    """
    Scrape the Zillow listings of one neighborhood and save them to disk.

    Writes the page HTML to ``data/<name>_listings.html`` and the listings to
    ``data/<name>_listings.csv``, where ``<name>`` is the neighborhood with
    spaces replaced by underscores.

    Returns:
        DataFrame with columns neighborhood, price, address, description.

    The browser and Playwright are shut down whether or not scraping succeeds.
    """
    browser, page, playwright = await open_browser()

    try:
        print(f"Searching for {neighborhood}...")
        await search_neighborhood(page, neighborhood)

        print(f"Extracting listings for {neighborhood}...")
        listings = await get_descriptions(page)

        # Snapshot the page HTML as it stands after extraction
        page_html = await page.content()
        file_stem = neighborhood.replace(" ", "_")
        with open(f'data/{file_stem}_listings.html', 'w', encoding='utf-8') as html_file:
            html_file.write(page_html)

        # One row per listing, tagged with the neighborhood it came from
        row_count = len(listings['descriptions'])
        df = pd.DataFrame({
            'neighborhood': [neighborhood] * row_count,
            'price': listings['prices'],
            'address': listings['addresses'],
            'description': listings['descriptions']
        })

        df.to_csv(f'data/{file_stem}_listings.csv', index=False)
        print(f"Saved {len(df)} listings for {neighborhood}")

        return df

    finally:
        await browser.close()
        await playwright.stop()

In [12]:
def create_word_cloud(neighborhood, description_text):
    """
    Generate and save a word cloud for one neighborhood's listings.

    Args:
        neighborhood: Name used in the chart title and the output file name
            (``data/<name>_wordcloud.png``, spaces replaced by underscores).
        description_text: Iterable of listing descriptions. Entries that are
            not strings (e.g. None or NaN) are ignored.

    Returns:
        Counter mapping each kept token to its frequency. When no usable
        words remain the Counter is empty and no image is written.
    """
    # Combine all descriptions, skipping missing values that would break join
    all_text = ' '.join(text for text in description_text if isinstance(text, str))

    # Keep lower-cased alphabetic tokens longer than two characters that are
    # neither English nor real-estate stopwords
    stop_words = set(stopwords.words('english')).union(real_estate_stopwords)
    candidates = (w.lower() for w in word_tokenize(all_text)
                  if w.isalpha() and len(w) > 2)
    word_freq = Counter(w for w in candidates if w not in stop_words)

    # WordCloud raises on an empty frequency table, so skip the plot instead
    # of aborting the run for a neighborhood without description text.
    if not word_freq:
        print(f"No usable description text for {neighborhood}; skipping word cloud.")
        return word_freq

    # Generate word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=100,
        contour_width=1
    ).generate_from_frequencies(word_freq)

    # Plot and save
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Most Common Terms in {neighborhood} Listings')
    fig.tight_layout()
    fig.savefig(f'data/{neighborhood.replace(" ", "_")}_wordcloud.png', dpi=300)
    plt.close(fig)

    return word_freq

In [13]:
def create_comparison_charts(all_data):
    """
    Create comparison charts across neighborhoods.

    Writes two images:
      * ``data/top_terms_comparison.png`` - per-neighborhood frequency of the
        10 most common terms overall (skipped when no terms were found).
      * ``data/price_vs_wordcount.png`` - average listing price against
        average description length, one labelled point per neighborhood.

    Args:
        all_data: DataFrame with 'neighborhood', 'price' and 'description'
            columns. Prices are text; values that contain no parseable number
            (such as "N/A") are left out of the average.
    """
    # The stopword set is the same for every neighborhood, so build it once
    stop_words = set(stopwords.words('english')).union(real_estate_stopwords)

    all_neighborhoods_data = {}
    avg_prices = {}
    word_counts = {}

    for neighborhood, group in all_data.groupby('neighborhood'):
        # Process text for analysis
        all_text = ' '.join(group['description'].dropna().astype(str))
        tokens = [w.lower() for w in word_tokenize(all_text)
                  if w.isalpha() and w.lower() not in stop_words and len(w) > 2]

        # Count word frequencies
        all_neighborhoods_data[neighborhood] = Counter(tokens)

        # Prices arrive as text like "$1,250,000" and may be the "N/A"
        # placeholder. Keep only digits and the decimal point, then coerce
        # anything unparseable to NaN so mean() skips it rather than crashing.
        price_digits = group['price'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        prices = pd.to_numeric(price_digits, errors='coerce')
        avg_prices[neighborhood] = prices.mean()

        # Average number of whitespace-separated words per listing
        word_counts[neighborhood] = len(all_text.split()) / len(group)

    # Chart 1: Top terms comparison
    # Find most common terms overall
    combined_freq = Counter()
    for word_freq in all_neighborhoods_data.values():
        combined_freq.update(word_freq)

    top_terms = [term for term, _ in combined_freq.most_common(10)]

    # One row per (term, neighborhood) pair for the grouped bar chart
    chart_df = pd.DataFrame(
        [
            {
                'Term': term,
                'Neighborhood': neighborhood,
                'Frequency': word_freq.get(term, 0)
            }
            for term in top_terms
            for neighborhood, word_freq in all_neighborhoods_data.items()
        ],
        columns=['Term', 'Neighborhood', 'Frequency']
    )

    if chart_df.empty:
        # Nothing to plot when no description produced any term
        print("No description terms found; skipping top terms chart.")
    else:
        plt.figure(figsize=(12, 8))
        sns.barplot(x='Frequency', y='Term', hue='Neighborhood', data=chart_df)
        plt.title('Top 10 Terms by Neighborhood')
        plt.tight_layout()
        plt.savefig('data/top_terms_comparison.png', dpi=300)
        plt.close()

    # Chart 2: Price vs. Word Count
    price_data = pd.DataFrame({
        'Neighborhood': list(avg_prices),
        'Average Price': list(avg_prices.values()),
        'Average Word Count': [word_counts[n] for n in avg_prices]
    })

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Average Price', y='Average Word Count', data=price_data)

    # Label each point; neighborhoods without a usable average have no point
    labelled = price_data.dropna(subset=['Average Price', 'Average Word Count'])
    for name, price, words in zip(labelled['Neighborhood'],
                                  labelled['Average Price'],
                                  labelled['Average Word Count']):
        plt.text(price, words, name)

    plt.title('Relationship Between Listing Price and Description Length')
    plt.tight_layout()
    plt.savefig('data/price_vs_wordcount.png', dpi=300)
    plt.close()

In [None]:
async def main():
    """
    Run the whole pipeline: scrape each neighborhood, draw its word cloud,
    then save the combined data and the cross-neighborhood charts.
    """
    frames = []

    for name in neighborhoods:
        listings = await scrape_neighborhood_data(name)
        frames.append(listings)

        # Per-neighborhood word cloud from the freshly scraped descriptions
        create_word_cloud(name, listings['description'])

        # Wait between neighborhoods to avoid detection
        await asyncio.sleep(5)

    # Stack every neighborhood into one table and persist it
    combined = pd.concat(frames)
    combined.to_csv('data/all_neighborhoods.csv', index=False)

    # Charts comparing the neighborhoods against each other
    create_comparison_charts(combined)

    print("Analysis complete! Check the data directory for results.")

# Run the main function.
# Top-level `await` works here because the notebook kernel already runs an
# event loop; a plain Python script would need asyncio.run(main()) instead.
await main()