# Zillow Word Clouds
### Or, how Zillow listing language changes in different LA neighborhoods

In [21]:
# Install required dependencies
# Only need to run this cell once

!pip install --quiet playwright pandas matplotlib seaborn nltk wordcloud nest-asyncio
!python -m playwright install chromium  # This installs browser binaries needed by Playwright

# On Windows, this will download Chromium, Firefox, and WebKit
# !python -m playwright install

# If you're running in Google Colab, you might need:
# !apt-get update
# !apt-get install -y xvfb
# !pip install playwright pandas matplotlib seaborn wordcloud nltk
# !playwright install

# For Windows: 
# Install Microsoft Visual C++ Redistributable if needed
# This is generally required for Playwright on Windows
# Note: This command checks if already installed and only displays info
# You may need to manually install from: https://aka.ms/vs/17/release/vc_redist.x64.exe
# !powershell "Get-ItemProperty HKLM:\Software\Microsoft\Windows\CurrentVersion\Uninstall\* | Where-Object {$_.DisplayName -like '*Microsoft Visual C++*'} | Select-Object DisplayName"
!python -m playwright install chromium firefox webkit

# This is mostly here in case you're wrestling with Windows like I did!
try:
    from playwright.sync_api import sync_playwright
    print("Playwright is installed.")
except ImportError:
    print("Playwright is not installed.")

print("Dependencies installed successfully!")


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Playwright is installed.
Dependencies installed successfully!


In [47]:
# Required library imports

import os
import random
import asyncio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nest_asyncio
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from playwright.async_api import async_playwright, expect

In [48]:
# Apply nest_asyncio to allow running asyncio in Jupyter
nest_asyncio.apply()

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Download NLTK resources if needed
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

print("Libraries imported and setup complete")

Libraries imported and setup complete


In [24]:
# Custom real estate stopwords to filter out common terms
real_estate_stopwords = set([
    'home', 'property', 'house', 'listing', 'features', 'includes',
    'located', 'offers', 'contact', 'information', 'price', 'sale',
    'bedroom', 'bathroom', 'bath', 'bed', 'sq', 'ft', 'square', 'feet',
    'year', 'built', 'call', 'today', 'agent', 'new', 'view', 'tour'
])

# Target neighborhoods in LA
neighborhoods = [
    "Beverly Hills",
    "Boyle Heights",
    "Leimert Park", 
    "Sherman Oaks",
    "Koreatown"
]

In [49]:
# Browser functions

async def open_browser(headless=False):
    """
    Opens a browser with settings to avoid bot detection
    """
    playwright = await async_playwright().start()
    
    # Use a realistic user agent (macOS Chrome)
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    
    # Launch browser (using Chromium for better compatibility with Zillow)
    browser = await playwright.chromium.launch(
        headless=headless,
        args=['--start-maximized']
    )
    
    # Create context with custom settings to avoid detection
    context = await browser.new_context(
        user_agent=user_agent,
        viewport={'width': 1280, 'height': 800},
        screen={'width': 1280, 'height': 800},
        device_scale_factor=2,
        has_touch=False,
        java_script_enabled=True,
        locale='en-US',
        timezone_id='America/Los_Angeles'
    )
    
    # Create page with longer timeout
    page = await context.new_page()
    page.set_default_timeout(60000)  # 60 seconds
    
    return browser, page, playwright

# Test browser connection
async def test_browser():
    browser, page, playwright = await open_browser(headless=False)
    try:
        print("Opening Zillow homepage...")
        await page.goto('https://www.zillow.com')
        print(f"Page title: {await page.title()}")
        
        # Check for CAPTCHA/bot detection
        if "Access to this page has been denied" in await page.title():
            print("CAPTCHA detected! Please solve it manually in the browser window.")
            input("Press Enter once you've completed the verification...")
            
        return browser, page, playwright
    except Exception as e:
        print(f"Error: {e}")
        await browser.close()
        await playwright.stop()
        return None, None, None

browser, page, playwright = await test_browser()

NotImplementedError: 

In [37]:
# Scraping functions start here
# Search for a specific neighborhood on Zillow

def search_neighborhood(page, neighborhood, listing_type="buy"):
    """
    Search for a specific neighborhood on Zillow
    listing_type can be "buy" or "rent"
    """
    # Direct URL approach (more reliable than using search)
    base_url = 'https://www.zillow.com'
    
    # Format neighborhood for URL
    formatted_neighborhood = neighborhood.lower().replace(' ', '-')
    
    if listing_type == "rent":
        url = f'{base_url}/{formatted_neighborhood}-ca/rentals/'
    else:
        url = f'{base_url}/{formatted_neighborhood}-ca/'
    
    print(f"Navigating to {url}")
    page.goto(url)
    time.sleep(3)  # Give page time to load
    
    # Check for cookie consent dialog and accept if present
    try:
        cookie_button = page.locator('button[data-testid="cookie-consent-button"]')
        if cookie_button.count() > 0:
            cookie_button.click()
            time.sleep(1)
    except:
        pass  # No cookie dialog

In [38]:
def scroll_to_load_all_cards(page):
    """
    Scroll down the page to load all property cards
    """
    print("Scrolling to load all listings...")
    
    previous_card_count = 0
    max_scroll_attempts = 10
    scroll_attempts = 0
    
    while scroll_attempts < max_scroll_attempts:
        # Get current number of cards
        cards = page.locator('[data-test="property-card"]')
        current_card_count = cards.count()
        
        # If no more cards are loading, break the loop
        if current_card_count == previous_card_count:
            scroll_attempts += 1
        else:
            scroll_attempts = 0
            previous_card_count = current_card_count
        
        # Scroll to the bottom of the page
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)  # Wait for new cards to load
        
        print(f"Currently loaded {current_card_count} cards")
    
    print(f"Finished scrolling, total cards: {previous_card_count}")

In [39]:
def get_listings(page, listing_type="buy"):
    """
    Extract all listing information from the current page
    """
    print("Extracting listing information...")
    listings = []
    
    # Scroll to load all cards
    scroll_to_load_all_cards(page)
    
    # Find all property cards
    # Modern Zillow uses data-test="property-card" attribute
    property_cards = page.locator('[data-test="property-card"]')
    count = property_cards.count()
    print(f"Found {count} property cards")
    
    for i in range(count):
        try:
            card = property_cards.nth(i)
            
            # Extract address
            address_elem = card.locator('address')
            address = address_elem.text_content() if address_elem.count() > 0 else "N/A"
            
            # Extract price
            price_elem = card.locator('[data-test="property-card-price"]')
            price = price_elem.text_content() if price_elem.count() > 0 else "N/A"
            
            # Extract details (beds, baths, sqft)
            details = {}
            details_list = card.locator('ul li')
            details_count = details_list.count()
            
            for j in range(details_count):
                detail_text = details_list.nth(j).text_content()
                if "bd" in detail_text:
                    details["beds"] = detail_text.split("bd")[0].strip()
                elif "ba" in detail_text:
                    details["baths"] = detail_text.split("ba")[0].strip()
                elif "sqft" in detail_text:
                    details["sqft"] = detail_text.split("sqft")[0].strip()
            
            # Extract URL
            link_elem = card.locator('[data-test="property-card-link"]')
            url = link_elem.get_attribute('href') if link_elem.count() > 0 else None
            if url and not url.startswith('http'):
                url = 'https://www.zillow.com' + url
            
            listings.append({
                'address': address,
                'price': price,
                'beds': details.get('beds', 'N/A'),
                'baths': details.get('baths', 'N/A'),
                'sqft': details.get('sqft', 'N/A'),
                'url': url,
                'type': 'Rental' if listing_type == 'rent' else 'For Sale'
            })
            
        except Exception as e:
            print(f"Error extracting card {i}: {e}")
            continue
    
    return listings

In [40]:
def get_detailed_descriptions(page, listings, max_details=5):
    """
    Visit each listing to get detailed descriptions
    Limit to max_details to avoid excessive scraping
    """
    detailed_listings = []
    
    # Limit to avoid excessive scraping
    for i, listing in enumerate(listings[:max_details]):
        if not listing['url']:
            continue
            
        print(f"Getting details for listing {i+1}/{min(len(listings), max_details)}")
        
        try:
            # Navigate to the listing page
            page.goto(listing['url'])
            time.sleep(3)  # Wait for page to load
            
            # Extract description
            # For rentals, look for overview section
            description_elem = page.locator('.ds-overview-section, [data-testid="description"]')
            description = description_elem.text_content() if description_elem.count() > 0 else ""
            
            # Add description to the listing
            listing['description'] = description.strip()
            detailed_listings.append(listing)
            
            # Pause between requests to avoid rate limiting
            time.sleep(random.uniform(2, 4))
            
        except Exception as e:
            print(f"Error getting details for listing {i+1}: {e}")
            listing['description'] = ""
            detailed_listings.append(listing)
    
    return detailed_listings

In [41]:
def scrape_neighborhood_data(neighborhood, listing_type="rent"):
    """
    Scrape listing data for a specific neighborhood
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=False)
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            viewport={'width': 1280, 'height': 800}
        )
        page = context.new_page()
        page.set_default_timeout(60000)  # 60 seconds
        
        try:
            print(f"Searching for {neighborhood} {listing_type} listings...")
            search_neighborhood(page, neighborhood, listing_type)
            
            print(f"Extracting listings for {neighborhood}...")
            listings = get_listings(page, listing_type)
            
            if not listings:
                print(f"No listings found for {neighborhood}")
                return pd.DataFrame()
            
            print(f"Getting detailed descriptions...")
            detailed_listings = get_detailed_descriptions(page, listings)
            
            # Create dataframe
            df = pd.DataFrame(detailed_listings)
            
            # Save to CSV
            csv_filename = f'data/{neighborhood.replace(" ", "_")}_{listing_type}_listings.csv'
            df.to_csv(csv_filename, index=False)
            print(f"Saved {len(df)} listings to {csv_filename}")
            
            return df
            
        finally:
            browser.close()

In [42]:
# Data analysis functions

def clean_price(price_str):
    """Extract numeric price from price string"""
    if pd.isna(price_str) or price_str == 'N/A':
        return None
    
    # Extract digits and convert to float
    price_digits = re.sub(r'[^\d.]', '', price_str)
    if price_digits:
        return float(price_digits)
    return None

def clean_size(size_str):
    """Extract numeric size from size string"""
    if pd.isna(size_str) or size_str == 'N/A':
        return None
    
    # Extract digits and convert to float
    size_digits = re.sub(r'[^\d.]', '', size_str)
    if size_digits:
        return float(size_digits)
    return None

def analyze_rental_data(df):
    """Analyze rental listing data"""
    if df.empty:
        print("No data to analyze")
        return
    
    # Clean numeric data
    df['price_numeric'] = df['price'].apply(clean_price)
    df['sqft_numeric'] = df['sqft'].apply(clean_size)
    df['beds_numeric'] = pd.to_numeric(df['beds'], errors='coerce')
    df['baths_numeric'] = pd.to_numeric(df['baths'], errors='coerce')
    
    # Calculate price per square foot
    df['price_per_sqft'] = df.apply(
        lambda x: x['price_numeric'] / x['sqft_numeric'] if x['price_numeric'] and x['sqft_numeric'] else None, 
        axis=1
    )
    
    # Summary statistics
    print("\nSummary Statistics:")
    print(f"Total listings: {len(df)}")
    print(f"Average price: ${df['price_numeric'].mean():.2f}")
    print(f"Median price: ${df['price_numeric'].median():.2f}")
    print(f"Average sqft: {df['sqft_numeric'].mean():.2f}")
    print(f"Average price per sqft: ${df['price_per_sqft'].mean():.2f}")
    
    return df

def visualize_rental_data(df, neighborhood):
    """Create visualizations for rental data"""
    if df.empty:
        print("No data to visualize")
        return
    
    # Set style
    sns.set(style="whitegrid")
    
    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Plot 1: Price distribution
    sns.histplot(df['price_numeric'].dropna(), kde=True, ax=axes[0, 0])
    axes[0, 0].set_title(f'Rental Price Distribution in {neighborhood}')
    axes[0, 0].set_xlabel('Monthly Rent ($)')
    axes[0, 0].set_ylabel('Count')
    
    # Plot 2: Price vs Size
    sns.scatterplot(data=df, x='sqft_numeric', y='price_numeric', hue='beds_numeric', ax=axes[0, 1])
    axes[0, 1].set_title(f'Price vs. Size in {neighborhood}')
    axes[0, 1].set_xlabel('Square Footage')
    axes[0, 1].set_ylabel('Monthly Rent ($)')
    
    # Plot 3: Price by bedroom count
    sns.boxplot(data=df, x='beds_numeric', y='price_numeric', ax=axes[1, 0])
    axes[1, 0].set_title(f'Price by Bedroom Count in {neighborhood}')
    axes[1, 0].set_xlabel('Number of Bedrooms')
    axes[1, 0].set_ylabel('Monthly Rent ($)')
    
    # Plot 4: Price per sqft distribution
    sns.histplot(df['price_per_sqft'].dropna(), kde=True, ax=axes[1, 1])
    axes[1, 1].set_title(f'Price per Square Foot in {neighborhood}')
    axes[1, 1].set_xlabel('Price per Square Foot ($/sqft)')
    axes[1, 1].set_ylabel('Count')
    
    plt.tight_layout()
    plt.savefig(f'data/{neighborhood.replace(" ", "_")}_analysis.png')
    plt.show()

def generate_word_cloud(df, neighborhood):
    """Generate word cloud from listing descriptions"""
    if df.empty or 'description' not in df.columns:
        print("No description data for word cloud")
        return
    
    # Combine all descriptions
    all_text = ' '.join(df['description'].dropna().astype(str))
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(all_text.lower())
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    # Additional real estate specific words to filter
    real_estate_stop_words = ['property', 'home', 'house', 'apartment', 'unit', 'zillow']
    filtered_tokens = [word for word in filtered_tokens if word not in real_estate_stop_words]
    
    # Create frequency distribution
    word_freq = Counter(filtered_tokens)
    
    # Generate and display word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', 
                          max_words=100, contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(word_freq)
    
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Most Common Words in {neighborhood} Rental Listings')
    plt.tight_layout()
    plt.savefig(f'data/{neighborhood.replace(" ", "_")}_wordcloud.png')
    plt.show()

In [32]:
async def analyze_neighborhood_rentals(neighborhood):
    """
    Complete pipeline: scrape, analyze, and visualize rental data for a neighborhood
    """
    print(f"Starting analysis for {neighborhood}...")
    
    # Scrape data
    df = await scrape_neighborhood_data(neighborhood, listing_type="rent")
    
    if df.empty:
        print(f"No rental data found for {neighborhood}")
        return
    
    # Analyze data
    df = analyze_rental_data(df)
    
    # Visualize data
    visualize_rental_data(df, neighborhood)
    generate_word_cloud(df, neighborhood)
    
    print(f"Analysis for {neighborhood} completed!")
    return df

In [43]:
def analyze_neighborhood_rentals(neighborhood):
    """
    Complete pipeline: scrape, analyze, and visualize rental data for a neighborhood
    """
    print(f"Starting analysis for {neighborhood}...")
    
    # Scrape data
    df = scrape_neighborhood_data(neighborhood, listing_type="rent")
    
    if df is None or df.empty:
        print(f"No rental data found for {neighborhood}")
        return
    
    # Analyze data
    df = analyze_rental_data(df)
    
    # Visualize data
    visualize_rental_data(df, neighborhood)
    generate_word_cloud(df, neighborhood)
    
    print(f"Analysis for {neighborhood} completed!")
    return df

In [44]:
df = analyze_neighborhood_rentals("Beverly Hills")

Starting analysis for Beverly Hills...


Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.

In [45]:
# Quick test to verify Playwright is working
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://www.zillow.com")
    print("Successfully connected to Zillow")
    browser.close()

Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.