In [None]:
# Improved Word Cloud Generation with Cohesive Colors and SVG Export

This notebook generates word clouds from rental listing descriptions, using one cohesive color scheme per neighborhood, and exports each cloud in both PNG and SVG formats.


In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import os
import matplotlib.colors as mcolors

# Download NLTK resources if needed.
# Each resource is probed on its own so that only the missing ones are fetched.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases make
# word_tokenize load its tokenizer data from 'punkt_tab'; with only 'punkt'
# installed, tokenization raises LookupError on those releases.
for resource_path, package_id in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)

# High-DPI defaults: figures are rendered and saved at 300 dpi, 12x8 inches
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'figure.figsize': [12, 8],
})

print("Libraries imported successfully!")


In [None]:
# Custom real estate stopwords.
# Words listed here are dropped from the word clouds in addition to the NLTK
# English stopwords. Entries are unique and grouped by theme.
real_estate_stopwords = {
    # Generic listing vocabulary
    'home', 'property', 'house', 'listing', 'features', 'includes',
    'located', 'offers', 'contact', 'information', 'price', 'sale',
    'bedroom', 'bathroom', 'bath', 'bed', 'sq', 'ft', 'square', 'feet',
    'year', 'built', 'call', 'today', 'agent', 'new', 'view', 'tour',
    'zillow', 'apartment', 'unit', 'rental', 'rent', 'available', 'lease',
    'monthly', 'deposit', 'utilities', 'included', 'pets', 'parking', 'laundry',
    # Address fragments and building terms
    'apt', 'blvd', 'dr', 'st', 'ave', 'street', 'drive', 'boulevard', 'avenue',
    'road', 'lane', 'place', 'court', 'way', 'circle', 'terrace', 'plaza',
    'floor', 'suite', 'building', 'complex', 'community', 'residence', 'residential',
    # Marketing adjectives
    'luxury', 'premium', 'exclusive', 'modern', 'contemporary', 'traditional',
    'spacious', 'cozy', 'charming', 'beautiful', 'stunning', 'gorgeous', 'amazing',
    'perfect', 'ideal', 'wonderful', 'fantastic', 'excellent', 'outstanding',
    # Proximity and area type
    'convenient', 'close', 'near', 'walking', 'distance', 'minutes', 'blocks',
    'downtown', 'uptown', 'midtown', 'suburban', 'urban', 'commercial',
    # Los Angeles neighborhood names
    'beverly', 'hills', 'koreatown', 'korea', 'town', 'echo', 'park', 'pacoima',
    'watts', 'silverlake', 'silver', 'lake', 'boyle', 'heights', 'hollywood',
    'westwood', 'santa', 'monica', 'venice', 'marina', 'del', 'rey',
    'culver', 'city', 'brentwood', 'bel', 'air', 'west', 'fairfax',
    'miracle', 'mile', 'mid', 'wilshire', 'thai', 'little',
    'tokyo', 'chinatown', 'china', 'olympics', 'olympic',
    # Los Angeles street, canyon and highway names
    'sunset', 'strip', 'melrose', 'rodeo', 'la', 'brea', 'cienega',
    'robertson', 'san', 'vicente',
    'pacific', 'coast', 'highway', 'sepulveda', 'ventura',
    'laurel', 'canyon', 'mulholland', 'topanga',
    'glen', 'benedict', 'coldwater', 'franklin', 'vine',
}

print(f"Loaded {len(real_estate_stopwords)} custom stopwords")


In [None]:
# Define cohesive color schemes.
# Each entry maps a scheme name to seven hex colors ordered from the most
# saturated shade to the palest tint of a single hue.
color_schemes = {
    'cool_blues': ['#1f77b4', '#3182bd', '#4299e1', '#63b3ed', '#90cdf4', '#bee3f8', '#dbeafe'],
    'warm_reds': ['#dc2626', '#ef4444', '#f87171', '#fca5a5', '#fecaca', '#fee2e2', '#fef2f2'],
    'greens': ['#059669', '#10b981', '#34d399', '#6ee7b7', '#a7f3d0', '#d1fae5', '#ecfdf5'],
    # The last entry used to be '#f3f4f6', a neutral gray that broke the violet
    # ramp; '#f5f3ff' is the pale violet that continues the sequence.
    'purples': ['#7c3aed', '#8b5cf6', '#a78bfa', '#c4b5fd', '#ddd6fe', '#ede9fe', '#f5f3ff'],
    'oranges': ['#ea580c', '#f97316', '#fb923c', '#fdba74', '#fed7aa', '#ffedd5', '#fff7ed'],
    'teals': ['#0d9488', '#14b8a6', '#2dd4bf', '#5eead4', '#99f6e4', '#ccfbf1', '#f0fdfa']
}

def create_custom_colormap(colors):
    """Build a matplotlib colormap named 'custom' from a sequence of colors.

    The colormap is quantized to ``len(colors)`` levels, i.e. one level per
    color supplied.
    """
    level_count = len(colors)
    return mcolors.LinearSegmentedColormap.from_list('custom', colors, N=level_count)

print("Available color schemes:")
for scheme_name in color_schemes:
    print(f"  - {scheme_name}")

# Show a sample of each color scheme on a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# A single-row ramp from 0 to 1; every panel draws it through its own colormap
gradient = np.linspace(0, 1, 256).reshape(1, -1)

for i, (scheme_name, colors) in enumerate(color_schemes.items()):
    panel = axes[i]
    colormap = create_custom_colormap(colors)
    panel.imshow(gradient, aspect='auto', cmap=colormap)
    panel.set_title(scheme_name, fontsize=12, fontweight='bold')
    # Ticks carry no meaning for a color bar
    panel.set_xticks([])
    panel.set_yticks([])

plt.tight_layout()
plt.suptitle('Available Color Schemes', fontsize=16, fontweight='bold', y=1.02)
plt.show()


In [None]:
# List available CSV files.
# A missing 'data' directory is reported instead of raising, in line with the
# later cells that print a message when an individual CSV is absent. The
# listing is sorted because os.listdir returns entries in arbitrary order.
if os.path.isdir('data'):
    csv_files = sorted(f for f in os.listdir('data') if f.endswith('_rentals.csv'))
else:
    csv_files = []
    print("Data directory not found: data")

print(f"Available CSV files: {len(csv_files)}")
for file in csv_files:
    print(f"  - {file}")


In [None]:
def _write_wordcloud_svg(wordcloud, fig, svg_filename):
    """Write the cloud to `svg_filename` as vector SVG, or save `fig` if unsupported."""
    to_svg = getattr(wordcloud, 'to_svg', None)
    if to_svg is None:
        # No vector export on this wordcloud release: keep the figure-based SVG.
        fig.savefig(svg_filename, format='svg', bbox_inches='tight',
                    facecolor='white', edgecolor='none')
        return

    try:
        # Embedding the font keeps the layout stable on machines without it.
        # NOTE(review): embedding presumably needs the optional fontTools
        # package; confirm against the installed wordcloud version.
        svg_markup = to_svg(embed_font=True)
    except ImportError:
        svg_markup = to_svg()

    with open(svg_filename, 'w', encoding='utf-8') as svg_file:
        svg_file.write(svg_markup)


def generate_improved_wordcloud(df, neighborhood, color_scheme='cool_blues', export_svg=True):
    """Generate, display and save a word cloud built from listing descriptions.

    All non-null descriptions are joined, lower-cased and tokenized. A token is
    kept when it is alphabetic, longer than two characters, and in neither the
    NLTK English stopword list nor ``real_estate_stopwords``. The resulting
    frequencies drive the cloud.

    Args:
        df: DataFrame expected to contain a 'description' column.
        neighborhood: Display name used in the title and messages; spaces are
            replaced by underscores in the output file names.
        color_scheme: Key into ``color_schemes``; an unknown key raises KeyError.
        export_svg: When true, an SVG is written next to the PNG.

    Returns:
        A Counter of the filtered tokens, or None when there is nothing to
        plot (empty frame, missing column, no descriptions, or no tokens left
        after filtering).

    Side effects:
        Writes ``data/<name>_description_wordcloud.png`` (and ``.svg``), shows
        the figure, and prints progress plus the 15 most common words.
        The SVG is produced with ``WordCloud.to_svg()`` so the words stay
        vector text; it holds the cloud only, without the figure title. Saving
        the matplotlib figure as SVG would embed the rendered cloud as a
        bitmap, which does not scale.
    """
    if df.empty or 'description' not in df.columns:
        print(f"No description data for word cloud in {neighborhood}")
        return

    # Combine all descriptions
    descriptions = df['description'].dropna().astype(str)
    if len(descriptions) == 0:
        print(f"No descriptions available for {neighborhood}")
        return

    all_text = ' '.join(descriptions)

    # Tokenize, then drop punctuation/numbers, very short tokens and stopwords
    excluded_words = set(stopwords.words('english')).union(real_estate_stopwords)
    filtered_tokens = [
        word
        for word in word_tokenize(all_text.lower())
        if word.isalpha() and len(word) > 2 and word not in excluded_words
    ]

    if not filtered_tokens:
        print(f"No meaningful words found for {neighborhood}")
        return

    # Create frequency distribution
    word_freq = Counter(filtered_tokens)

    # Create custom colormap
    custom_colormap = create_custom_colormap(color_schemes[color_scheme])

    # Generate word cloud with improved parameters
    wordcloud = WordCloud(
        width=1200,
        height=800,
        background_color='white',
        max_words=150,
        contour_width=2,
        contour_color='#374151',
        colormap=custom_colormap,
        relative_scaling=0.5,
        min_font_size=12,
        max_font_size=200,
        random_state=42,  # For reproducible results
        prefer_horizontal=0.7,  # Mix of horizontal and vertical words
        collocations=False  # Don't repeat phrases
    )
    wordcloud.generate_from_frequencies(word_freq)

    # Create figure
    fig, ax = plt.subplots(figsize=(16, 10), dpi=300)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Most Common Words in {neighborhood} Rental Listings\n(Total: {len(df)} listings)',
                 fontsize=20, fontweight='bold', pad=20, color='#1f2937')

    # The frame is passed in, so the output directory may not exist yet
    os.makedirs('data', exist_ok=True)
    file_stem = f'data/{neighborhood.replace(" ", "_")}_description_wordcloud'

    # Save as PNG (high resolution)
    png_filename = f'{file_stem}.png'
    fig.savefig(png_filename, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
    print(f"PNG word cloud saved to {png_filename}")

    # Save as SVG if requested
    if export_svg:
        svg_filename = f'{file_stem}.svg'
        _write_wordcloud_svg(wordcloud, fig, svg_filename)
        print(f"SVG word cloud saved to {svg_filename}")

    plt.show()
    # Release the large 300-dpi figure; this function is called once per neighborhood
    plt.close(fig)

    # Print top words
    print(f"\nTop 15 words in {neighborhood}:")
    for word, count in word_freq.most_common(15):
        print(f"  {word}: {count}")

    return word_freq

print("Word cloud generation function defined!")


In [None]:
## Generate Word Clouds for Each Neighborhood

Now we'll generate word clouds for each neighborhood with different color schemes.


In [None]:
# Beverly Hills - Cool Blues
neighborhood = 'Beverly_Hills'
csv_file = f'data/{neighborhood}_rentals.csv'
color_scheme = 'cool_blues'

if os.path.exists(csv_file):
    print(f"Processing {neighborhood} with {color_scheme} color scheme")
    print("="*60)

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} listings")

    # Show sample data. The column check avoids a KeyError on a CSV without
    # 'description'; generate_improved_wordcloud reports that case itself.
    if 'description' in df.columns:
        print("\nSample descriptions:")
        for j, desc in enumerate(df['description'].head(2)):
            if pd.notna(desc) and desc != 'N/A':
                # str() keeps the slice valid if the value was parsed as a non-string
                print(f"{j+1}. {str(desc)[:150]}...")

    # Generate word cloud
    word_freq_bh = generate_improved_wordcloud(df, neighborhood.replace('_', ' '), color_scheme)

else:
    print(f"CSV file not found: {csv_file}")


In [None]:
# Koreatown - Warm Reds
neighborhood = 'Koreatown'
csv_file = f'data/{neighborhood}_rentals.csv'
color_scheme = 'warm_reds'

if os.path.exists(csv_file):
    print(f"Processing {neighborhood} with {color_scheme} color scheme")
    print("="*60)

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} listings")

    # Show sample data. The column check avoids a KeyError on a CSV without
    # 'description'; generate_improved_wordcloud reports that case itself.
    if 'description' in df.columns:
        print("\nSample descriptions:")
        for j, desc in enumerate(df['description'].head(2)):
            if pd.notna(desc) and desc != 'N/A':
                # str() keeps the slice valid if the value was parsed as a non-string
                print(f"{j+1}. {str(desc)[:150]}...")

    # Generate word cloud
    word_freq_kt = generate_improved_wordcloud(df, neighborhood.replace('_', ' '), color_scheme)

else:
    print(f"CSV file not found: {csv_file}")


In [None]:
# Echo Park - Greens
neighborhood = 'Echo_Park'
csv_file = f'data/{neighborhood}_rentals.csv'
color_scheme = 'greens'

if os.path.exists(csv_file):
    print(f"Processing {neighborhood} with {color_scheme} color scheme")
    print("="*60)

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} listings")

    # Show sample data. The column check avoids a KeyError on a CSV without
    # 'description'; generate_improved_wordcloud reports that case itself.
    if 'description' in df.columns:
        print("\nSample descriptions:")
        for j, desc in enumerate(df['description'].head(2)):
            if pd.notna(desc) and desc != 'N/A':
                # str() keeps the slice valid if the value was parsed as a non-string
                print(f"{j+1}. {str(desc)[:150]}...")

    # Generate word cloud
    word_freq_ep = generate_improved_wordcloud(df, neighborhood.replace('_', ' '), color_scheme)

else:
    print(f"CSV file not found: {csv_file}")


In [None]:
# Generate word clouds for additional neighborhoods (if available).
# The two lists are parallel: each neighborhood is drawn with the scheme at
# the same position.
additional_neighborhoods = ['Pacoima', 'Watts', 'Boyle_Heights']
additional_color_schemes = ['purples', 'oranges', 'teals']

for neighborhood, color_scheme in zip(additional_neighborhoods, additional_color_schemes):
    csv_file = f'data/{neighborhood}_rentals.csv'

    # Skip neighborhoods whose data has not been collected
    if not os.path.exists(csv_file):
        print(f"\nCSV file not found: {csv_file}")
        continue

    print(f"\nProcessing {neighborhood} with {color_scheme} color scheme")
    print("="*60)

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} listings")

    # Generate word cloud
    display_name = neighborhood.replace('_', ' ')
    word_freq = generate_improved_wordcloud(df, display_name, color_scheme)


In [None]:
## Summary

This notebook has generated improved word clouds with:
- **Cohesive color schemes** for each neighborhood
- **High-resolution PNG** files for web use
- **SVG files** for infinite scalability
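- **Comparative analysis** capabilities: each call returns a per-neighborhood word-frequency counter that can be compared across neighborhoods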

The color schemes used are:
- **Beverly Hills**: Cool Blues (luxury and exclusivity)
- **Koreatown**: Warm Reds (urban energy)
- **Echo Park**: Greens (creative and vibrant)
- **Pacoima**: Purples (family-oriented)
- **Watts**: Oranges (opportunity and growth)
- **Boyle Heights**: Teals (cultural heritage)


In [None]:
# List all generated files.
# Every expected output is checked, and the closing success message is only
# printed when all of them exist; otherwise the missing ones are reported.
print("Generated files:")
print("=" * 50)

neighborhoods = ['Beverly_Hills', 'Koreatown', 'Echo_Park', 'Pacoima', 'Watts', 'Boyle_Heights']

generated_files = []
missing_files = []

for neighborhood in neighborhoods:
    for extension in ('png', 'svg'):
        output_file = f'data/{neighborhood}_description_wordcloud.{extension}'
        if os.path.exists(output_file):
            generated_files.append(output_file)
            print(f"  ✓ {output_file}")
        else:
            missing_files.append(output_file)
            print(f"  ✗ {output_file} (not found)")

if not missing_files:
    print("\nAll word clouds have been generated with cohesive color schemes and exported in both PNG and SVG formats!")
    print("The SVG files are perfect for your scrollytelling web page since they scale infinitely without quality loss.")
else:
    expected_total = len(generated_files) + len(missing_files)
    print(f"\nFound {len(generated_files)} of {expected_total} expected files; "
          f"{len(missing_files)} missing (see the list above).")
