In [4]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import os

def load_data(data_dir='/Users/vickychan/Desktop'):
    """Load the four Google review sentiment CSVs and tag each with its segment.

    Args:
        data_dir: Directory holding the ``Google_*_Sentiment.csv`` files.
            Defaults to the location the script originally hard-coded, so
            zero-argument calls behave as before.

    Returns:
        A 4-tuple of DataFrames in the order (high $, high $$, moderate $,
        moderate $$). Each frame gets two extra columns: ``Category``
        (e.g. ``'High $$'``) and ``Price`` (``'$'`` or ``'$$'``).

    Raises:
        FileNotFoundError: If any of the four CSV files is missing
            (raised by ``pandas.read_csv``).
    """
    # (file-name stem, Category label, Price label), in return order.
    segments = (
        ('high_dollar', 'High $', '$'),
        ('high_double_dollar', 'High $$', '$$'),
        ('mod_dollar', 'Moderate $', '$'),
        ('mod_double_dollar', 'Moderate $$', '$$'),
    )

    frames = []
    for stem, category, price in segments:
        frame = pd.read_csv(os.path.join(data_dir, f'Google_{stem}_Sentiment.csv'))
        frame['Category'] = category
        frame['Price'] = price
        frames.append(frame)

    return tuple(frames)

def process_data(high_dollar, high_double_dollar, mod_dollar, mod_double_dollar):
    """Aggregate review-level rows to one row per restaurant and score them.

    Args:
        high_dollar, high_double_dollar, mod_dollar, mod_double_dollar:
            Review-level DataFrames. Each must contain the columns
            ``Restaurant``, ``Overall Rating``, ``Review Rating``,
            ``Sentiment Score``, ``Positive %``, ``Neutral %``,
            ``Negative %``, ``Category`` and ``Price``.

    Returns:
        One DataFrame with a row per restaurant per segment, sorted by
        ``Combined_Score`` descending, with these derived columns:

        * ``Rating_Difference``: mean review rating minus overall rating.
        * ``Normalized_Sentiment``: ``(Sentiment Score + 1) * 2.5``, which
          maps a score in [-1, 1] onto [0, 5].
        * ``Combined_Score``: mean of ``Review Rating`` and
          ``Normalized_Sentiment``.
        * ``Alignment_Score``: absolute gap between the overall rating and
          the sentiment score once both are rescaled to 0-1. Despite the
          name, a larger value means a larger mismatch.
    """
    # Per-restaurant summary: values that are constant within a restaurant
    # take the first row, review-level measurements are averaged.
    per_restaurant = {
        'Overall Rating': 'first',
        'Review Rating': 'mean',
        'Sentiment Score': 'mean',
        'Positive %': 'mean',
        'Neutral %': 'mean',
        'Negative %': 'mean',
        'Category': 'first',
        'Price': 'first',
    }

    segment_stats = [
        frame.groupby('Restaurant').agg(per_restaurant).reset_index()
        for frame in (high_dollar, high_double_dollar, mod_dollar, mod_double_dollar)
    ]

    # ignore_index gives every row a unique label; without it each segment
    # restarts at 0 and label-based lookups become ambiguous.
    all_stats = pd.concat(segment_stats, ignore_index=True)

    all_stats['Rating_Difference'] = all_stats['Review Rating'] - all_stats['Overall Rating']
    all_stats['Normalized_Sentiment'] = (all_stats['Sentiment Score'] + 1) * 2.5
    all_stats['Combined_Score'] = (all_stats['Review Rating'] + all_stats['Normalized_Sentiment']) / 2

    rating_unit = all_stats['Overall Rating'] / 5
    sentiment_unit = (all_stats['Sentiment Score'] + 1) / 2
    all_stats['Alignment_Score'] = (rating_unit - sentiment_unit).abs()

    return all_stats.sort_values('Combined_Score', ascending=False)

def create_combined_score_chart(df):
    """Build a bar chart of the 25 restaurants with the highest Combined_Score.

    Args:
        df: Per-restaurant stats with ``Restaurant``, ``Combined_Score`` and
            ``Category`` columns.

    Returns:
        A plotly figure with one bar per restaurant, coloured by category.
    """
    palette = {
        'High $': '#22c55e',
        'High $$': '#3b82f6',
        'Moderate $': '#f97316',
        'Moderate $$': '#8b5cf6',
    }

    # Keep only rows belonging to one of the four known categories.
    in_known_category = df['Category'].isin(['High $', 'High $$', 'Moderate $', 'Moderate $$'])
    top_rows = df[in_known_category].sort_values('Combined_Score', ascending=False).head(25)

    chart = px.bar(
        top_rows,
        x='Restaurant',
        y='Combined_Score',
        color='Category',
        color_discrete_map=palette,
        title='Top Restaurants by Combined Score',
        labels={'Combined_Score': 'Combined Score (0-5)'},
    )
    chart.update_layout(xaxis_tickangle=-45, height=500, legend_title="Category")
    return chart

def create_price_category_charts(df):
    """Build one top-15 bar chart per price tier ($ and $$).

    Args:
        df: Per-restaurant stats with ``Restaurant``, ``Combined_Score``,
            ``Category`` and ``Price`` columns.

    Returns:
        A ``(fig_dollar, fig_double_dollar)`` tuple of plotly figures.
    """
    def top_chart(price, title, colors):
        # Top 15 restaurants of one price tier, ranked by combined score.
        rows = df[df['Price'] == price]
        rows = rows.sort_values('Combined_Score', ascending=False).head(15)
        chart = px.bar(
            rows,
            x='Restaurant',
            y='Combined_Score',
            color='Category',
            title=title,
            labels={'Combined_Score': 'Combined Score (0-5)'},
            color_discrete_map=colors,
        )
        chart.update_layout(xaxis_tickangle=-45, height=500, legend_title="Category")
        return chart

    fig_dollar = top_chart(
        '$',
        'Top $ Restaurants Overall (Both High & Moderate)',
        {'High $': '#22c55e', 'Moderate $': '#f97316'},
    )
    fig_double_dollar = top_chart(
        '$$',
        'Top $$ Restaurants Overall (Both High & Moderate)',
        {'High $$': '#3b82f6', 'Moderate $$': '#8b5cf6'},
    )
    return fig_dollar, fig_double_dollar

def create_rating_comparison_chart(df):
    """Build a grouped bar chart comparing three scores for the top 20 restaurants.

    For each of the 20 restaurants with the highest ``Combined_Score`` the
    chart shows the overall (Google) rating, the mean review rating and the
    sentiment score normalized to 0-5.

    Args:
        df: Per-restaurant stats with ``Restaurant``, ``Combined_Score``,
            ``Overall Rating``, ``Review Rating`` and ``Normalized_Sentiment``.

    Returns:
        A plotly ``go.Figure``.
    """
    leaders = df.sort_values('Combined_Score', ascending=False).head(20)

    # (column, legend name, bar colour), in drawing order.
    series = (
        ('Overall Rating', 'Google Rating', '#6C7AE0'),                       # blue
        ('Review Rating', 'Review Rating', '#6AE0AA'),                        # green
        ('Normalized_Sentiment', 'Normalized Sentiment (0-5)', '#E06A9C'),    # pink
    )

    chart = go.Figure()
    for column, legend_name, colour in series:
        chart.add_trace(go.Bar(
            x=leaders['Restaurant'],
            y=leaders[column],
            name=legend_name,
            marker_color=colour,
        ))

    chart.update_layout(
        title='Restaurant Ratings Comparison',
        xaxis=dict(title='Restaurant', tickangle=-45),
        yaxis=dict(title='Rating (0-5 scale)', range=[0, 5.5]),
        barmode='group',
        height=600,
        margin=dict(l=50, r=50, t=80, b=150),
        legend=dict(x=1.0, y=1.0),
    )
    return chart

def create_scatter_alignment_plot(df):
    """Scatter the overall rating against the sentiment score per restaurant.

    Marker size encodes ``Alignment_Score`` and colour encodes ``Category``.
    A dashed reference line marks where the 0-5 rating and the -1..+1
    sentiment score would agree exactly after rescaling.

    Args:
        df: Per-restaurant stats with ``Restaurant``, ``Overall Rating``,
            ``Sentiment Score``, ``Alignment_Score`` and ``Category``.

    Returns:
        A plotly figure.
    """
    fig = px.scatter(
        df,
        x='Overall Rating',
        y='Sentiment Score',
        color='Category',
        size='Alignment_Score',
        size_max=15,
        hover_name='Restaurant',
        title='Rating vs. Sentiment Alignment',
        labels={
            'Overall Rating': 'Google Rating (0-5)',
            'Sentiment Score': 'Sentiment Score (-1 to +1)'
        },
        color_discrete_map={
            'High $': '#22c55e',
            'High $$': '#3b82f6',
            'Moderate $': '#f97316',
            'Moderate $$': '#8b5cf6'
        }
    )

    # Series.min()/max() skip NaN. The builtins compare element by element
    # and can return NaN depending on row order, which would turn the whole
    # reference line into NaN.
    ratings = df['Overall Rating']
    x_vals = np.linspace(ratings.min() - 0.2, ratings.max() + 0.2, 100)
    y_vals = 2 * (x_vals / 5) - 1  # Convert from 0-5 scale to -1 to +1 scale

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode='lines',
            line=dict(color='grey', width=1, dash='dash'),
            name='Perfect Alignment'
        )
    )

    fig.update_layout(
        height=500,
        legend_title="Category",
    )

    return fig

def create_misalignment_chart(df):
    """Chart the 15 restaurants whose rating and sentiment disagree the most.

    Bars show ``Alignment_Score`` on the left axis. Two marker series, the
    overall rating and the normalized sentiment, are drawn against a second
    y-axis on the right.

    Args:
        df: Per-restaurant stats with ``Restaurant``, ``Alignment_Score``,
            ``Category``, ``Overall Rating`` and ``Normalized_Sentiment``.
            The input frame is not modified.

    Returns:
        A plotly figure.
    """
    worst = df.sort_values('Alignment_Score', ascending=False).head(15)

    chart = px.bar(
        worst,
        x='Restaurant',
        y='Alignment_Score',
        color='Category',
        title='Restaurants with Greatest Rating-Sentiment Misalignment',
        labels={'Alignment_Score': 'Misalignment Score'},
        color_discrete_map={
            'High $': '#22c55e',
            'High $$': '#3b82f6',
            'Moderate $': '#f97316',
            'Moderate $$': '#8b5cf6',
        },
    )

    # (column, legend name, marker style) for the overlays on the right axis.
    overlays = (
        ('Overall Rating', 'Google Rating',
         dict(color='black', symbol='circle', size=8)),
        ('Normalized_Sentiment', 'Normalized Sentiment',
         dict(color='red', symbol='star', size=10)),
    )
    for column, legend_name, marker_style in overlays:
        chart.add_trace(go.Scatter(
            x=worst['Restaurant'],
            y=worst[column],
            name=legend_name,
            mode='markers',
            marker=marker_style,
            yaxis='y2',
        ))

    chart.update_layout(
        xaxis_tickangle=-45,
        height=500,
        legend_title="Category",
        yaxis2=dict(title='Rating / Sentiment (0-5)', overlaying='y', side='right'),
    )
    return chart

def create_pie_chart(df):
    """Build a pie chart of how many restaurants fall in each category.

    Args:
        df: Per-restaurant stats with a ``Category`` column.

    Returns:
        A plotly figure whose slices show label, percentage and count.
    """
    palette = {
        'High $': '#22c55e',
        'High $$': '#3b82f6',
        'Moderate $': '#f97316',
        'Moderate $$': '#8b5cf6',
    }

    # Two-column frame: category label and number of restaurants in it.
    tally = df['Category'].value_counts().reset_index()
    tally.columns = ['Category', 'Count']

    chart = px.pie(
        tally,
        values='Count',
        names='Category',
        title='Distribution of Restaurant Categories',
        color='Category',
        color_discrete_map=palette,
    )
    chart.update_layout(height=500, legend_title="Category")
    chart.update_traces(textinfo='label+percent+value', textposition='inside')
    return chart

def export_html_files(output_dir='/Users/vickychan/Documents/newer analysis'):
    """Run the whole pipeline and write the HTML report to disk.

    Loads and aggregates the review data, builds every chart, picks the
    headline restaurants for the "Key Findings" section, then writes
    ``index.html`` plus one standalone HTML file per chart. ``index.html``
    embeds the chart files through iframes with relative ``src`` paths, so
    all files are written to the same directory.

    Args:
        output_dir: Directory to write the report into; created if missing.
            Defaults to the location the script originally hard-coded.

    Raises:
        IndexError: If a category or price tier has no restaurants, because
            the top row of an empty selection is requested.
    """
    from html import escape

    high_dollar, high_double_dollar, mod_dollar, mod_double_dollar = load_data()
    all_stats = process_data(high_dollar, high_double_dollar, mod_dollar, mod_double_dollar)

    combined_score = create_combined_score_chart(all_stats)
    dollar_chart, double_dollar_chart = create_price_category_charts(all_stats)
    rating_comparison = create_rating_comparison_chart(all_stats)
    scatter_alignment = create_scatter_alignment_plot(all_stats)
    misalignment_chart = create_misalignment_chart(all_stats)
    pie_chart = create_pie_chart(all_stats)

    def top_row(frame, column, ascending=False):
        # First row of `frame` after sorting by `column`.
        return frame.sort_values(column, ascending=ascending).iloc[0]

    def name_of(row):
        # Restaurant names come from the CSV data, so escape them before
        # they are interpolated into HTML.
        return escape(str(row['Restaurant']))

    category = all_stats['Category']
    price = all_stats['Price']

    best_high_dollar = top_row(all_stats[category == 'High $'], 'Combined_Score')
    best_high_double_dollar = top_row(all_stats[category == 'High $$'], 'Combined_Score')
    best_mod_dollar = top_row(all_stats[category == 'Moderate $'], 'Combined_Score')
    best_mod_double_dollar = top_row(all_stats[category == 'Moderate $$'], 'Combined_Score')

    # Best overall $ and $$ restaurants (across high and moderate categories)
    best_dollar_overall = top_row(all_stats[price == '$'], 'Combined_Score')
    best_double_dollar_overall = top_row(all_stats[price == '$$'], 'Combined_Score')

    # Restaurants with the most positive and most negative sentiment
    most_positive = top_row(all_stats, 'Sentiment Score')
    most_negative = top_row(all_stats, 'Sentiment Score', ascending=True)

    # Most misaligned restaurants (largest Alignment_Score)
    most_misaligned = top_row(all_stats, 'Alignment_Score')
    most_misaligned_dollar = top_row(
        all_stats[category.isin(['High $', 'Moderate $'])], 'Alignment_Score')
    most_misaligned_double_dollar = top_row(
        all_stats[category.isin(['High $$', 'Moderate $$'])], 'Alignment_Score')

    # Report page; doubled braces are literal CSS braces inside the f-string.
    html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Restaurant Sentiment Analysis & Review Insights</title>
    <style>
        body {{ 
            font-family: Arial, sans-serif; 
            margin: 20px;
        }}
        h1 {{ 
            color: #333;
            text-align: center;
        }}
        .section {{
            margin-bottom: 30px;
        }}
        .highlight {{
            font-weight: bold;
            color: #2980b9;
        }}
        .special-highlight {{
            font-weight: bold;
            color: #e74c3c;
            font-size: 1.1em;
        }}
        .legend {{
            margin: 20px 0;
            text-align: center;
        }}
        .legend-item {{
            display: inline-block;
            margin-right: 15px;
        }}
        .legend-color {{
            display: inline-block;
            width: 15px;
            height: 15px;
            margin-right: 5px;
            vertical-align: middle;
        }}
        .row {{
            display: flex;
            flex-wrap: wrap;
            margin: 0 -10px;
        }}
        .column {{
            flex: 50%;
            padding: 0 10px;
        }}
        @media screen and (max-width: 768px) {{
            .column {{
                flex: 100%;
            }}
        }}
    </style>
</head>
<body>
    <h1>Restaurant Sentiment Analysis & Review Insights</h1>
    
    <div class="legend">
        <div class="legend-item">
            <span class="legend-color" style="background-color: #22c55e;"></span>
            <span>High $ (4+ stars, $)</span>
        </div>
        <div class="legend-item">
            <span class="legend-color" style="background-color: #3b82f6;"></span>
            <span>High $$ (4+ stars, $$)</span>
        </div>
        <div class="legend-item">
            <span class="legend-color" style="background-color: #f97316;"></span>
            <span>Moderate $ (3-3.9 stars, $)</span>
        </div>
        <div class="legend-item">
            <span class="legend-color" style="background-color: #8b5cf6;"></span>
            <span>Moderate $$ (3-3.9 stars, $$)</span>
        </div>
    </div>
    
    <div class="section">
        <h2>Key Findings</h2>
        <div class="row">
            <div class="column">
                <h3>Best Restaurants by Category</h3>
                <ul>
                    <li>Best High $ restaurant: <span class="highlight">{name_of(best_high_dollar)}</span> with a combined score of <span class="highlight">{best_high_dollar['Combined_Score']:.2f}/5</span></li>
                    <li>Best High $$ restaurant: <span class="highlight">{name_of(best_high_double_dollar)}</span> with a combined score of <span class="highlight">{best_high_double_dollar['Combined_Score']:.2f}/5</span></li>
                    <li>Best Moderate $ restaurant: <span class="highlight">{name_of(best_mod_dollar)}</span> with a combined score of <span class="highlight">{best_mod_dollar['Combined_Score']:.2f}/5</span></li>
                    <li>Best Moderate $$ restaurant: <span class="highlight">{name_of(best_mod_double_dollar)}</span> with a combined score of <span class="highlight">{best_mod_double_dollar['Combined_Score']:.2f}/5</span></li>
                </ul>
                
                <h3>Best Overall By Price Category</h3>
                <ul>
                    <li>Best Overall $ Restaurant: <span class="special-highlight">{name_of(best_dollar_overall)}</span> ({best_dollar_overall['Category']}) with a combined score of <span class="special-highlight">{best_dollar_overall['Combined_Score']:.2f}/5</span></li>
                    <li>Best Overall $$ Restaurant: <span class="special-highlight">{name_of(best_double_dollar_overall)}</span> ({best_double_dollar_overall['Category']}) with a combined score of <span class="special-highlight">{best_double_dollar_overall['Combined_Score']:.2f}/5</span></li>
                </ul>
            </div>
            <div class="column">
                <h3>Sentiment Analysis</h3>
                <ul>
                    <li>Restaurant with most positive TextBlob sentiment: <span class="highlight">{name_of(most_positive)}</span> ({most_positive['Category']}) (score: <span class="highlight">{most_positive['Sentiment Score']:.3f}</span>)</li>
                    <li>Restaurant with most negative TextBlob sentiment: <span class="highlight">{name_of(most_negative)}</span> ({most_negative['Category']}) (score: <span class="highlight">{most_negative['Sentiment Score']:.3f}</span>)</li>
                </ul>
                <h3>Rating-Sentiment Misalignment</h3>
                <ul>
                    <li>Most misaligned overall: <span class="highlight">{name_of(most_misaligned)}</span> ({most_misaligned['Category']}) - Google Rating: {most_misaligned['Overall Rating']:.1f}, Sentiment: {most_misaligned['Sentiment Score']:.3f}</li>
                    <li>Most misaligned $ restaurant: <span class="highlight">{name_of(most_misaligned_dollar)}</span> ({most_misaligned_dollar['Category']}) - Google Rating: {most_misaligned_dollar['Overall Rating']:.1f}, Sentiment: {most_misaligned_dollar['Sentiment Score']:.3f}</li>
                    <li>Most misaligned $$ restaurant: <span class="highlight">{name_of(most_misaligned_double_dollar)}</span> ({most_misaligned_double_dollar['Category']}) - Google Rating: {most_misaligned_double_dollar['Overall Rating']:.1f}, Sentiment: {most_misaligned_double_dollar['Sentiment Score']:.3f}</li>
                </ul>
            </div>
        </div>
    </div>
    
    <div class="section">
        <h2>Best Overall Restaurants By Price Category</h2>
        <div class="row">
            <div class="column">
                <h3>Top $ Restaurants (Both High & Moderate)</h3>
                <iframe src="dollar_chart.html" width="100%" height="500px" frameborder="0"></iframe>
            </div>
            <div class="column">
                <h3>Top $$ Restaurants (Both High & Moderate)</h3>
                <iframe src="double_dollar_chart.html" width="100%" height="500px" frameborder="0"></iframe>
            </div>
        </div>
    </div>
    
    <div class="section">
        <h2>Restaurant Ratings Comparison</h2>
        <iframe src="rating_comparison.html" width="100%" height="600px" frameborder="0"></iframe>
    </div>
    
    <div class="row">
        <div class="column">
            <div class="section">
                <h2>Distribution of Restaurant Categories</h2>
                <iframe src="pie_chart.html" width="100%" height="500px" frameborder="0"></iframe>
            </div>
        </div>
        <div class="column">
            <div class="section">
                <h2>Top Restaurants by Combined Score</h2>
                <iframe src="combined_score.html" width="100%" height="500px" frameborder="0"></iframe>
            </div>
        </div>
    </div>
    
    <div class="section">
        <h2>Rating vs. Sentiment Alignment</h2>
        <p>Bubble size indicates the degree of misalignment (larger = more misaligned)</p>
        <iframe src="scatter_alignment.html" width="100%" height="500px" frameborder="0"></iframe>
    </div>
    
    <div class="section">
        <h2>Restaurants with Greatest Rating-Sentiment Misalignment</h2>
        <p>Black dots = Google Rating, Red stars = Normalized Sentiment (both on 0-5 scale)</p>
        <iframe src="misalignment.html" width="100%" height="500px" frameborder="0"></iframe>
    </div>
</body>
</html>
    """

    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    # Write index.html as UTF-8 explicitly so non-ASCII restaurant names do
    # not depend on the platform's default encoding; the page declares the
    # same charset in its <head>.
    with open(os.path.join(output_dir, "index.html"), "w", encoding="utf-8") as f:
        f.write(html_content)

    # File names must match the iframe src attributes in the template above.
    chart_files = {
        "combined_score.html": combined_score,
        "dollar_chart.html": dollar_chart,
        "double_dollar_chart.html": double_dollar_chart,
        "rating_comparison.html": rating_comparison,
        "scatter_alignment.html": scatter_alignment,
        "misalignment.html": misalignment_chart,
        "pie_chart.html": pie_chart,
    }
    for file_name, figure in chart_files.items():
        figure.write_html(os.path.join(output_dir, file_name))
    
# Build every chart and write the HTML report files when this cell runs.
export_html_files()

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re

def load_data(data_dir='/Users/vickychan/Desktop'):
    """Load the four Google review sentiment CSVs into one DataFrame.

    Args:
        data_dir: Directory holding the ``Google_*_Sentiment.csv`` files.
            Defaults to the location the script originally hard-coded, so
            zero-argument calls behave as before.

    Returns:
        All rows of the four files stacked in the order high $, high $$,
        moderate $, moderate $$, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If any of the four CSV files is missing
            (raised by ``pandas.read_csv``).
    """
    # Imported locally so this function does not rely on imports made
    # elsewhere in the notebook.
    import os

    stems = ('high_dollar', 'high_double_dollar', 'mod_dollar', 'mod_double_dollar')
    frames = [
        pd.read_csv(os.path.join(data_dir, f'Google_{stem}_Sentiment.csv'))
        for stem in stems
    ]

    # ignore_index keeps row labels unique across the four source files.
    return pd.concat(frames, ignore_index=True)

def clean_text(text):
    """Normalize review text for word-frequency analysis.

    Lower-cases the text, replaces every character that is neither a word
    character nor whitespace with a space, removes digit runs, and drops a
    small fixed set of stop words.

    Args:
        text: The raw text. Anything that is not a ``str`` yields ``""``.

    Returns:
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # stopwords
    ignored = {'a', 'an', 'the', 'and', 'but', 'if', 'or', 'of', 'to', 'in', 'is', 'it'}

    without_punctuation = re.sub(r'[^\w\s]', ' ', text.lower())
    without_digits = re.sub(r'\d+', '', without_punctuation)

    return ' '.join(
        token for token in without_digits.split() if token not in ignored
    )

def create_wordclouds():
    """Render word clouds for positive and negative reviews and save them as PNGs.

    Loads all reviews, splits them on the ``Sentiment`` column, prints the
    review counts, builds one word cloud per sentiment from the
    ``Cleaned Review`` text, and writes three images to the Desktop: one per
    sentiment and a side-by-side comparison.
    """
    reviews = load_data()

    # Split by sentiment
    positives = reviews[reviews['Sentiment'] == 'Positive']
    negatives = reviews[reviews['Sentiment'] == 'Negative']

    print(f"Total reviews: {len(reviews)}")
    print(f"Positive reviews: {len(positives)}")
    print(f"Negative reviews: {len(negatives)}")

    def corpus(frame):
        # One cleaned string containing every non-null review in the frame.
        return clean_text(' '.join(frame['Cleaned Review'].dropna().astype(str)))

    def build_cloud(text, colormap, contour_color):
        # Both clouds share everything except colour map and contour colour.
        return WordCloud(
            width=1000,
            height=600,
            background_color='white',
            colormap=colormap,
            max_words=100,
            contour_width=1,
            contour_color=contour_color,
            random_state=42,
        ).generate(text)

    def save_single(cloud, title, path):
        # Draw one cloud on its own figure and write it to `path`.
        plt.figure(figsize=(14, 8))
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(title, fontsize=18)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    positive_title = 'Frequent Words in Positive Sentiment Reviews'
    negative_title = 'Frequent Words in Negative Sentiment Reviews'

    positive_cloud = build_cloud(corpus(positives), 'viridis', 'steelblue')
    negative_cloud = build_cloud(corpus(negatives), 'plasma', 'firebrick')

    save_single(positive_cloud, positive_title, '/Users/vickychan/Desktop/positive_wordcloud.png')
    save_single(negative_cloud, negative_title, '/Users/vickychan/Desktop/negative_wordcloud.png')

    # Side-by-side comparison of the two clouds
    _, axes = plt.subplots(1, 2, figsize=(20, 10))
    panels = zip(axes, (positive_cloud, negative_cloud), (positive_title, negative_title))
    for axis, cloud, title in panels:
        axis.imshow(cloud, interpolation='bilinear')
        axis.set_title(title, fontsize=18)
        axis.axis('off')

    plt.tight_layout()
    plt.savefig('/Users/vickychan/Desktop/wordcloud_comparison.png')
    plt.close()

# Generate the word-cloud images only when run as the main program,
# not when this code is imported as a module.
if __name__ == "__main__":
    create_wordclouds()

Total reviews: 475
Positive reviews: 425
Negative reviews: 46
