In [3]:
from prophet import Prophet
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

Importing plotly failed. Interactive plots will not work.


In [7]:
import os

# Source file holding the aggregated sentiment rows for every city.
file_path = 'aggregated_sentiment_analysis.csv'

# Root folder for the per-city / per-room-type splits. The forecasting and
# box-plot cells read from "./data/<city>/<room type>/", so the splits must be
# written under the same root.
data_root = './data'

# Load the entire dataset
df = pd.read_csv(file_path)

# Cities of interest (slugs as they are expected to appear in the 'city' column)
cities = ['new-york-city', 'san-francisco', 'austin', 'boston', 'barcelona', 'nashville',
          'milan', 'munich', 'istanbul', 'london', 'riga', 'hong-kong', 'bangkok',
          'singapore', 'melbourne', 'mexico-city', 'rio-de-janeiro', 'belize',
          'santiago', 'toronto']

# Write one CSV per (city, room type) combination
for city in cities:
    # Case-insensitive literal substring match; regex=False keeps the slug from
    # being interpreted as a regular expression, na=False skips missing cities.
    city_data = df[df['city'].str.contains(city, case=False, na=False, regex=False)].copy()
    city_data['city'] = city_data['city'].str.strip()

    # groupby yields nothing for an empty frame and drops NaN room types,
    # which would otherwise fail on .replace() below.
    for room_type, room_data in city_data.groupby('room_type', sort=False):
        # '/' in a room type (e.g. "Entire home/apt") would create a nested path
        safe_room_type = str(room_type).replace('/', '-')

        directory_path = f"{data_root}/{city}/{safe_room_type}"
        os.makedirs(directory_path, exist_ok=True)  # Create the directory if it doesn't exist

        file_name = f"{directory_path}/{city}_{safe_room_type}.csv"
        room_data.to_csv(file_name, index=False)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [6]:
def process_and_forecast(city, room_type, train_cutoff='2020-03-01'):
    """Fit Prophet on pre-cutoff monthly sentiment and plot forecast vs. actual.

    Reads ``./data/<city>/<room type>/<city>_<room type>.csv`` (with '/' in the
    room type replaced by '-'), z-scores the ``compound`` column over the whole
    file, averages it per calendar month, fits a Prophet model on the months up
    to ``train_cutoff`` and forecasts the remaining months at monthly frequency.
    The actual and forecast monthly series are saved as a PNG under
    ``./plotted_graph/<city>/``.

    Parameters
    ----------
    city : str
        City slug used in the data and plot paths.
    room_type : str
        Room type label; '/' is replaced by '-' when building paths.
    train_cutoff : str or datetime-like, default '2020-03-01'
        Last date (inclusive) whose rows are used for training.

    Returns
    -------
    None
        Prints a message and returns early when the data file is missing or
        when fewer than two training months have a usable value.
    """
    safe_room_type = room_type.replace('/', '-')
    file_path = f"./data/{city}/{safe_room_type}/{city}_{safe_room_type}.csv"

    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        return

    df = pd.read_csv(file_path)
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').set_index('date')

    # z-score over the full file so actual and forecast share one scale
    df['standardized_compound'] = (df['compound'] - df['compound'].mean()) / df['compound'].std()

    # Monthly averages (month-end timestamps) of the actual data
    actual_monthly_avg = df[['standardized_compound']].resample('M').mean()

    # Training rows: everything up to and including the cutoff date.
    # NOTE(review): with an inclusive cutoff on the 1st of a month, that month's
    # training average is built from a single day - confirm this is intended.
    train_cutoff = pd.to_datetime(train_cutoff)
    train_df = df[df.index <= train_cutoff]

    monthly_train_avg = train_df[['standardized_compound']].resample('M').mean().reset_index()
    monthly_train_avg.columns = ['ds', 'y']

    # Prophet refuses to fit on fewer than two non-NaN rows; skip this
    # city/room type instead of aborting the caller's loop.
    if monthly_train_avg['y'].notna().sum() < 2:
        print(f"Not enough training data up to {train_cutoff.date()}: {file_path}")
        return

    model = Prophet()
    model.fit(monthly_train_avg)

    # The model was fitted on month-end points, so forecast on the same monthly
    # grid, covering every month from the end of training to the last actual month.
    last_train_month = monthly_train_avg['ds'].max()
    last_actual_month = actual_monthly_avg.index.max()
    months_ahead = ((last_actual_month.year - last_train_month.year) * 12
                    + (last_actual_month.month - last_train_month.month))
    future = model.make_future_dataframe(periods=max(months_ahead, 0), freq='M')
    forecast = model.predict(future)
    forecast_monthly_avg = forecast.set_index('ds')['yhat']

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(actual_monthly_avg.index, actual_monthly_avg['standardized_compound'],
            label="Actual", marker='o', color='blue')
    ax.plot(forecast_monthly_avg.index, forecast_monthly_avg,
            label="Forecast", linestyle='--', color='red')
    ax.set_title(f"{room_type} Sentiment Analysis for {city.capitalize()}")
    ax.set_xlabel('Date')
    ax.set_ylabel('Standardized Sentiment Score')
    ax.legend()
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # Create the output folder only once there is something to save
    plot_dir = f"./plotted_graph/{city}"
    os.makedirs(plot_dir, exist_ok=True)
    fig.savefig(f"{plot_dir}/{city}_{safe_room_type}_sentiment_analysis.png")
    plt.close(fig)

In [8]:
# City slugs and room-type folder names to forecast.
cities = [
    'new-york-city', 'san-francisco', 'austin', 'boston', 'barcelona',
    'nashville', 'milan', 'munich', 'istanbul', 'london',
    'riga', 'hong-kong', 'bangkok', 'singapore', 'melbourne',
    'mexico-city', 'rio-de-janeiro', 'belize', 'santiago', 'toronto',
]
room_types = ['Entire home-apt', 'Private room', 'Shared room']

# Produce one forecast plot per (city, room type) pair, city-major order.
for city, room_type in [(c, r) for c in cities for r in room_types]:
    process_and_forecast(city, room_type)

22:10:33 - cmdstanpy - INFO - Chain [1] start processing
22:10:33 - cmdstanpy - INFO - Chain [1] done processing
22:10:37 - cmdstanpy - INFO - Chain [1] start processing
22:10:37 - cmdstanpy - INFO - Chain [1] done processing
22:10:38 - cmdstanpy - INFO - Chain [1] start processing
22:10:38 - cmdstanpy - INFO - Chain [1] done processing
22:10:40 - cmdstanpy - INFO - Chain [1] start processing
22:10:41 - cmdstanpy - INFO - Chain [1] done processing
22:10:43 - cmdstanpy - INFO - Chain [1] start processing
22:10:43 - cmdstanpy - INFO - Chain [1] done processing
22:10:44 - cmdstanpy - INFO - Chain [1] start processing
22:10:44 - cmdstanpy - INFO - Chain [1] done processing
22:10:48 - cmdstanpy - INFO - Chain [1] start processing
22:10:48 - cmdstanpy - INFO - Chain [1] done processing
22:10:49 - cmdstanpy - INFO - Chain [1] start processing
22:10:50 - cmdstanpy - INFO - Chain [1] done processing
22:10:50 - cmdstanpy - INFO - Chain [1] start processing
22:10:50 - cmdstanpy - INFO - Chain [1]

In [20]:
def plot_box(city, room_type):
    """Save a per-year box plot of the ``compound`` sentiment score.

    Reads ``./data/<city>/<room type>/<city>_<room type>.csv`` (with '/' in the
    room type replaced by '-'), derives the calendar year from the ``date``
    column and writes one box per year to
    ``./plotted_graph/<city>/<city>_<room type>_box_plot.png``.

    Parameters
    ----------
    city : str
        City slug used in the data and plot paths.
    room_type : str
        Room type label; '/' is replaced by '-' when building paths.

    Returns
    -------
    None
        Prints a message and returns early when the data file is missing.
    """
    safe_room_type = room_type.replace('/', '-')
    file_path = f"./data/{city}/{safe_room_type}/{city}_{safe_room_type}.csv"
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        return

    # Load data, ordered by date, with the year extracted for grouping on the x axis
    df = pd.read_csv(file_path)
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')
    df['year'] = df['date'].dt.year

    # Plot
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x=df['year'], y=df['compound'], ax=ax)
    ax.set_title(f"Yearly Sentiment Score Distribution for {city.capitalize()}, {room_type}")
    ax.set_xlabel('Year')
    ax.set_ylabel('Sentiment Score')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # Save plot to file
    plot_dir = f"./plotted_graph/{city}"
    os.makedirs(plot_dir, exist_ok=True)
    fig.savefig(f"{plot_dir}/{city}_{safe_room_type}_box_plot.png")
    plt.close(fig)


In [21]:
# City slugs and room-type folder names to plot.
cities = [
    'new-york-city', 'san-francisco', 'austin', 'boston', 'barcelona',
    'nashville', 'milan', 'munich', 'istanbul', 'london',
    'riga', 'hong-kong', 'bangkok', 'singapore', 'melbourne',
    'mexico-city', 'rio-de-janeiro', 'belize', 'santiago', 'toronto',
]
room_types = ['Entire home-apt', 'Private room', 'Shared room']

# Draw one yearly box plot per (city, room type) pair, city-major order.
for city, room_type in [(c, r) for c in cities for r in room_types]:
    plot_box(city, room_type)