In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# datasets
# Forward slashes are used in the paths: they work on every platform and avoid
# backslash escapes ('\u' inside a normal string literal starts a unicode escape
# and is rejected by the parser; '\D' is an invalid escape sequence).
# Undecodable bytes are replaced instead of raising (encoding_errors='replace').
disney_df = pd.read_csv('data/DisneylandReviews.csv', encoding='utf-8', encoding_errors='replace')

uss_df = pd.read_csv('data/universal_studio_branches.csv', encoding='utf-8', encoding_errors='replace')

# basic info
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("Disneyland Reviews:")
disney_df.info()
print("\nUniversal Studios Reviews:")
uss_df.info()

# rename columns
# Map both raw schemas onto one shared set of snake_case column names.
disney_renamed = disney_df.rename(columns={
    'Review_ID': 'review_id',
    'Rating': 'rating',
    'Year_Month': 'date',
    'Reviewer_Location': 'reviewer_location',
    'Review_Text': 'review_text',
    'Branch': 'branch'
})

# NOTE(review): 'reviewer' is mapped onto 'reviewer_location'; presumably that column
# holds reviewer names rather than places - confirm before relying on it for
# location-based analysis.
# 'review_text' already has its target name, so it needs no entry here.
uss_renamed = uss_df.rename(columns={
    'reviewer': 'reviewer_location',
    'written_date': 'date',
    'title': 'review_title'
})

# tag label for disney and uss
disney_renamed['park_type'] = 'Disney'
uss_renamed['park_type'] = 'USS'

# add missing columns
if 'review_id' not in uss_renamed.columns:
    # Generate synthetic IDs that start just above the largest existing Disney ID.
    # Offsetting by the Disney row count would only be collision-free if the Disney
    # IDs were 0..n-1, which nothing here guarantees.
    disney_ids = disney_renamed['review_id']
    first_free_id = int(disney_ids.max()) + 1 if disney_ids.notna().any() else 0
    uss_renamed['review_id'] = np.arange(first_free_id, first_free_id + len(uss_renamed))
if 'review_title' not in disney_renamed.columns:
    # No title column on the Disney side; keep the column so both frames line up.
    disney_renamed['review_title'] = np.nan

# select common columns and merge datasets
# Both frames are reduced to the same ordered column list so the concatenation
# lines up column-for-column.
common_columns = ['review_id', 'rating', 'date', 'reviewer_location', 'review_text', 'review_title', 'branch', 'park_type']
disney_common = disney_renamed[common_columns]
uss_common = uss_renamed[common_columns]

# merge datasets
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
combined_df = pd.concat([disney_common, uss_common], ignore_index=True)

# check merged dataset
# info() prints its own report and returns None, so it is not wrapped in print().
print("\nMerged dataset info:")
combined_df.info()
print(combined_df.head())

Disneyland Reviews:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB
None

Universal Studios Reviews:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50904 entries, 0 to 50903
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   reviewer      50904 non-null  object 
 1   rating        50904 non-null  float64
 2   written_date  50904 non-null  object 
 3   title         50904 non-null  object 
 4   review_text   50904 non-null  object 
 5  

In [6]:
# standardize date
def standardize_date(date_str):
    """Normalize a raw review date value to ISO ``YYYY-MM-DD`` text.

    Two input shapes are recognised:

    * ``"<year>-<month>"`` (two numeric parts around a single hyphen) is pinned
      to the first day of that month and zero-padded, e.g. ``"2019-4"`` ->
      ``"2019-04-01"``.
    * any other string is parsed with ``pd.to_datetime`` and re-formatted as
      ``%Y-%m-%d``.

    Parameters
    ----------
    date_str : object
        Raw value from the ``date`` column.

    Returns
    -------
    str or float
        The ISO date string, or ``np.nan`` when the value is not a string,
        has an out-of-range month, or cannot be parsed.
    """
    if not isinstance(date_str, str):
        return np.nan

    # Year-month form: validate the parts instead of blindly appending "-01",
    # and zero-pad so every returned string has the same layout.
    parts = [part.strip() for part in date_str.split('-')]
    if len(parts) == 2 and all(part.isdecimal() for part in parts):
        year, month = int(parts[0]), int(parts[1])
        if 1 <= month <= 12:
            return f"{year:04d}-{month:02d}-01"
        return np.nan

    # Free-form date text. Only parsing failures are caught; a bare `except`
    # would also hide KeyboardInterrupt and genuine programming errors.
    try:
        parsed = pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        return np.nan

    # Empty-ish input can parse to NaT, which has no strftime representation.
    if pd.isna(parsed):
        return np.nan
    return parsed.strftime('%Y-%m-%d')

# Parse every raw date with standardize_date, then convert the resulting strings to
# datetime64; values that could not be standardized end up as NaT.
combined_df['standardized_date'] = combined_df['date'].apply(standardize_date)
combined_df['standardized_date'] = pd.to_datetime(combined_df['standardized_date'], errors='coerce')

# extract year and month
combined_df['year'] = combined_df['standardized_date'].dt.year
combined_df['month'] = combined_df['standardized_date'].dt.month

# handle missing values
print(f"Before handling missing values: \n{combined_df.isnull().sum()}")

# fill missing review titles
combined_df['review_title'] = combined_df['review_title'].fillna('No Title')

# convert rating to numeric (unparseable values become NaN)
combined_df['rating'] = pd.to_numeric(combined_df['rating'], errors='coerce')

# delete rows without review text or without a usable rating.
# A NaN rating cannot be labelled: `NaN <= 4` is False, so such rows would
# otherwise be silently counted as positive experiences.
# .copy() makes the result an independent frame before new columns are assigned.
combined_df = combined_df.dropna(subset=['review_text', 'rating']).copy()

print(f"After handling missing values: \n{combined_df.isnull().sum()}")

# create negative experience label (rating <= 4 is negative experience)
# NOTE(review): with this threshold every rating of 4 or lower counts as negative -
# confirm that this cut-off is intended.
combined_df['bad_experience'] = (combined_df['rating'] <= 4).astype(int)

# check the distribution of negative experience
print("\nNegative experience distribution:")
print(combined_df['bad_experience'].value_counts())
print(f"Negative experience ratio: {combined_df['bad_experience'].mean():.2%}")

Before handling missing values: 
review_id               0
rating                  0
date                    0
reviewer_location       0
review_text             0
review_title            0
branch                  0
park_type               0
standardized_date    2613
year                 2613
month                2613
bad_experience          0
dtype: int64
After handling missing values: 
review_id               0
rating                  0
date                    0
reviewer_location       0
review_text             0
review_title            0
branch                  0
park_type               0
standardized_date    2613
year                 2613
month                2613
bad_experience          0
dtype: int64

Negative experience distribution:
bad_experience
0    51348
1    42212
Name: count, dtype: int64
Negative experience ratio: 45.12%


In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

# set visualization style
plt.style.use('seaborn-v0_8-whitegrid')
# sns.set() re-applies a complete seaborn theme; without an explicit style it falls
# back to its default ('darkgrid') and overrides the whitegrid style chosen above.
sns.set(style='whitegrid', font_scale=1.2)

# 1. rating distribution
# Count of reviews per rating value, split by park type; the figure is written to
# disk and closed rather than shown inline.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=combined_df, x='rating', hue='park_type', ax=ax)
ax.set_title('Disney and USS rating distribution')
ax.set_xlabel('rating')
ax.set_ylabel('number')
fig.savefig('rating_distribution.png')
plt.close(fig)

# 2. rating distribution of different branches
# One box per branch, coloured by park type.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=combined_df, x='branch', y='rating', hue='park_type', ax=ax)
ax.set_title('rating distribution of different branches')
ax.set_xlabel('branch')
ax.set_ylabel('rating')
# Tilt the branch names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('branch_ratings.png')
plt.close(fig)

# 3. rating trend over time
# Mean rating per (year, month, park_type); rows with a missing year/month are
# excluded because groupby drops NaN keys by default.
monthly_ratings = (
    combined_df
    .groupby(['year', 'month', 'park_type'])['rating']
    .mean()
    .reset_index()
)
# Rebuild a real timestamp (first day of the month) to use as the x axis.
month_start_parts = monthly_ratings[['year', 'month']].assign(day=1)
monthly_ratings['date'] = pd.to_datetime(month_start_parts)

fig, ax = plt.subplots(figsize=(16, 6))
# One line per park type, in order of first appearance.
for park in monthly_ratings['park_type'].unique():
    park_data = monthly_ratings[monthly_ratings['park_type'] == park]
    ax.plot(park_data['date'], park_data['rating'], label=park)
ax.set_title('rating trend over time')
ax.set_xlabel('date')
ax.set_ylabel('average rating')
ax.legend()
fig.tight_layout()
fig.savefig('rating_trend.png')
plt.close(fig)

# 4. rating difference by different locations
# Keep only the 15 most frequent reviewer_location values so the plot stays readable.
location_counts = combined_df['reviewer_location'].value_counts()
top_locations = location_counts.head(15).index
in_top_locations = combined_df['reviewer_location'].isin(top_locations)
location_df = combined_df[in_top_locations]

fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=location_df, x='reviewer_location', y='rating', ax=ax)
ax.set_title('rating distribution by different locations')
ax.set_xlabel('reviewer location')
ax.set_ylabel('rating')
# Tilt the location names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('location_ratings.png')
plt.close(fig)

# 5. relationship between negative experience and branches
# The mean of the 0/1 bad_experience flag is the share of negative reviews per
# (branch, park_type) pair.
bad_exp_by_branch = (
    combined_df
    .groupby(['branch', 'park_type'])['bad_experience']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=bad_exp_by_branch, x='branch', y='bad_experience', hue='park_type', ax=ax)
ax.set_title('negative experience ratio by branches')
ax.set_xlabel('branch')
ax.set_ylabel('negative experience ratio')
# Tilt the branch names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('bad_experience_by_branch.png')
plt.close(fig)