In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load Data

In [2]:
from google.colab import drive

# Mount Google Drive so the shared dataset is reachable under /content/drive.
drive.mount('/content/drive')

# Path to the reviews CSV on the shared Google Drive.
file_path = '/content/drive/MyDrive/CMPE-256-Shared/data/combined_filtered_reviews.csv'

try:
  # Read the CSV file into a pandas DataFrame.
  # This typically takes around 3 minutes.
  df = pd.read_csv(file_path)

  print("Loading data successfully")

except FileNotFoundError:
  print(f"Error: File not found at {file_path}. Please check the file path.")
  # Re-raise so the notebook stops here: if the error were swallowed, `df`
  # would stay undefined and later cells would fail with an unrelated NameError.
  raise
except pd.errors.ParserError:
  print(f"Error: Could not parse the file at {file_path}.  Is it a valid CSV file?")
  raise
except Exception as e:
  print(f"An unexpected error occurred: {e}")
  raise

Mounted at /content/drive
Loading data successfully


In [3]:
# Show one randomly drawn row as a quick sanity check of the loaded columns.
df.sample(n=1)

Unnamed: 0,date,rating,title,text,property_dict,hotel_id,author_id
618256,2013-10-01T00:00:00,4.0,A Nice Haven in the Storm,Our visit to Ucluelet was framed by the area's...,"{'service': 4.0, 'cleanliness': 4.0, 'value': ...",13313,1266119


In [4]:
# Keep only the (author, hotel, rating) triples used by the rating predictors.
rating_columns = ['author_id', 'hotel_id', 'rating']
df_input = df[rating_columns]

In [5]:
# Show one randomly drawn row of the reduced frame.
df_input.sample(n=1)

Unnamed: 0,author_id,hotel_id,rating
5157265,202,346217,5.0


In [6]:
# Report dataset size: distinct authors, distinct hotels, and total rating rows.
print(
    f"Number of unique authors: {df_input['author_id'].nunique()}",
    f"Number of unique hotels: {df_input['hotel_id'].nunique()}",
    f"Total number of ratings: {len(df_input)}",
    sep="\n",
)

Number of unique authors: 189992
Number of unique hotels: 329340
Total number of ratings: 7379698


# Split Data

In [7]:
# Hold out 5% of the ratings for testing; the fixed seed keeps the split reproducible.
split_frames = train_test_split(df_input, test_size=0.05, random_state=42)
train_data, test_data = split_frames

# Average User Rating Predictor

In [8]:
# prompt: Based on df_input, I want to predict ratings. The steps are:
# 1. Split df_input into training (0.8) and testing (0.2)
# 2. On the training set, compute the mean rating of every author_id and store
#    it in a dict; also compute the overall mean
# 3. Predict on the testing set: if the author_id is in the dict, use its mean;
#    otherwise use the overall mean
# 4. Compute the RMSE
#
# NOTE: step 1 is not performed in this cell. The code below uses the existing
# train_data / test_data variables instead of creating a new 80/20 split.

# 2. Mean rating per author on the training set, plus the global mean that
#    serves as the fallback for authors with no training ratings.
author_avg_ratings = train_data.groupby('author_id')['rating'].mean().to_dict()
global_avg_rating = train_data['rating'].mean()

# 3. Predict ratings for the testing set.
#    Vectorized lookup instead of a row-by-row iterrows() loop: author_ids that
#    are missing from the dict map to NaN, which is then replaced by the global
#    mean. The result is kept as a plain list, one prediction per test row.
predictions = (
    test_data['author_id']
    .map(author_avg_ratings)
    .fillna(global_avg_rating)
    .tolist()
)

# 4. Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_data['rating'], predictions))

print(f"RMSE: {rmse}")

RMSE: 0.9725495655073169


# Average Item Rating Predictor

In [9]:
# 1. Mean rating per hotel on the training set, plus the global mean that
#    serves as the fallback for hotels with no training ratings.
item_avg_ratings = train_data.groupby('hotel_id')['rating'].mean().to_dict()
global_avg_rating = train_data['rating'].mean()

# 2. Predict ratings for the testing set.
#    Vectorized lookup instead of a row-by-row iterrows() loop: hotel_ids that
#    are missing from the dict map to NaN, which is then replaced by the global
#    mean. The result is kept as a plain list, one prediction per test row.
predictions = (
    test_data['hotel_id']
    .map(item_avg_ratings)
    .fillna(global_avg_rating)
    .tolist()
)

# 3. Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_data['rating'], predictions))

print(f"RMSE: {rmse}")

RMSE: 0.9151625080218683
