In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

# Replace with the actual file path in your Google Drive
file_path = '/content/drive/MyDrive/CMPE-256-Shared/data/combined_filtered_reviews.csv' #  Replace 'your_folder_name' and 'your_file.csv'

try:
  # Read the CSV file into a pandas DataFrame
  # typically it will cost around3 mins.
  df = pd.read_csv(file_path)

  # Print or process the DataFrame
  print("Loading data successfully")

except FileNotFoundError:
  print(f"Error: File not found at {file_path}. Please check the file path.")
except pd.errors.ParserError:
  print(f"Error: Could not parse the file at {file_path}.  Is it a valid CSV file?")
except Exception as e:
  print(f"An unexpected error occurred: {e}")

Mounted at /content/drive
Loading data successfully


In [3]:
df.sample(5)

Unnamed: 0,date,rating,title,text,property_dict,hotel_id,author_id
5726368,2015-03-01T00:00:00,5.0,Good stay. Good food. Good pub. Good drinks.,"I stayed here twice, like a transit point betw...",{},353975,121820
2453265,2015-02-01T00:00:00,5.0,Fantastic hotel - even better staff!,This hotel is perfectly situated int he heart ...,"{'location': 5.0, 'sleep quality': 5.0, 'servi...",52728,2505956
4404679,2018-10-01T00:00:00,4.0,Convenient and good enough,Entering this hotel is almost like travelling ...,"{'sleep quality': 4.0, 'service': 4.0, 'locati...",246429,517092
2800133,2013-04-01T00:00:00,4.0,Choice of chandigarh hotels is wide.,my recent stay at chandigarh hotel regenta cen...,"{'rooms': 5.0, 'value': 5.0, 'cleanliness': 5....",66447,2165898
5978346,2017-06-01T00:00:00,5.0,Great place great location,The hotel is in a great spot walking distance ...,"{'service': 5.0, 'rooms': 5.0, 'location': 5.0}",357035,848527


In [4]:
print(f"author_id number: {df['author_id'].nunique()}")
print(f"hotel_id number: {df['hotel_id'].nunique()}")
print(f"interaction number: {len(df)}")

author_id number: 189992
hotel_id number: 329340
interaction number: 7379698


In [5]:
print(f"rating min: {df['rating'].min()}")
print(f"rating max: {df['rating'].max()}")

rating min: 1.0
rating max: 5.0


In [6]:
value_counts = df['author_id'].value_counts()

print(f'The minimum number of ratings given by a user: {value_counts.min()}')
print(f'The maximum number of ratings given by a user: {value_counts.max()}')
print(f'mean number: {value_counts.mean()}')

value_counts = df['hotel_id'].value_counts()

print(f'The minimum number of ratings for a hotel: {value_counts.min()}')
print(f'The maximum number of ratings for a hotel: {value_counts.max()}')
print(f'mean number: {value_counts.mean()}')

The minimum number of ratings given by a user: 20
The maximum number of ratings given by a user: 1193017
mean number: 38.84215124847362
The minimum number of ratings for a hotel: 1
The maximum number of ratings for a hotel: 2987
mean number: 22.407536284690593


In [7]:
has_duplicates = df.duplicated().any()
print(f"Does the DataFrame have duplicate rows? {has_duplicates}")

Does the DataFrame have duplicate rows? True


In [8]:
df = df.drop_duplicates()

print(f"author_id number: {df['author_id'].nunique()}")
print(f"hotel_id number: {df['hotel_id'].nunique()}")
print(f"interaction number: {len(df)}")

author_id number: 189992
hotel_id number: 329340
interaction number: 7379690


In [9]:
author_id_counts = df['author_id'].value_counts()
top_two_author_ids = author_id_counts.nlargest(2)
top_two_author_ids

Unnamed: 0_level_0,count
author_id,Unnamed: 1_level_1
202,1193014
1504,23889


In [10]:
df = df[~df['author_id'].isin(top_two_author_ids.index)]

In [11]:
df_rating = df[['author_id', 'hotel_id', 'rating']]
total_na = df_rating.isna().sum().sum()
print(f"Total number of NaN values in df_rating: {total_na}")

Total number of NaN values in df_rating: 0


# Save Model Function

In [12]:
import pickle

def save_model(model, model_name):
    model_filename = f'/content/drive/MyDrive/CMPE-256-Shared/model/{model_name}.pkl'
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)

    print(f"Saved to {model_filename}")

# svd

reference: https://surprise.readthedocs.io/en/stable/matrix_factorization.html

In [13]:
!pip install scikit-surprise

from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection.split import train_test_split
from surprise import accuracy

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357292 sha256=d6d7ed6e852fd596b36c93b0d1a438b3b6654b29cf3de5c3f058f33da338c0a5
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [14]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_rating,reader)


## Option 1: Simple Split

In [15]:
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [16]:
#SVD Matrix Factorization
algo = SVD(n_factors=6, reg_all=0.01, n_epochs=20, verbose=True)
algo.fit(trainingSet)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7dfd113b94e0>

In [17]:
predictions_svd = algo.test(testSet)
accuracy.rmse(predictions_svd, verbose=False)

0.8253429454186038

In [18]:
save_model(algo, 'svd_simple_split')

Saved to /content/drive/MyDrive/CMPE-256-Shared/model/svd_simple_split.pkl


## Option 2: GridSearchCV

In [None]:
from surprise.model_selection import  GridSearchCV

In [None]:
param_grid = {'n_factors': [6, 8],
              'reg_all': [0.01],
              'n_epochs': [10, 20]
              }

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

In [None]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8821888426292733
{'n_factors': 6, 'reg_all': 0.01, 'n_epochs': 20}


In [None]:
save_model(gs, 'svd_grid_search')

Saved to /content/drive/MyDrive/CMPE-256-Shared/model/svd_grid_search.pkl


In [None]:
save_model(gs.best_estimator['rmse'], 'svd_best_estimator')

Saved to /content/drive/MyDrive/CMPE-256-Shared/model/svd_best_estimator.pkl


## Option 3: Finer-grained Split

In [19]:
def split_data(total_df, train_ratio=0.8):
    """Highly optimized splitting of DataFrame by 'author_id' using groupby.

    Args:
      total_df: The input DataFrame.
      train_ratio: The ratio of data to be used for training.

    Returns:
      A tuple containing the training DataFrame and the testing DataFrame.
    """
    # Use lists to collect training and testing data
    training_list = []
    test_list = []

    # Group data by 'author_id'
    grouped = total_df.groupby('author_id')

    for _, author_data in grouped:
        n_samples = len(author_data)
        n_train = int(n_samples * train_ratio)

        # Shuffle indices and split
        shuffled_indices = np.random.permutation(n_samples)
        train_indices = shuffled_indices[:n_train]
        test_indices = shuffled_indices[n_train:]

        # Append split data to the lists
        training_list.append(author_data.iloc[train_indices])
        test_list.append(author_data.iloc[test_indices])

    # Concatenate all data at once
    training_df = pd.concat(training_list, ignore_index=True)
    test_df = pd.concat(test_list, ignore_index=True)

    return training_df, test_df


In [20]:
training_df, test_df = split_data(df_rating)
df_rating = pd.concat([training_df, test_df], ignore_index=True)
data = Dataset.load_from_df(df_rating,reader)
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, shuffle=False)


In [24]:
#SVD Matrix Factorization
algo = SVD(n_factors=6, reg_all=0.01, n_epochs=20, verbose=True)
algo.fit(trainingSet)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7dfc0d9dadd0>

In [25]:
predictions_svd = algo.test(testSet)
accuracy.rmse(predictions_svd, verbose=False)

0.8251044300571109

In [26]:
save_model(algo, 'svd_fine_grained_split')

Saved to /content/drive/MyDrive/CMPE-256-Shared/model/svd_fine_grained_split.pkl
