In [None]:
# Mount Google Drive into the Colab filesystem; the CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# All input CSVs sit in the same Drive folder, so derive each path from that common prefix.
_csv_dir = '/content/drive/MyDrive/CMPE256/Data/CSVfiles'
ca_users_path = _csv_dir + '/CA_users.csv'
ca_review_path = _csv_dir + '/California_reviews.csv'
business_path = _csv_dir + '/business.csv'

In [None]:
!pip install scikit-surprise
from surprise import SVD



In [None]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise import SVD

# from surprise.prediction_algorithms.matrix_factorization import SVDpp

# Creating Dataframes

In [None]:
# Read only the columns the recommender needs from each CSV.
reviews = pd.read_csv(
    ca_review_path,
    usecols=['business_id', 'stars', 'review_id', 'user_id'],
)
business = pd.read_csv(
    business_path,
    usecols=['business_id', 'name'],
)
users = pd.read_csv(
    ca_users_path,
    usecols=['user_id', 'is_elite'],
)

In [None]:
## Give each table's 'name' column a table-specific label so the two stay distinguishable after joins.
## NOTE: users is loaded with usecols=['user_id', 'is_elite'], so it has no 'name' column and the
## first rename leaves it unchanged; only the business frame is actually renamed here.
users = users.rename(columns = { 'name': 'Username' })
business = business.rename(columns = { 'name': 'Restaurant name' })

In [None]:
# A cell renders only its last expression, so the columns line below produces no output;
# the table shown comes from head().
reviews.columns
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars
0,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0
1,eCiWBf1CJ0Zdv1uVarEhhw,OhECKhQEexFypOMY6kypRw,vC2qm1y3Au5czBtbhc-DNw,4.0
2,YbMyvlDA2W3Py5lTz8VK-A,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0
3,L0jv8c2FbpWSlfNC6bbUEA,bFPdtzu11Oi0f92EAcjqmg,IDtLPgUrqorrpqSLdfMhZQ,5.0
4,4zopEEPqfwm-c_FNpeHZYw,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0


In [None]:

# Check the business columns after the rename ('name' should now be 'Restaurant name').
business.columns


Index(['business_id', 'Restaurant name'], dtype='object')

In [None]:
# Check which columns the users frame contains.
users.columns


Index(['user_id', 'is_elite'], dtype='object')

In [None]:
# Preview the first user rows, including the is_elite flag used for filtering.
users.head()

Unnamed: 0,user_id,is_elite
0,qVc8ODYU5SZjKXVBgXdI7w,True
1,SZDeASXq7o05mMNLshsdIA,True
2,q_QQ5kBBwlCcbL1s4NVK3g,True
3,iYzhPPqnrjJkg1JHZyMhzA,False
4,QF1Kuhs8iwLWANNZxebTow,True


## Elite Reviews Only

In [None]:
# Left-join the user attributes onto every review. indicator=True adds a '_merge'
# column recording whether a matching user row was found for each review.
elite_reviews_df = pd.merge(
    reviews,
    users,
    how='left',
    on=['user_id'],
    indicator=True,
)
elite_reviews_df.shape

(239065, 6)

In [None]:
# Keep only the reviews whose author is flagged as elite. Rows where is_elite is
# anything other than True (including missing values) are dropped.
is_elite_mask = elite_reviews_df['is_elite'] == True
elite_reviews_df = elite_reviews_df.loc[is_elite_mask]
elite_reviews_df.shape

(48367, 6)

In [None]:
# Surprise reads exactly three columns, in the order: user id, item (business) id, rating (stars).
rating_columns = ['user_id', 'business_id', 'stars']

# Declare the rating scale as 1 to 5 and wrap the elite reviews in a Surprise dataset.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(elite_reviews_df[rating_columns], reader)

## User-based collaborative filtering

In [None]:
# Attach the restaurant name (looked up by business_id) and the elite flag
# (looked up by user_id) to every review row.
business_by_id = business.set_index('business_id')
users_by_id = users.set_index('user_id')
ratings_data = (
    reviews
    .join(business_by_id, on='business_id')
    .join(users_by_id, on='user_id')
)
ratings_data.head()

Unnamed: 0,review_id,user_id,business_id,stars,Restaurant name,is_elite
0,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0,Hibachi Steak House & Sushi Bar,False
1,eCiWBf1CJ0Zdv1uVarEhhw,OhECKhQEexFypOMY6kypRw,vC2qm1y3Au5czBtbhc-DNw,4.0,Sushi Teri,False
2,YbMyvlDA2W3Py5lTz8VK-A,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0,The Original Habit Burger Grill,True
3,L0jv8c2FbpWSlfNC6bbUEA,bFPdtzu11Oi0f92EAcjqmg,IDtLPgUrqorrpqSLdfMhZQ,5.0,Helena Avenue Bakery,False
4,4zopEEPqfwm-c_FNpeHZYw,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0,Santa Barbara Shellfish Company,True


In [None]:
# The anti-testset is the set of (user, item) pairs that have no rating in the full training set.
# NOTE(review): the stated intent is to use it for testing — confirm where anti_set is consumed;
# it can be very large because it covers every unrated user/business combination.
anti_set = data.build_full_trainset().build_anti_testset()

In [None]:
# Drop rows that repeat the same (business_id, restaurant name) pair.
business = business.drop_duplicates(subset=['business_id', 'Restaurant name'])

### Fit the model with 3 splits

In [None]:
## Fit the model with 3 splits
# Seed both the fold splitter and the SVD initialisation. Without a seed the folds
# and the initial factors differ on every run, so the reported RMSE values cannot
# be reproduced after a kernel restart.
SEED = 42
kf = KFold(n_splits=3, random_state=SEED, shuffle=True)
algo = SVD(random_state=SEED)

In [None]:
# 3-fold evaluation: refit the same SVD instance on each training fold and score
# it on the corresponding held-out fold.
for trainset, testset in kf.split(data):
    predictions = algo.fit(trainset).test(testset)
    # Prints the fold's RMSE; rmse ends up holding the value of the last fold.
    rmse = accuracy.rmse(predictions, verbose=True)

RMSE: 0.9403
RMSE: 0.9384
RMSE: 0.9538


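Reference — Surprise `SVD` documentation (full parameter list, including the per-factor learning rates and regularization terms tuned below): https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD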

In [None]:
def gridSearchCV():
  """Build (without fitting) a 3-fold RMSE grid search over SVD that tunes the user factors.

  On top of the shared grid (n_factors, n_epochs, lr_all, reg_all) this variant
  searches lr_pu and reg_pu, the learning rate and regularization of the user
  factors.

  Returns:
    An unfitted surprise GridSearchCV; call .fit(dataset) to run the search.
  """
  param_grid = {
    'n_factors': [1, 2],
    'n_epochs': [10, 20, 30, 40],
    'lr_all': [0.005, 0.007],
    'reg_all': [0.02, 0.04, 0.1],
    # lr_pu / reg_pu take precedence over lr_all / reg_all for the user factors.
    # Keep the candidates near the library defaults: SGD step sizes of 3, 5 or 10
    # overshoot on a 1-5 rating scale and make the factors blow up, which yields
    # a worse RMSE than an untuned SVD.
    'lr_pu': [0.002, 0.005, 0.01],
    'reg_pu': [0.005, 0.02, 0.1]}
  gs_model = GridSearchCV(algo_class=SVD, param_grid=param_grid, measures=['rmse'], cv=3)
  return gs_model

In [None]:
# Build the grid-search object. gridSearchCV is defined several times in this notebook,
# so this uses whichever definition was executed most recently.
gs_model = gridSearchCV()

In [None]:
# Run the search: every parameter combination is cross-validated on the elite-review dataset.
gs_model.fit(data)

  est = self.estimate(iuid, iiid)


In [None]:
# Parameter combination that achieved the best (lowest) average RMSE, keyed by measure name.
print(gs_model.best_params)

{'rmse': {'n_factors': 1, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.02, 'lr_pu': 3, 'reg_pu': 3}}


In [None]:
# Best mean cross-validated RMSE found by the grid search (lower is better).
print(gs_model.best_score['rmse'])

1.3936559692937995


# Item-based Collaborative Filtering

In [None]:
def gridSearchCV():
  """Build (without fitting) a 3-fold RMSE grid search over SVD that tunes the item factors.

  On top of the shared grid (n_factors, n_epochs, lr_all, reg_all) this variant
  searches lr_qi and reg_qi, the learning rate and regularization of the item
  factors.

  Returns:
    An unfitted surprise GridSearchCV; call .fit(dataset) to run the search.
  """
  param_grid = {
    'n_factors': [1, 2],
    'n_epochs': [10, 20, 30, 40],
    'lr_all': [0.005, 0.007],
    'reg_all': [0.02, 0.04, 0.1],
    # lr_qi / reg_qi take precedence over lr_all / reg_all for the item factors.
    # Keep the candidates near the library defaults: step sizes and penalties of
    # 2, 4 or 6 make each SGD update overshoot, so the item factors diverge
    # instead of converging.
    'lr_qi': [0.002, 0.005, 0.01],
    'reg_qi': [0.02, 0.05, 0.1]}
  gs_model_item = GridSearchCV(algo_class=SVD, param_grid=param_grid, measures=['rmse'], cv=3)
  return gs_model_item

In [None]:
# Build the item-factor grid search. This relies on the item-based gridSearchCV definition
# being the most recently executed one, since the function name is reused in this notebook.
gs_model_item = gridSearchCV()

In [None]:
# Run the search: every parameter combination is cross-validated on the elite-review dataset.
gs_model_item.fit(data)

In [None]:
# Parameter combination that achieved the best (lowest) average RMSE, keyed by measure name.
print(gs_model_item.best_params)

In [None]:
# Best mean cross-validated RMSE found by the item-factor grid search (lower is better).
print(gs_model_item.best_score['rmse'])

In [None]:
def get_train_data(train_file):
  """Load a comma-separated ratings file into a Surprise dataset.

  Args:
    train_file: path to a text file whose lines have the form `user,item,rating`.

  Returns:
    The dataset produced by Dataset.load_from_file for that file.
  """
  csv_reader = Reader(sep=',', line_format='user item rating')
  dataset = Dataset.load_from_file(train_file, reader=csv_reader)
  return dataset

In [None]:
def test_model(gs_model, testset):
  """Score a fitted algorithm on a test set, print the RMSE once and return it.

  Args:
    gs_model: fitted algorithm exposing .test(testset). Despite the name, main()
      passes the best estimator taken from the grid search, not the search itself.
    testset: test set in the form accepted by the algorithm's .test method.

  Returns:
    The RMSE of the predictions on testset.
  """
  predictions = gs_model.test(testset)
  # accuracy.rmse prints its own "RMSE: ..." line by default; silence it so the
  # score appears once instead of twice.
  rmse = accuracy.rmse(predictions, verbose=False)
  print("RMSE on test set", rmse)
  return rmse

In [None]:
def predict_ratings(model,test_df,format_df):
  """Write the model's estimated rating for every row of test_df into format_df.

  Args:
    model: object whose predict(uid, iid) returns a result with an `est` attribute.
    test_df: frame with "user_id" and "item_id" columns; ids are passed to the
      model as strings.
    format_df: container updated in place; the entry at each test_df row label
      receives that row's estimate.

  Returns:
    The same format_df object, after it has been filled in.
  """
  for row_label, record in test_df.iterrows():
      raw_user = str(record["user_id"])
      raw_item = str(record["item_id"])
      prediction = model.predict(raw_user, raw_item)
      format_df.at[row_label] = prediction.est
  return format_df

In [None]:
def gridSearchCV():
  """Build (without fitting) a 3-fold RMSE grid search over the shared SVD parameters.

  Returns:
    An unfitted surprise GridSearchCV; call .fit(dataset) to run the search.
  """
  search_space = dict(
    n_factors=[1, 2],
    n_epochs=[10, 20, 30, 40],
    lr_all=[0.005, 0.007],
    reg_all=[0.02, 0.04, 0.1],
  )
  return GridSearchCV(algo_class=SVD, param_grid=search_space, measures=['rmse'], cv=3)

In [None]:
def main():
  """Tune SVD on the training ratings, score it on a hold-out split, and write predictions.

  Steps:
    1. Load the frames via create_dataframes() (not defined in the visible part of
       this notebook; it must already exist in the kernel).
    2. Drop the timestamp column and write the ratings to 'train_processed.dat'.
    3. Grid-search the SVD hyper-parameters with cross-validation.
    4. Fit the best estimator on 80% of the ratings and report RMSE on the other 20%.
    5. Refit on all ratings, predict every row of test_df and write 'format7.dat'.
  """
  train_df, test_df, format_df = create_dataframes()
  print(train_df.dtypes)
  train_df = train_df.drop(['timestamp'], axis=1)
  train_df.to_csv('train_processed.dat',index=False, header=False)
  print(train_df.dtypes)
  train_data = get_train_data('train_processed.dat')
  holdout_trainset, holdout_testset = train_test_split(train_data, test_size=0.2)
  gs_model = gridSearchCV()
  gs_model.fit(train_data)
  print(gs_model.best_params)
  # best_score is the mean RMSE over the cross-validation folds, not a training-set error.
  print("Cross-validated RMSE", gs_model.best_score['rmse'])
  algo = gs_model.best_estimator['rmse']
  # Evaluate on ratings the fitted model has not seen. Fitting on the full trainset
  # first would score the model on its own training data and understate the error.
  # The hyper-parameters were still selected with cross-validation over all ratings,
  # so this estimate remains slightly optimistic.
  algo.fit(holdout_trainset)
  test_model(algo, holdout_testset)
  # Refit on every available rating before producing the final predictions.
  algo.fit(train_data.build_full_trainset())
  predict_ratings(algo,test_df,format_df)
  format_df.to_csv('format7.dat',index=False, header=False)

In [None]:
# Run the full tune / evaluate / predict pipeline; writes train_processed.dat and format7.dat.
main()

user_id      int64
item_id      int64
rating       int64
timestamp    int64
dtype: object
user_id    int64
item_id    int64
rating     int64
dtype: object
{'rmse': {'n_factors': 2, 'n_epochs': 30, 'lr_all': 0.007, 'reg_all': 0.04}}
RMSE on train set 0.9408690742918076
RMSE: 0.8732
RMSE on test set 0.8732354694465266
