# Import libraries

In [1]:
import random
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer

from surprise import Dataset, Reader, SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse

# Load data

In [2]:
df_business_data = pd.read_json("yelp_academic_dataset_business.json", lines=True)

file_path = 'yelp_academic_dataset_review.json'

chunk_size = 100000
chunks = []
for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size):
    chunks.append(chunk)

df_review_data = pd.concat(chunks, ignore_index=True)

# Obtain negative samples

### Create a user id and business id pairs

In [3]:
unique_users = df_review_data['user_id'].unique()
unique_businesses = df_review_data['business_id'].unique()

business_user_reviewed_set = set(zip(df_review_data['user_id'], df_review_data['business_id'])) 
print((list(business_user_reviewed_set)[:10]))

[('fyIYESjyhbnfRwpcJcFd7Q', 'wYdlP6g2y6gK_RmLrPhsGA'), ('1xFpt3S7c-8wkE18oG6QqA', 'H1FXzbmnMPXRfOrTo1qxKw'), ('2gIOVJFi6qhKxm6wTmuB-g', 'xJIeT5--AiPdvPb5BRflyQ'), ('vUxnuFH1NYQskixF__ICEg', 'RSOAZcjiShJkeMjZ1C0eKA'), ('lD6aiEdvZmvosmbRRWtD7A', 'H26zRyQkXXYzUZUOtppFcA'), ('fgmZp-kpG_ZKIVhG6SbgEg', 'tV46IhCfHbsx_af-pMupiw'), ('WlGHvDqe-P5HrDm1IofOTw', 'Zp72O1eR_uaE2xZ-tqM63g'), ('UnH5LH_-I2uio4ObVzuQ9w', 'e-M9xlXXQWIa1n6Jy8gEtA'), ('mzHN0S3C39hsjJJ53EuGEQ', 'wBFmEtHy7M3IuyY7asQu4w'), ('VWJ8PSz6Sg5_AlBvQyGvpw', 'Bg7D8LrsW9XbYlEhT9yekw')]


### Get positive samples

In [4]:
train_data = pd.DataFrame(list(business_user_reviewed_set), columns=["user_id", "business_id"])
train_data["reviewed"] = 1
train_data.head()

Unnamed: 0,user_id,business_id,reviewed
0,fyIYESjyhbnfRwpcJcFd7Q,wYdlP6g2y6gK_RmLrPhsGA,1
1,1xFpt3S7c-8wkE18oG6QqA,H1FXzbmnMPXRfOrTo1qxKw,1
2,2gIOVJFi6qhKxm6wTmuB-g,xJIeT5--AiPdvPb5BRflyQ,1
3,vUxnuFH1NYQskixF__ICEg,RSOAZcjiShJkeMjZ1C0eKA,1
4,lD6aiEdvZmvosmbRRWtD7A,H26zRyQkXXYzUZUOtppFcA,1


### Obtain negative samples

In [5]:
negative_samples = []
for user in unique_users[:len(train_data)]:  
    random_business = random.choice(unique_businesses)
    while (user, random_business) in business_user_reviewed_set:
        random_business = random.choice(unique_businesses)
    negative_samples.append((user, random_business, 0))

### Combine samples and save to CSV

In [6]:
review_prediction_data = pd.concat([
    train_data,
    pd.DataFrame(negative_samples, columns=["user_id", "business_id", "reviewed"])
])

review_prediction_data.to_csv("truedataset.csv", index=False)

### Sample data to predict

In [7]:
data_review = review_prediction_data.sample(frac=0.2, random_state=42)[["user_id", "business_id"]]
data_review.to_csv("review.csv", index=False)

# Models

## SVD - Binary

### Load and split data

In [8]:
reader = Reader(rating_scale=(0, 1)) # 0 not reviewed , 1  reviewed
surprise_data = Dataset.load_from_df(review_prediction_data[["user_id", "business_id", "reviewed"]], reader)

trainset, testset = train_test_split(surprise_data, test_size=0.2)

### Fit SVD

model = SVD()
model.fit(trainset)
predictions = model.test(testset)

rmse(predictions)

### Tune params

In [9]:
param_grid = {
    "n_factors": [25],
    "n_epochs": [20],
    "lr_all": [0.005],
    "reg_all": [0.02]
}

grid_search = GridSearchCV(SVD, param_grid, measures = ["rmse"], cv = 3)
grid_search.fit(surprise_data)

best_param = grid_search.best_params["rmse"]

print("Best Params:", best_param)

best_model = SVD(**best_param)
best_model.fit(trainset)

predictions = best_model.test(testset)

rmse(predictions)

Best Params: {'n_factors': 25, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
RMSE: 0.3634


0.3633573154447676

### Save prediction

In [10]:
prediction_svd_df = pd.DataFrame([(p.uid, p.iid, p.est) for p in predictions], columns=["user_id", "business_id", "predicted"])
prediction_svd_df.to_csv("predictions_review_svd.csv", index=False)

## KNN - Binary

### Load and split data

In [None]:
review_prediction_data = pd.read_csv('truedataset.csv')
filtered_df = review_prediction_data.groupby("user_id").filter(lambda x: len(x) > 1)

small_df = filtered_df.sample(n=30000, random_state=42) 

reader = Reader(rating_scale=(0, 1))
surprise_data = Dataset.load_from_df(small_df[["user_id", "business_id", "reviewed"]], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

### Fit KNN

sim_options = {"name": "cosine", "user_based": True, "k": 5,"min_support": 2}  
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
predictions = model.test(testset)

rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)

### Tune params

In [None]:
param_grid = {
    "name": ["cosine"],
    "user_based": [True],
    "k": [5],
    "min_support": [1]
}

grid_search = GridSearchCV(KNNBasic, param_grid, measures = ["rmse"], cv = 3)
grid_search.fit(surprise_data)

best_param = grid_search.best_params["rmse"]

print("Best Params:", best_param)

best_model = KNNBasic(**best_param)
best_model.fit(trainset)

predictions = best_model.test(testset)

rmse = accuracy.rmse(predictions)

### Save predictions

In [None]:
predictions_df = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions], 
                              columns=["user_id", "business_id", "predicted_reviewed"])
predictions_df.to_csv("knn_basic_review_predictions_small.csv", index=False)
print("Predictions saved to knn_basic_review_predictions_small.csv")

## SVD - Rating

### Load and split data

In [14]:
train_ratings = df_review_data[["user_id", "business_id", "stars"]]
test_ratings = train_ratings.sample(frac=0.2, random_state=42)[["user_id", "business_id"]]
test_ratings.to_csv("rating.csv", index=False)

review_data = train_ratings

reader = Reader(rating_scale=(1, 5)) 
data = Dataset.load_from_df(review_data[["user_id", "business_id", "stars"]], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Fit SVD

model = SVD()
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)

### Tune params

In [15]:
param_grid = {
    "n_factors": [25],
    "n_epochs": [20],
    "lr_all": [0.005],
    "reg_all": [0.02]
}

grid_search = GridSearchCV(SVD, param_grid, measures = ["rmse"], cv = 3)
grid_search.fit(data)

best_param = grid_search.best_params["rmse"]

print("Best Params:", best_param)

best_model = SVD(**best_param)
best_model.fit(trainset)

predictions = best_model.test(testset)

rmse = accuracy.rmse(predictions)

Best Params: {'n_factors': 25, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
RMSE: 1.2790


### Save predictions

In [16]:
predictions_df = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions], 
                              columns=["user_id", "business_id", "predicted_stars"])
predictions_df.to_csv("svd_rating_predictions.csv", index=False)

## KNN - Rating

### Load and split data

In [17]:
filtered_df = review_data.groupby("user_id").filter(lambda x: len(x) > 1)
small_df = filtered_df.sample(n=5000, random_state=42)
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(small_df[["user_id", "business_id", "stars"]], reader)

trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

### Fit KNN

sim_options = {"name": "cosine", "user_based": True, "k": 5, "min_support": 2}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)

### Tune params

In [18]:
param_grid = {
    "name": ["cosine"],
    "user_based": [True],
    "k": [5],
    "min_support": [1]
}

grid_search = GridSearchCV(KNNBasic, param_grid, measures = ["rmse"], cv = 3)
grid_search.fit(surprise_data)

best_param_ = grid_search.best_params["rmse"]

print("Best Params:", best_param_)

best_model = KNNBasic(**best_param_)
best_model.fit(trainset)

predictions = best_model.test(testset)

rmse = accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Best Params: {'name': 'cosine', 'user_based': True, 'k': 5, 'min_support': 1}
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.4651


### Save predictions

In [19]:
predictions_df = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions], 
                              columns=["user_id", "business_id", "predicted_rating"])
predictions_df.to_csv("knn_rating_predictions.csv", index=False)

## Category Analysis

In [20]:
df_business_data["categories"] = df_business_data["categories"].fillna("")
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(", "), lowercase=False)
tfidf_matrix = vectorizer.fit_transform(df_business_data["categories"])
category_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=category_names, index=df_business_data.index)
df_business_data["minimized_category"] = tfidf_df.idxmax(axis=1)

#print(df_business_data[["business_id", "categories", "minimized_category"]])
category_counts = df_business_data["minimized_category"].value_counts()
print(category_counts)



minimized_category
Burgers           2375
Nail Salons       2083
Pizza             2009
Italian           1918
Mexican           1872
                  ... 
Cucina campana       1
Dart Arenas          1
Circus Schools       1
Bubble Soccer        1
Trade Fairs          1
Name: count, Length: 1305, dtype: int64


## Final SVD Model

### Load and split data

In [21]:
df_review_data = df_review_data[["user_id", "business_id", "stars"]]
df_business_data = df_business_data[["business_id", "minimized_category", "name"]]
df_review_data = df_review_data.merge(df_business_data, on="business_id")

reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(df_review_data[["user_id", "business_id", "stars"]], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2)

### Fit final SVD with best params

In [22]:
model = SVD(**best_param)
model.fit(trainset)

predictions = model.test(testset)
predictions_df = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions], 
                              columns=["user_id", "business_id", "predicted_rating"])

predictions_df = predictions_df.merge(df_business_data, on="business_id")

### Generate recommendation

In [23]:
N = 5
predictions_df = predictions_df.sort_values(["user_id", "predicted_rating"], ascending=[True, False])
top_n_recommendations = predictions_df.groupby("user_id").head(N)

italian_reviews = df_review_data[df_review_data["minimized_category"] == "Italian"]
user_italian_counts = italian_reviews.groupby("user_id").size().reset_index(name="count")
low_review_italian_users = user_italian_counts[user_italian_counts["count"] <= 3]["user_id"]

filtered_recommendations = top_n_recommendations[
    (top_n_recommendations["user_id"].isin(low_review_italian_users)) &
    (top_n_recommendations["minimized_category"] == "Italian")
]

### Evaluate

In [24]:
actual_italian = df_review_data[df_review_data["minimized_category"] == "Italian"]
actual_italian_users = actual_italian.groupby("user_id")["name"].apply(set).reset_index()
recommended_italian_users = filtered_recommendations.groupby("user_id")["name"].apply(set).reset_index()

merged = actual_italian_users.merge(recommended_italian_users, on="user_id", how="outer", suffixes=("_actual", "_predicted"))
merged["name_actual"] = merged["name_actual"].apply(lambda x: x if isinstance(x, set) else set())
merged["name_predicted"] = merged["name_predicted"].apply(lambda x: x if isinstance(x, set) else set())

merged["true_positives"] = merged.apply(lambda x: len(x["name_actual"] & x["name_predicted"]), axis=1)
merged["precision"] = merged["true_positives"] / merged["name_predicted"].apply(lambda x: max(1, len(x)))
merged["recall"] = merged["true_positives"] / merged["name_actual"].apply(lambda x: max(1, len(x)))

avg_precision = merged["precision"].mean()
avg_recall = merged["recall"].mean()
hit_rate = (merged["true_positives"] > 0).mean()

print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"Hit Rate: {hit_rate:.4f}")

global_top_italian = df_review_data[df_review_data["minimized_category"] == "Italian"].groupby("name")["stars"].mean().reset_index()
global_top_italian = global_top_italian.sort_values("stars", ascending=False).head(5)  # Take top 5 highest-rated Italian places

def get_fallback_recommendation():
    return np.random.choice(global_top_italian["name"].values)

for idx, row in merged.head(10).iterrows():
    recommended_names = row["name_predicted"] if row["name_predicted"] else {get_fallback_recommendation()}
    print(f"User ID: {row['user_id']}")
    print(f"Reviewed Italian Restaurants: {row['name_actual']}")
    print(f"Recommended Italian Restaurants: {recommended_names}")
    print("-" * 50)

Precision: 0.1973
Recall: 0.1799
Hit Rate: 0.1973
User ID: ---r61b7EpVPkb4UVme5tA
Reviewed Italian Restaurants: {"Angelo's Pizza"}
Recommended Italian Restaurants: {'Cafe Giardino'}
--------------------------------------------------
User ID: --1zDxbpfa486HyfyV8a-g
Reviewed Italian Restaurants: {"Cacicia's Old World Sicilian Foods"}
Recommended Italian Restaurants: {'Tana at Treo'}
--------------------------------------------------
User ID: --3WaS23LcIXtxyFULJHTA
Reviewed Italian Restaurants: {'Pamfilios', 'Sangiovese Ristorante'}
Recommended Italian Restaurants: {'Tana at Treo'}
--------------------------------------------------
User ID: --4AjktZiHowEIBCMd4CZA
Reviewed Italian Restaurants: {"Matt & Marie's"}
Recommended Italian Restaurants: {'Tana at Treo'}
--------------------------------------------------
User ID: --7SMrwYYu5xniXjBGdWhw
Reviewed Italian Restaurants: {"Alfredo's Italian Cuisine"}
Recommended Italian Restaurants: {"Alfredo's Italian Cuisine"}
--------------------------