# Rating Prediction of Cafe on Google Maps

## Datasets

In [1]:
import os
import json
import gzip
from functools import partial
from datetime import datetime, timezone
import re

import requests
import numpy as np
import pandas as pd
import tqdm

import ast
import geopandas as gpd
from shapely.geometry import Point

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(0)

<torch._C.Generator at 0x17eacd590>

### Downloading Dataset

In [2]:
meta_path = "./datasets/raw/meta-California.json.gz"
meta_keys = ["gmap_id", "name", "latitude", "longitude", "category", "avg_rating", "num_of_reviews", "price", "hours"]

review_path = "./datasets/raw/review-California.json.gz"
review_keys = ["gmap_id", "user_id", "name", "time", "rating"]

total_reviews = 70529977

In [3]:
def download_meta_data():
    url = "https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-California.json.gz"
    res = requests.get(url, stream=True)

    with open(meta_path, "wb") as f:
        f.write(res.content)

In [4]:

def download_review_data():
    url = "https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-California.json.gz"
    res = requests.get(url, stream=True)

    with open(review_path, "wb") as f:
        f.write(res.content)

In [5]:
os.makedirs("./datasets/raw", exist_ok=True)
os.makedirs("./datasets/processed", exist_ok=True)

if not os.path.exists(meta_path):
    download_meta_data()

if not os.path.exists("./datasets/raw/review-California.json.gz"):
    download_review_data()

### Processing Dataset

In [6]:
def parse(path):
    g = gzip.open(path, "r")
    for l in g:
        yield json.loads(l)

Processing business data to extract Cafes we want to focus on.

In [7]:
# I reused my code from COGS108 project to process dataset.

def get_cafe_categories():
    categories = []
    for business in parse(meta_path):
        if business["category"] is not None:
            categories += business["category"]

    categories = np.array(categories)
    unique = np.unique(categories)

    cafe_categories = [str(category) for category in unique if "cafe" in category.lower() or "coffee" in category.lower()]
    print(f"The number of categories containting 'cafe' substring is {len(cafe_categories)}")
    print(cafe_categories)

    with open(f"./datasets/processed/cafe_categories.txt", "w") as f:
        f.write("\n".join(cafe_categories))

def filter_by_category(data, categories):
    category = data.get("category", None)
    if category is None:
        return False

    return len(set(category) & categories) != 0

def filter_by_num_reviews(data, min_num_reviews):
    return data["num_of_reviews"] >= min_num_reviews

def filter_raw_business_data(filters):
    businesses = []
    for business in parse(meta_path):
        if all([f(data=business) for f in filters]):
            business = {key: business.get(key, None) for key in meta_keys}
            businesses.append(business)

    print(f"We obtained total of {len(businesses)} after filtering")

    df = pd.DataFrame(businesses)
    df.to_csv(f"./datasets/processed/cafes.csv", index=False)

In [8]:
if not os.path.exists("./datasets/processed/cafe_categories.txt"):
        get_cafe_categories()

if not os.path.exists("./datasets/processed/cafes.csv"):
    min_num_reviews = 100

    with open("./datasets/processed/cafe_categories.txt", "r") as f:
        cafe_categories = set(f.read().split("\n"))

    cafe_filter = partial(filter_by_category, categories=cafe_categories)
    num_reviews_filter = partial(filter_by_num_reviews, min_num_reviews=min_num_reviews)

    filter_raw_business_data([cafe_filter, num_reviews_filter])

Processing review data to extract reviews we want to focus on.

In [9]:
# I reused my code from COGS108 project to process dataset.

def filter_by_gmap_id(data, gmap_ids):
    gmap_id = data.get("gmap_id", None)
    if gmap_id is None:
        return False

    return gmap_id in gmap_ids

def filter_raw_review_data(filters):
    reviews = []

    for review in tqdm.tqdm(parse(review_path), total=total_reviews):
        if all([f(data=review) for f in filters]):
            review = {key: review.get(key, None) for key in review_keys}
            review["review_id"] = f"{review['user_id']}_{review['gmap_id']}"
            reviews.append(review)

    print(f"We obtained total of {len(reviews)} after filtering")
    df = pd.DataFrame(reviews)
    df.to_csv("./datasets/raw/cafe_reviews.csv", index=False)

def extract_user_ids(reviews, min_num_reviews):
    user_ids = reviews["user_id"].dropna().values
    unique, counts = np.unique(np.array(user_ids), return_counts=True)
    users = pd.DataFrame({"user_id": unique, "num_reviews": counts})

    users = users[users["num_reviews"] >= min_num_reviews].reset_index(drop=True)
    print(f"We extracted {users.shape[0]} users after filtering.")

    users.to_csv("./datasets/processed/users.csv", index=False)

def filter_by_user_ids(reviews, user_ids):
    reviews = reviews[reviews["user_id"].isin(user_ids)]

    print(f"We extracted {reviews.shape[0]} reviews after filtering.")
    reviews.to_csv("./datasets/processed/reviews.csv", index=False)

In [10]:
if not os.path.exists("./datasets/raw/cafe_reviews.csv"):
    gmap_ids = set(pd.read_csv("./datasets/processed/cafes.csv")["gmap_id"].values)
    gmap_id_filter = partial(filter_by_gmap_id, gmap_ids=gmap_ids)
    filter_raw_review_data([gmap_id_filter])

if not os.path.exists("./datasets/processed/users.csv"):
    print("Start processing user data")
    reviews = pd.read_csv("./datasets/raw/cafe_reviews.csv")
    min_num_reviews = 20
    extract_user_ids(reviews, min_num_reviews)

if not os.path.exists("./datasets/processed/reviews.csv"):
    print("Start filtering review data")
    reviews = pd.read_csv("./datasets/raw/cafe_reviews.csv")
    user_ids = pd.read_csv("./datasets/processed/users.csv")["user_id"].values
    filter_by_user_ids(reviews, user_ids)

Split dataset into train, validation, and test so that we can evaluate models with unseen data. However, due to the design of the model which relies on pre-defined list of user and cafes, we need to split randomly without stratifying based on users or cafes.

In [11]:
def split_reviews():
    file_name = "./datasets/processed/reviews.csv"
    reviews = pd.read_csv(file_name).sample(frac=1, random_state=42)

    valid_size = int(reviews.shape[0] * 0.1)
    test_size = int(reviews.shape[0] * 0.1)

    valid_reviews = reviews.iloc[:valid_size].reset_index(drop=True)
    test_reviews = reviews.iloc[valid_size: valid_size + test_size].reset_index(drop=True)
    train_reviews = reviews.iloc[valid_size + test_size:].reset_index(drop=True)

    print(f"train: {train_reviews.shape[0]} / valid: {valid_reviews.shape[0]} / test: {test_reviews.shape[0]}")

    os.makedirs("./datasets/splits", exist_ok=True)

    train_reviews.to_csv("./datasets/splits/train.csv", index=False)
    valid_reviews.to_csv("./datasets/splits/valid.csv", index=False)
    test_reviews.to_csv("./datasets/splits/test.csv", index=False)

In [12]:
if not os.path.exists("./datasets/splits/train.csv"):
    split_reviews()

### EDA

# Todo

## Rating Predictions

### Modeling Context

### Feature Matrix Consturction

We construct dictionaries converting followings:
- `user_id` to index of onehot vector for user.
- `gmap_id` to index of onehot vector for cafe.

In [13]:
def preprocess_data_latent(feat_names):
    reviews = pd.read_csv("./datasets/processed/reviews.csv")
    cafes = pd.read_csv("./datasets/processed/cafes.csv")

    feat_dicts = {}
    for name in feat_names:
        if name == "user":
            unique_user_ids = np.sort(np.unique(reviews["user_id"].values))
            user2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
            feat_dicts[name] = user2index

        elif name == "cafe":
            unique_gmap_ids = np.sort(np.unique(cafes["gmap_id"]))
            cafe2index = {gmap_id: index for index, gmap_id in enumerate(unique_gmap_ids)}
            feat_dicts[name] = cafe2index

    avg_rating = reviews["rating"].mean()

    return feat_dicts, avg_rating

Then, we add preprocessing for unix time of a review.

In [14]:
# One Hot Encoding for Unix Time Weekday
def unix_weekday_to_onehot(time):
    feature_weekday = [0]*7

    day = datetime.fromtimestamp(time / 1000, tz=timezone.utc).weekday()
    feature_weekday[day] = 1.

    return feature_weekday

# One Hot Encoding for Unix Time Hour
def unix_hour_to_onehot(time):
    feature_dayhour = [0]*24

    hr = datetime.fromtimestamp(time / 1000, tz=timezone.utc).hour
    feature_dayhour[hr] = 1.

    return feature_dayhour

Then, we construct PyTorch Dataset. This class is desgined to be flexible about which features to use and it receives a list of feature names in `feat_names` we would use in a model and preprocessed mappings `feat_dicts` we need to map data to onehot vector. We can possibly use following features:
- `alpha` is a bias term and we initialize it with global average of rating.
- `user` is a user of a review.
- `cafe` is a cafe of a review.
- `weekday` is a weekday of a week when a review was posted.
- `hour` is an hour of a day when a review was posted.

In [15]:
class CafeDatasetLatent(Dataset):
    def __init__(self, mode, feat_names, feat_dicts):
        self.reviews = pd.read_csv(f"./datasets/splits/{mode}.csv").values

        self.feat_names = feat_names
        self.feat_dicts = feat_dicts

    def get_feat_sizes(self):
        feat_sizes = {}

        for name in self.feat_names:
            if name == "alpha":
                feat_sizes[name] = 1

            elif name == "user":
                feat_sizes[name] = len(self.feat_dicts[name].keys())

            elif name == "cafe":
                feat_sizes[name] = len(self.feat_dicts[name].keys())

            elif name == "weekday":
                feat_sizes[name] = 7

            elif name == "hour":
                feat_sizes[name] = 24

            else:
                raise NotImplementedError

        return feat_sizes

    def __len__(self):
        return self.reviews.shape[0]

    def __getitem__(self, index):
        review = self.reviews[index]
        feats = []
        for name in self.feat_names:
            if name == "alpha":
                feat = torch.ones(1)
                feats.append(feat)

            elif name == "user":
                feat_dict = self.feat_dicts[name]
                feat = torch.zeros(len(feat_dict.keys()))
                feat[feat_dict[review[1]]] = 1.
                feats.append(feat)

            elif name == "cafe":
                feat_dict = self.feat_dicts[name]
                feat = torch.zeros(len(feat_dict.keys()))
                feat[feat_dict[review[0]]] = 1.
                feats.append(feat)

            elif name == "weekday":
                feat = torch.tensor(unix_weekday_to_onehot(int(review[3])))
                feats.append(feat)

            elif name == "hour":
                feat = torch.tensor(unix_hour_to_onehot(int(review[3])))
                feats.append(feat)

            else:
                raise NotImplementedError

        rating = torch.tensor(review[4])

        return *feats, rating

Next, we construct a model for rating prediction. This class is also designed to be flexible and it receives following arguments:
- `name` is a unique identifier of a model.
- `dim` is a dimension of latent.
- `feat_sizes` is a list of feature sizes which would be used to initialize weights.
- `latent_names` is a list of latent feature names which would be used to initialize latents.
- `latent_pairs` is a list of tuples which indicate pairs we would calculate dot product in between.
- `avg_rating` is a global average rating which is used to initialize alpha.
  
This model has two sets of parameters:
- `weights` is a list of weights for each feature and corresponds to betas in model equation.
- `latents` is a list of latents for feature we specified and corresponds to gammas in model equation.

In [16]:
class RatePredictorLatent(nn.Module):
    def __init__(self, name, dim, feat_sizes, latent_names, latent_pairs, avg_rating):
        super().__init__()

        self.name = name
        self.num_feats = len(feat_sizes)
        self.num_latents = len(latent_names)

        self.latent_names = latent_names
        self.latent_pairs = latent_pairs

        self.latent_indices = []
        weights = []
        for i, (name, feat_size) in enumerate(feat_sizes.items()):
            if name == "alpha":
                weight = torch.tensor(avg_rating).unsqueeze(0)
            else:
                weight = torch.zeros(feat_size)
            weights.append(nn.Parameter(weight, requires_grad=True))

            if name in latent_names:
                self.latent_indices.append(i)

        self.weights =  nn.ParameterList(weights)

        latents = []
        for name in latent_names:
            feat_size = feat_sizes[name]
            latent = torch.randn(feat_size, dim) / dim
            latents.append(nn.Parameter(latent, requires_grad=True))

        self.latents = nn.ParameterList(latents)

    def forward(self, feats):
        assert len(feats) == self.num_feats

        out = torch.zeros(feats[0].size(0))
        for i in range(self.num_feats):
            out += torch.einsum("bd,d->b", feats[i], self.weights[i])

        gammas = {}
        for i in range(self.num_latents):
            index = self.latent_indices[i]
            gammas[self.latent_names[i]] = torch.einsum("bd,di->bi", feats[index], self.latents[i])

        for (latent_i, latent_j) in self.latent_pairs:
            out += torch.einsum("bi,bi->b", gammas[latent_i], gammas[latent_j])

        return out


Next, we construct a trainer to train a model by using autograd of PyTorch. 

In [17]:
class RateTrainerLatent():
    def __init__(self, model, lambs, lr, train_dataloader, valid_dataloader, device):
        self.model = model
        self.lambs = lambs
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.device = device

        self.optim =  torch.optim.Adam(model.parameters(), lr=lr)

    def train(self, n_epochs):
        train_mses, valid_mses = [], []
        for i in range(n_epochs):
            train_mse = 0
            total = 0

            bar = tqdm.tqdm(self.train_dataloader, desc="Training Model")
            for feats in bar:
                ratings = feats[-1].to(self.device)
                feats = [f.to(self.device) for f in feats[:-1]]

                self.optim.zero_grad()

                pred_ratings = self.model(feats)
                mse = self.mse(ratings, pred_ratings)
                mse_reg = mse + self.regularizer()

                mse_reg.backward()
                self.optim.step()

                local_batch_size = feats[0].size(0)
                train_mse += mse.item() * local_batch_size
                total += local_batch_size

                bar.set_description(f"Training Model ({mse.item():.6f})")

            train_mse /= total
            valid_mse = self.validate()
            print(f"Step[{i + 1:2d}]: train {train_mse:2.6f} / valid {valid_mse:2.6f}")

            train_mses.append(train_mse)
            valid_mses.append(valid_mse)

        return train_mses, valid_mses

    def validate(self):
        with torch.no_grad():
            total = 0
            mse = 0

            for feats in self.valid_dataloader:
                ratings = feats[-1].to(self.device)
                feats = [f.to(self.device) for f in feats[:-1]]

                pred_ratings = self.model(feats)

                local_batch_size = feats[0].size(0)
                mse += self.mse(ratings, pred_ratings).item() * local_batch_size
                total += local_batch_size

            return mse / total

    def mse(self, y_true, y_pred):
        return torch.mean((y_true - y_pred) ** 2)

    def regularizer(self):
        weight_size = len(self.model.weights)
        latent_size = len(self.model.latents)
        assert len(self.lambs) == weight_size + latent_size

        reg = 0
        for i in range(len(self.lambs)):
            if i < len(self.model.weights):
                reg += self.lambs[i] * torch.mean(self.model.weights[i] ** 2)
            else:
                dim = self.model.latents[0].size(1)
                reg += self.lambs[i] * dim * torch.mean(self.model.latents[i - weight_size] ** 2)

        return reg

We record metrics in a file `metrics.json`

In [18]:
def update_metrics(name, train, valid):
    if os.path.exists("./metrics.json"):
        with open("./metrics.json", "r") as f:
           metrics = json.load(f)
    else:
        metrics = {}

    metrics[name] = {"metrics": {"train": train, "valid": valid}}

    with open("./metrics.json", "w") as f:
        json.dump(metrics, f)

After a series of experiments, we ended up using the following hyperparamters for all the experiments below:
- `n_epoch` is the numer of iterations for training.
- `lr` is a learning rate of gradient descent.
- `dim` is a dimension of latents if used.
- `batch_size` is a batch size of training.
- `lamb_dict` is a regularization coefficient for each feature.
- `device` is a device we run models on. You can change "cpu" to "cuda" if you have GPU environment.

In [19]:
n_epoch = 10
lr = 0.01
dim = 32
batch_size = 2048

lamb_dict = {"alpha": 0, "user": 0.1, "cafe": 1}

device = torch.device("cpu")

In [20]:
def train(name, feat_names, latent_names, latent_pairs):
    lambs = [lamb_dict[feat] for feat in feat_names + latent_names]

    assert all([((latent_i in latent_names) and (latent_j in latent_names)) for (latent_i, latent_j) in latent_pairs])
    assert len(lambs) == len(feat_names) + len(latent_names)

    if not os.path.exists(f"./models/{name}.pt"):
        print(f"Start training {name}")

        feat_dicts, avg_rating = preprocess_data_latent(feat_names)
        train_dataset = CafeDatasetLatent("train", feat_names, feat_dicts)
        valid_dataset = CafeDatasetLatent("valid", feat_names, feat_dicts)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

        feat_sizes = train_dataset.get_feat_sizes()
        model = RatePredictorLatent(name, dim, feat_sizes, latent_names, latent_pairs, avg_rating)

        trainer = RateTrainerLatent(model, lambs, lr, train_dataloader, valid_dataloader, device)

        train_mses, valid_mses = trainer.train(n_epoch)

        os.makedirs("./models", exist_ok=True)
        torch.save(model, f"./models/{name}.pt")

        update_metrics(name, train_mses, valid_mses)

We train a naive model where we only use features of `user_id` and `cafe` without latents.

In [21]:
name = "base"

feat_names = ["alpha", "user", "cafe"]
latent_names = []
latent_pairs = []

train(name, feat_names, latent_names, latent_pairs)

Next, we train a naive model with latents for `user` and `cafe`.

In [22]:
name = "latent"

feat_names = ["alpha", "user", "cafe"]
latent_names = ["user", "cafe"]
latent_pairs = [("user", "cafe")]

train(name, feat_names, latent_names, latent_pairs)

### Testing Models

We used three ways to evaluate models as follows:
- `mse` is a metric used as an objective for training.
- `rmse` is a root of mse which has same scale as predictive variable (rating).
- `accuracy` is accuracy of correct discrete rating prediction. Since all reviews have discrete ratings of 1.0, 2.0, 3.0, 4.0 and 5.0, given predictions, we asigned discrete prediction by rounding to nearest integer and calculated accuracy comparing true ratings.

In [23]:
def calculate_mse(y_true, y_pred):
    return torch.mean((y_true - y_pred) ** 2)

def calculate_rmse(y_true, y_pred):
    return torch.sqrt(torch.mean((y_true - y_pred) ** 2))

def discrete_rating(y_pred):
    y_pred = torch.clamp(y_pred, min=0, max=5)
    y_pred = torch.round(y_pred)

    return y_pred

For testing, we saved results in csv table so that we can compare models easily, which is saved to `./test_results.csv`.

In [24]:
def update_test_results(result):
    new_result = pd.Series(result).to_frame().T

    if os.path.exists("./test_results.csv"):
        results = pd.read_csv("./test_results.csv")

        duplicate_index = results["name"] == result["name"]
        if sum(duplicate_index) == 0:
            results = pd.concat([results, new_result]).reset_index(drop=True)
        else:
            results = results.values
            results[duplicate_index] = new_result.values
            results = pd.DataFrame(results, columns=new_result.columns)
    else:
        results = new_result

    print(results)
    results.to_csv("./test_results.csv", index=False)

First we evaluate two baseline models:
- `MostCommon` just returns most common discrete rating `5`.
- `Naive` just returns a global average.

In [25]:
class MostCommon(nn.Module):
    def __init__(self):
        super().__init__()

        reviews = pd.read_csv("./datasets/splits/test.csv")

        ratings, counts = np.unique(reviews["rating"], return_counts=True)
        self.most_common = torch.tensor(ratings[np.argmax(counts)])

    def forward(self, x):
        return self.most_common.repeat(x[0].size(0))

class Naive(nn.Module):
    def __init__(self):
        super().__init__()

        reviews = pd.read_csv("./datasets/splits/test.csv")
        self.average = torch.tensor(np.mean(reviews["rating"]))

    def forward(self, x):
        return self.average.repeat(x[0].size(0))

The following a function to run models on test dataset and save results into `test_results.csv`.

In [26]:
def test_model(name, feat_names):
    if name == "most_common":
        model = MostCommon()
    elif name == "naive":
        model = Naive()
    else:
        model = torch.load(f"./models/{name}.pt", weights_only=False)

    feat_dicts, _ = preprocess_data_latent(feat_names)

    test_dataset = CafeDatasetLatent("test", feat_names, feat_dicts)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    with torch.no_grad():
        total = 0
        mse, rmse = 0, 0
        n_corrects = 0

        model.to(device)
        model.eval()

        for feats in tqdm.tqdm(test_dataloader):
            ratings = feats[-1].to(device)
            feats = [f.to(device) for f in feats[:-1]]

            pred_ratings = model(feats)

            local_batch_size = feats[0].size(0)
            mse += calculate_mse(ratings, pred_ratings).item() * local_batch_size
            rmse += calculate_rmse(ratings, pred_ratings).item() * local_batch_size

            pred_discrete = discrete_rating(pred_ratings)

            n_corrects += torch.sum(pred_discrete == ratings).item()
            total += local_batch_size

        test_mse = mse / total
        test_rmse = rmse / total
        test_accuracy = n_corrects / total

        result = {"name": name, "mse": test_mse, "rmse": test_rmse, "accuracy": test_accuracy}
        update_test_results(result)


Run MostCommon and Naive first.

In [27]:
name = "most_common"
feat_names = ["alpha"]
test_model(name, feat_names)

100%|██████████| 14/14 [00:00<00:00, 112.30it/s]

          name       mse      rmse  accuracy
0  most_common  1.594156  1.262308  0.523571
1        naive  1.006339  1.002901   0.28008
2         base  0.687413  0.828871  0.555235
3       latent  0.680835  0.824916  0.563493





In [28]:
name = "naive"
feat_names = ["alpha"]
test_model(name, feat_names)

100%|██████████| 14/14 [00:00<00:00, 75.28it/s]

          name       mse      rmse  accuracy
0  most_common  1.594156  1.262308  0.523571
1        naive  1.006339  1.003055   0.28008
2         base  0.687413  0.828871  0.555235
3       latent  0.680835  0.824916  0.563493





In [29]:
name = "base"
feat_names = ["alpha", "user", "cafe"]
test_model(name, feat_names)

100%|██████████| 14/14 [00:01<00:00, 10.55it/s]

          name       mse      rmse  accuracy
0  most_common  1.594156  1.262308  0.523571
1        naive  1.006339  1.003055   0.28008
2         base  0.687413  0.828869  0.555235
3       latent  0.680835  0.824916  0.563493





In [30]:
name = "latent"
feat_names = ["alpha", "user", "cafe"]
test_model(name, feat_names)

100%|██████████| 14/14 [00:01<00:00,  9.03it/s]

          name       mse      rmse  accuracy
0  most_common  1.594156  1.262308  0.523571
1        naive  1.006339  1.003055   0.28008
2         base  0.687413  0.828869  0.555235
3       latent  0.680835  0.824859  0.563493



