In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dianping/kg_final.txt
/kaggle/input/dianping/ratings_final.txt


In [2]:
## NDGC & RMSE

def dcg_k(r, k):
    """ Discounted Cumulative Gain (DGC)  
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG
    """
  
    r = np.asfarray(r)[:k]
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2)))      



def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG)
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG
    """
    dcg_max = dcg_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_k(r, k) / dcg_max

def mean_ndcg(rs):
    """Mean NDCG for all users
    Args:
        rs: Iterator / For each user: True Ratings in Predicted Rank Order
    Returns:
        Mean NDCG
    """
    return np.mean([ndcg_k(r, len(r)) for r in rs])

def rmse(y,h):
    """RMSE
    Args:
        y: real y
        h: predicted y
    Returns:
        RMSE
    """
    y = np.array(y)
    h = np.array(h)

    a = y-h

    return np.sqrt(sum(a**2)/len(a))


In [3]:
import logging
def load_data():
    logging.info("================== preparing data ===================")
    train_data, eval_data, test_data = load_rating()
    # item_triple_sets = kg_propagation(args, kg, item_init_entity_set, args.item_triple_set_size, False)
    return train_data, eval_data, test_data


def load_rating():
    rating_file = '/kaggle/input/dianping/ratings_final'
    logging.info("load rating file: %s.npy", rating_file)
    if os.path.exists(rating_file + '.npy'):
        rating_np = np.load(rating_file + '.npy')
    else:
        rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32)
        # np.save(rating_file + '.npy', rating_np)
    return dataset_split(rating_np)


def dataset_split(rating_np):
    logging.info("splitting dataset to 6:2:2 ...")
    # train:eval:test = 6:2:2
    np.random.seed(37)
    eval_ratio = 0.2
    test_ratio = 0.2
    n_ratings = rating_np.shape[0]
    
    eval_indices = np.random.choice(n_ratings, size=int(n_ratings * eval_ratio), replace=False)
    left = set(range(n_ratings)) - set(eval_indices)
    test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False)
    train_indices = list(left - set(test_indices))
    
    # user_init_entity_set, item_init_entity_set = collaboration_propagation(rating_np, train_indices)
    
    # train_indices = [i for i in train_indices if rating_np[i][0] in user_init_entity_set.keys()]
    # eval_indices = [i for i in eval_indices if rating_np[i][0] in user_init_entity_set.keys()]
    # test_indices = [i for i in test_indices if rating_np[i][0] in user_init_entity_set.keys()]
    train_data = rating_np[train_indices]
    eval_data = rating_np[eval_indices]
    test_data = rating_np[test_indices]
    
    return train_data, eval_data, test_data

In [4]:
train, eval, test = load_data()

train = train[:1000]

In [5]:

def matrix_factorization(train, num_users, num_foods, latent_dim=10, lr=0.01, reg=0.1, epochs=100):
    U = np.random.normal(scale=1.0/latent_dim, size=(num_users, latent_dim))
    F = np.random.normal(scale=1.0/latent_dim, size=(num_foods, latent_dim))

    # Training using SGD
    for epoch in range(epochs):
        np.random.shuffle(train) 

        for user_id, food_id, rating in train:
            user_id = int(user_id)  # Ensure indexing is an integer
            food_id = int(food_id)

            pred_rating = np.dot(U[user_id], F[food_id])
            error = rating - pred_rating

            U[user_id] += lr * (error * F[food_id] - reg * U[user_id])
            F[food_id] += lr * (error * U[user_id] - reg * F[food_id])

        loss = 0
        for user_id, food_id, rating in train:
            user_id = int(user_id)
            food_id = int(food_id)
            pred_rating = np.dot(U[user_id], F[food_id])
            error = rating - pred_rating
            loss += error**2 + reg * (np.linalg.norm(U[user_id])**2 + np.linalg.norm(F[food_id])**2)

        loss /= len(train)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

    return U, F



In [6]:
# Example usage
# train = np.array([
#     [0, 0, 5],
#     [0, 1, 3],
#     [1, 0, 4],
#     [1, 2, 1],
#     # ... (user_id, food_id, rating)
# ])
num_users = int(np.max(train[:, 0]) + 1)
num_foods = int(np.max(train[:, 1]) + 1)
latent_dim = 10
U, F = matrix_factorization(train, num_users, num_foods, latent_dim=latent_dim)


Epoch 1/100, Loss: 0.5163
Epoch 2/100, Loss: 0.5141
Epoch 3/100, Loss: 0.5121
Epoch 4/100, Loss: 0.5103
Epoch 5/100, Loss: 0.5086
Epoch 6/100, Loss: 0.5070
Epoch 7/100, Loss: 0.5055
Epoch 8/100, Loss: 0.5041
Epoch 9/100, Loss: 0.5027
Epoch 10/100, Loss: 0.5014
Epoch 11/100, Loss: 0.5001
Epoch 12/100, Loss: 0.4988
Epoch 13/100, Loss: 0.4975
Epoch 14/100, Loss: 0.4962
Epoch 15/100, Loss: 0.4948
Epoch 16/100, Loss: 0.4935
Epoch 17/100, Loss: 0.4922
Epoch 18/100, Loss: 0.4908
Epoch 19/100, Loss: 0.4894
Epoch 20/100, Loss: 0.4880
Epoch 21/100, Loss: 0.4865
Epoch 22/100, Loss: 0.4850
Epoch 23/100, Loss: 0.4835
Epoch 24/100, Loss: 0.4819
Epoch 25/100, Loss: 0.4802
Epoch 26/100, Loss: 0.4785
Epoch 27/100, Loss: 0.4767
Epoch 28/100, Loss: 0.4749
Epoch 29/100, Loss: 0.4730
Epoch 30/100, Loss: 0.4710
Epoch 31/100, Loss: 0.4689
Epoch 32/100, Loss: 0.4668
Epoch 33/100, Loss: 0.4646
Epoch 34/100, Loss: 0.4623
Epoch 35/100, Loss: 0.4599
Epoch 36/100, Loss: 0.4574
Epoch 37/100, Loss: 0.4548
Epoch 38/1

In [7]:
def infer(U, F, user_id, food_id, default_rating=0):
    if user_id < 0 or user_id >= U.shape[0] or food_id < 0 or food_id >= F.shape[0]:
        return default_rating
    return np.dot(U[user_id], F[food_id])


dev_y_pred = [infer(U, F, int(user_id), int(food_id)) for user_id, food_id, _ in eval]


In [8]:
def parameter_tuning(train, eval_set, test_set, latent_dims=[10], lrs=[0.01], regs=[0.1], epochs=100):
    best_model = None
    best_score = -np.inf  # Track the highest NDCG score
    best_params = {}
    
    # Grid search over parameter combinations
    for latent_dim in latent_dims:
        for lr in lrs:
            for reg in regs:
                # Train matrix factorization model
                U, F = matrix_factorization(train, num_users=np.max(train[:, 0]) + 1,
                                            num_foods=np.max(train[:, 1]) + 1,
                                            latent_dim=latent_dim, lr=lr, reg=reg, epochs=epochs)
                
                # Evaluate on dev set (assumes dev has the same structure as train)
                dev_y_true = [rating for _, _, rating in eval_set]
                dev_y_pred = [infer(U, F, int(user_id), int(food_id)) for user_id, food_id, _ in eval_set]
                dev_rmse = rmse(dev_y_true, dev_y_pred)
                
                # Calculate NDCG on eval set
                # Group eval_set by user and calculate NDCG per user
                eval_users = np.unique(eval_set[:, 0])
                ndcg_scores = []
                
                for user_id in eval_users:
                    user_ratings = eval_set[eval_set[:, 0] == user_id]
                    relevance = [rating for _, _, rating in user_ratings]
                    preds = [infer(U, F, int(user_id), int(food_id)) for _, food_id, _ in user_ratings]
                    
                    # Sort predictions by relevance
                    sorted_indices = np.argsort(preds)[::-1]
                    sorted_relevance = [relevance[i] for i in sorted_indices]
                    
                    # Calculate NDCG for this user
                    ndcg_scores.append(ndcg_k(sorted_relevance, k=10))
                
                mean_ndcg_score = np.mean(ndcg_scores)
                
                # Check if this model is the best one based on NDCG (or RMSE if preferred)
                if mean_ndcg_score > best_score:
                    best_score = mean_ndcg_score
                    best_model = (U, F)
                    best_params = {'latent_dim': latent_dim, 'lr': lr, 'reg': reg, 'epochs': epochs}
                    print(f"New best model with NDCG: {mean_ndcg_score:.4f}, RMSE: {dev_rmse:.4f}")

    # Final evaluation on the test set
    U, F = best_model
    test_y_true = [rating for _, _, rating in test_set]
    test_y_pred = [infer(U, F, int(user_id), int(food_id)) for user_id, food_id, _ in test_set]
    test_rmse = rmse(test_y_true, test_y_pred)
    
    # Group test set by user and calculate mean NDCG per user
    test_users = np.unique(test_set[:, 0])
    test_ndcg_scores = []

    for user_id in test_users:
        user_ratings = test_set[test_set[:, 0] == user_id]
        relevance = [rating for _, _, rating in user_ratings]
        preds = [infer(U, F, int(user_id), int(food_id)) for _, food_id, _ in user_ratings]

        sorted_indices = np.argsort(preds)[::-1]
        sorted_relevance = [relevance[i] for i in sorted_indices]
        test_ndcg_scores.append(ndcg_k(sorted_relevance, k=10))

    mean_test_ndcg = np.mean(test_ndcg_scores)

    print("\nFinal evaluation on the test set:")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Mean Test NDCG@10: {mean_test_ndcg:.4f}")
    print(f"Best parameters: {best_params}")
    
parameter_tuning(train, eval, test)

Epoch 1/100, Loss: 0.5184
Epoch 2/100, Loss: 0.5158
Epoch 3/100, Loss: 0.5135
Epoch 4/100, Loss: 0.5115
Epoch 5/100, Loss: 0.5097
Epoch 6/100, Loss: 0.5080
Epoch 7/100, Loss: 0.5064
Epoch 8/100, Loss: 0.5049
Epoch 9/100, Loss: 0.5034
Epoch 10/100, Loss: 0.5021
Epoch 11/100, Loss: 0.5007
Epoch 12/100, Loss: 0.4994
Epoch 13/100, Loss: 0.4981
Epoch 14/100, Loss: 0.4968
Epoch 15/100, Loss: 0.4956
Epoch 16/100, Loss: 0.4943
Epoch 17/100, Loss: 0.4930
Epoch 18/100, Loss: 0.4917
Epoch 19/100, Loss: 0.4904
Epoch 20/100, Loss: 0.4890
Epoch 21/100, Loss: 0.4876
Epoch 22/100, Loss: 0.4862
Epoch 23/100, Loss: 0.4848
Epoch 24/100, Loss: 0.4833
Epoch 25/100, Loss: 0.4817
Epoch 26/100, Loss: 0.4802
Epoch 27/100, Loss: 0.4785
Epoch 28/100, Loss: 0.4769
Epoch 29/100, Loss: 0.4751
Epoch 30/100, Loss: 0.4733
Epoch 31/100, Loss: 0.4714
Epoch 32/100, Loss: 0.4695
Epoch 33/100, Loss: 0.4675
Epoch 34/100, Loss: 0.4654
Epoch 35/100, Loss: 0.4632
Epoch 36/100, Loss: 0.4610
Epoch 37/100, Loss: 0.4586
Epoch 38/1