# Neural Collaborative Filtering (NCF)

## Imports

In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import csv
from tqdm import tqdm
import math

## Preprocess data

In [5]:
# Load and preprocess data
def load_data(file_path):
    """Read the reviews CSV and re-encode user and item ids as dense indices.

    Args:
        file_path: Path of a CSV file that contains at least the
            ``author_id`` (user) and ``hotel_id`` (item) columns.

    Returns:
        A ``(df, num_users, num_items)`` tuple: the loaded frame with both id
        columns replaced by 0-based indices assigned in order of first
        appearance, followed by the number of distinct users and the number
        of distinct items.
    """
    reviews = pd.read_csv(file_path)

    # One lookup table per id column: raw id -> position of first appearance.
    lookups = {}
    for column in ('author_id', 'hotel_id'):
        lookups[column] = {
            raw_id: position
            for position, raw_id in enumerate(reviews[column].unique())
        }
        reviews[column] = reviews[column].map(lookups[column])

    return reviews, len(lookups['author_id']), len(lookups['hotel_id'])

# Load the data: `df` holds the index-encoded reviews, and the two counts are
# later used as the embedding table sizes of the model.
data_file = "../data/combined_filtered_reviews.csv"
df, num_users, num_items = load_data(file_path=data_file)
print(f"Number of users: {num_users}, Number of items: {num_items}")


Number of users: 189992, Number of items: 329340


## NCFDataset and train/test split

In [7]:
# Define Dataset and DataLoader
class NCFDataset(Dataset):
    """Map-style dataset that serves (user, item, rating) triples as tensors.

    The ``author_id`` and ``hotel_id`` columns of the frame are stored as
    int64 tensors and the ``rating`` column as a float32 tensor.
    """

    def __init__(self, df):
        def column_tensor(name, dtype):
            # Copy one frame column into a tensor of the requested dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column_tensor('author_id', torch.long)
        self.items = column_tensor('hotel_id', torch.long)
        self.ratings = column_tensor('rating', torch.float32)

    def __len__(self):
        # All three tensors are built from the same frame, so any one of them
        # gives the number of interactions.
        return self.users.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

# Split data into train and test sets (80% / 20%)
dataset = NCFDataset(df)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so it is identical on every run. The evaluation cell reloads
# weights from disk; with an unseeded split, re-running the notebook after a
# kernel restart would draw a different test set that can overlap the rows the
# saved weights were trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Only the training loader is shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)
print("Data split into train and test sets.")


Data split into train and test sets.


## NCF Model

In [9]:
# Create the NCF model
class NCF(nn.Module):
    """Neural collaborative filtering model that predicts one score per pair.

    A user embedding and an item embedding are concatenated and passed
    through a three-layer MLP (with ReLU and dropout) ending in a single
    output unit.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Size of each embedding vector.
    """

    def __init__(self, num_users, num_items, latent_dim=50):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)

        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(latent_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        """Return the predicted score for each (user, item) index pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of predictions with the same shape as ``user``.
        """
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=-1)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # sample, so the output would no longer match the target's shape.
        return self.fc(x).squeeze(-1)

# Initialize the model on the GPU when one is available, otherwise on the CPU
latent_dim = 50
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = NCF(num_users, num_items, latent_dim)
model = model.to(device)
print(model)


NCF(
  (user_embedding): Embedding(189992, 50)
  (item_embedding): Embedding(329340, 50)
  (fc): Sequential(
    (0): Linear(in_features=100, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)


## Training setup and loop

In [11]:
# Define training setup: Adam over all model parameters, mean-squared-error
# between predicted and observed ratings as the objective.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()


In [12]:
epochs = 10
# Per-epoch loss history, one entry appended to each list per epoch.
train_losses, val_losses = [], []

# NOTE(review): in the logged run, validation loss bottoms out at epoch 5
# (0.7721) and rises afterwards while training loss keeps falling, so the
# weights left in `model` after the last epoch are past the best point.
# Consider early stopping or keeping the best epoch's weights.
for epoch in range(epochs):
    # Training pass: dropout active, one optimizer step per batch.
    model.train()
    train_loss = 0.0
    for user, item, rating in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        user, item, rating = user.to(device), item.to(device), rating.to(device)

        optimizer.zero_grad()
        preds = model(user, item)
        loss = criterion(preds, rating)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Mean of the per-batch losses.
    train_loss /= len(train_loader)

    # Validation step: dropout disabled, no gradients tracked.
    # The held-out `test_loader` split doubles as the validation set here.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user, item, rating in test_loader:
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            preds = model(user, item)
            val_loss += criterion(preds, rating).item()
    val_loss /= len(test_loader)

    # Record the history; previously the lists were created but never filled.
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")


Epoch 1/10: 100%|██████████| 23062/23062 [02:05<00:00, 184.15it/s]


Epoch 1/10 - Train Loss: 1.0264, Validation Loss: 0.8603


Epoch 2/10: 100%|██████████| 23062/23062 [02:04<00:00, 184.99it/s]


Epoch 2/10 - Train Loss: 0.8158, Validation Loss: 0.7994


Epoch 3/10: 100%|██████████| 23062/23062 [02:05<00:00, 184.25it/s]


Epoch 3/10 - Train Loss: 0.7618, Validation Loss: 0.7830


Epoch 4/10: 100%|██████████| 23062/23062 [02:05<00:00, 184.31it/s]


Epoch 4/10 - Train Loss: 0.7385, Validation Loss: 0.7757


Epoch 5/10: 100%|██████████| 23062/23062 [02:05<00:00, 183.57it/s]


Epoch 5/10 - Train Loss: 0.7241, Validation Loss: 0.7721


Epoch 6/10: 100%|██████████| 23062/23062 [02:05<00:00, 183.86it/s]


Epoch 6/10 - Train Loss: 0.7117, Validation Loss: 0.7739


Epoch 7/10: 100%|██████████| 23062/23062 [02:04<00:00, 184.83it/s]


Epoch 7/10 - Train Loss: 0.6983, Validation Loss: 0.7782


Epoch 8/10: 100%|██████████| 23062/23062 [02:05<00:00, 183.64it/s]


Epoch 8/10 - Train Loss: 0.6840, Validation Loss: 0.7857


Epoch 9/10: 100%|██████████| 23062/23062 [02:05<00:00, 183.40it/s]


Epoch 9/10 - Train Loss: 0.6684, Validation Loss: 0.7866


Epoch 10/10: 100%|██████████| 23062/23062 [02:05<00:00, 184.27it/s]


Epoch 10/10 - Train Loss: 0.6523, Validation Loss: 0.7912


In [13]:
# Save the model: only the parameter tensors (state dict) are written, so the
# NCF class is needed again to rebuild the network when loading.
torch.save(obj=model.state_dict(), f="ncf_model.pth")
print("Model saved as 'ncf_model.pth'")


Model saved as 'ncf_model.pth'


## MSE and RMSE

In [None]:
# Reload the trained model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
state_dict = torch.load("ncf_model.pth", map_location=device, weights_only=True)
# Rebuild the architecture with the same embedding size used for training.
loaded_model = NCF(num_users, num_items, latent_dim=latent_dim)  # model architecture
loaded_model.load_state_dict(state_dict)
loaded_model.to(device)
loaded_model.eval()  # Set to evaluation mode

def compute_rmse(model, data_loader, device=None):
    """Compute the root-mean-squared error of ``model`` over ``data_loader``.

    Squared errors are summed over every sample and divided by the total
    sample count, so a smaller final batch is weighted correctly.

    Args:
        model: Module called as ``model(user, item)`` that returns one
            prediction per sample.
        data_loader: Iterable yielding ``(user, item, rating)`` tensor batches.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has no parameters).

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    squared_error = 0.0
    total_samples = 0

    with torch.no_grad():
        for user, item, rating in data_loader:
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            # Align prediction and target shapes so the subtraction can never
            # silently broadcast (e.g. a (B, 1) output against a (B,) target).
            predictions = model(user, item).reshape(rating.shape)
            squared_error += torch.sum((predictions - rating) ** 2).item()
            total_samples += rating.numel()

    if total_samples == 0:
        raise ValueError("data_loader yielded no samples; RMSE is undefined")

    return math.sqrt(squared_error / total_samples)


def evaluate_model(model, data_loader, device=None, loss_fn=None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The result is the average of the batch losses (not a per-sample
    average), matching how the training loop reports validation loss.

    Args:
        model: Module called as ``model(user, item)`` that returns one
            prediction per sample.
        data_loader: Iterable yielding ``(user, item, rating)`` tensor batches.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has no parameters).
        loss_fn: Callable ``loss_fn(predictions, rating)`` returning a scalar
            tensor. Defaults to the module-level ``criterion``.

    Returns:
        The averaged loss as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if loss_fn is None:
        # Fall back to the loss object defined in the training-setup cell.
        loss_fn = criterion
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for user, item, rating in data_loader:
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            # Align shapes so the loss never broadcasts prediction vs. target.
            predictions = model(user, item).reshape(rating.shape)
            total_loss += loss_fn(predictions, rating).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("data_loader yielded no batches; loss is undefined")

    return total_loss / num_batches

# Score the reloaded weights on the held-out split: first the averaged batch
# loss, then the per-sample RMSE.
test_loss = evaluate_model(model=loaded_model, data_loader=test_loader)
print(f"Test Loss (MSE): {test_loss:.4f}")

test_rmse = compute_rmse(model=loaded_model, data_loader=test_loader)
print(f"Test RMSE: {test_rmse:.4f}")

Test Loss (MSE): 0.7912
