# Simple Hybrid recommendation

## Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import json
from sklearn.model_selection import train_test_split


## Data preprocessing

In [2]:
# Load and preprocess data
def preprocess_data(file_path):
    """Load the review CSV and prepare it for the hybrid recommender.

    The file must contain the columns ``author_id``, ``hotel_id`` and
    ``property_dict``; ``property_dict`` is expected to hold a JSON object
    (attribute name -> value) per row.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A tuple ``(df, num_users, num_items, num_attributes)`` where

        * ``df`` is the loaded frame with ``author_id`` / ``hotel_id`` replaced
          by 0-based indices, ``property_dict`` replaced by the parsed dict,
          and one extra column per attribute key appended at the end
          (missing attributes are filled with 0),
        * ``num_users`` / ``num_items`` are the numbers of distinct users and
          hotels,
        * ``num_attributes`` is the number of appended attribute columns.

    Raises:
        json.JSONDecodeError: If a ``property_dict`` string is not valid JSON.
    """
    df = pd.read_csv(file_path)

    # Map author_id (user) and hotel_id (item) to contiguous indices, numbered
    # in order of first appearance in the file.
    user_mapping = {raw_id: idx for idx, raw_id in enumerate(df['author_id'].unique())}
    item_mapping = {raw_id: idx for idx, raw_id in enumerate(df['hotel_id'].unique())}

    df['author_id'] = df['author_id'].map(user_mapping)
    df['hotel_id'] = df['hotel_id'].map(item_mapping)

    def _parse_properties(raw):
        """Return the attribute dict for one cell; anything unusable becomes {}."""
        if not isinstance(raw, str):
            # Missing cells are read by pandas as NaN (a float), not a string.
            return {}
        parsed = json.loads(raw)
        # Valid JSON that is not an object (list, number, ...) carries no
        # named attributes, so it must not be expanded into columns.
        return parsed if isinstance(parsed, dict) else {}

    # Extract metadata from property_dict
    df['property_dict'] = df['property_dict'].apply(_parse_properties)

    # Build the attribute table in one shot from the list of dicts; this
    # avoids constructing a separate Series for every row.
    metadata = pd.DataFrame(df['property_dict'].tolist(), index=df.index).fillna(0)
    df = pd.concat([df, metadata], axis=1)

    num_users = len(user_mapping)
    num_items = len(item_mapping)
    num_attributes = metadata.shape[1]

    return df, num_users, num_items, num_attributes

# Input CSV, given relative to the directory the notebook is run from
file_path = "../data/combined_filtered_reviews.csv"
preprocessed = preprocess_data(file_path)
df, num_users, num_items, num_attributes = preprocessed


## Dataset and Dataloader

In [3]:
# Define Dataset
class HybridDataset(Dataset):
    """Torch dataset yielding ``(user, item, rating, attributes)`` per row.

    Args:
        df: Frame with ``author_id``, ``hotel_id`` and ``rating`` columns; the
            attribute (metadata) columns are every column from position
            ``attribute_start`` to the end of the frame.
        attribute_start: Positional index of the first attribute column.
            Defaults to 7, the layout this notebook assumes.

    Raises:
        ValueError: If an attribute column cannot be converted to float32.
    """

    def __init__(self, df, attribute_start=7):
        self.users = torch.tensor(df['author_id'].values, dtype=torch.long)
        self.items = torch.tensor(df['hotel_id'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

        attribute_frame = df.iloc[:, attribute_start:]
        try:
            # Convert through pandas: columns of mixed dtype (e.g. bool next to
            # float) would otherwise produce an object array that torch.tensor
            # cannot consume.
            attribute_values = attribute_frame.to_numpy(dtype='float32')
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Attribute columns {list(attribute_frame.columns)} must all be "
                "numeric or boolean"
            ) from exc
        self.attributes = torch.tensor(attribute_values, dtype=torch.float32)

    def __len__(self):
        """Number of rows (interactions) in the dataset."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating, attributes)`` tensors at ``idx``."""
        return self.users[idx], self.items[idx], self.ratings[idx], self.attributes[idx]

# Hold out 20% of the rows for testing; random_state pins the split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, test_dataset = HybridDataset(train_df), HybridDataset(test_df)

# Training batches are reshuffled on every pass; test batches keep their order
train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=256, shuffle=False)


## Hybrid model definition

In [4]:
# Define Hybrid Model
class HybridNCF(nn.Module):
    """Hybrid recommender: user/item embeddings combined with item metadata.

    A user embedding, an item embedding and a 32-dimensional encoding of the
    metadata vector are concatenated and passed through an MLP that outputs
    one predicted rating per sample.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        num_attributes: Length of the metadata vector fed to ``metadata_fc``.
        latent_dim: Size of each user / item embedding vector.
    """

    def __init__(self, num_users, num_items, num_attributes, latent_dim=50):
        super().__init__()
        # Embedding layers
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)

        # Metadata processing: num_attributes -> 64 -> 32
        self.metadata_fc = nn.Sequential(
            nn.Linear(num_attributes, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Fully connected layers for recommendation; the input is the
        # concatenation of both embeddings and the 32-d metadata encoding.
        self.fc = nn.Sequential(
            nn.Linear(latent_dim * 2 + 32, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, user, item, metadata):
        """Predict one rating per (user, item, metadata) sample.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.
            metadata: Float tensor whose last dimension is ``num_attributes``
                and whose leading dimensions match ``user``.

        Returns:
            Tensor of predictions with the same shape as ``user``.
        """
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        metadata_emb = self.metadata_fc(metadata)
        x = torch.cat([user_emb, item_emb, metadata_emb], dim=-1)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # sample, leaving a 0-dim tensor that no longer matches the targets.
        return self.fc(x).squeeze(-1)


## Training setup (CUDA when available, otherwise CPU)

In [5]:
# Seed PyTorch's random number generators before the model is built, so that
# weight initialisation, batch shuffling and dropout repeat from run to run
# (as far as the backend allows; some GPU kernels are not deterministic).
torch.manual_seed(42)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HybridNCF(num_users, num_items, num_attributes).to(device)

# Mean squared error between predicted and observed ratings, minimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


## Training Loop

In [6]:
epochs = 10
for epoch in range(epochs):
    model.train()  # switch to training mode before each pass over the data
    train_loss = 0
    for batch in train_loader:
        # Every batch is (user, item, rating, metadata); move all four to the device
        user, item, rating, metadata = (tensor.to(device) for tensor in batch)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        predictions = model(user, item, metadata)
        loss = criterion(predictions, rating)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Report the average of the per-batch losses for this epoch
    train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}")


Epoch 1/10, Train Loss: 0.5472
Epoch 2/10, Train Loss: 0.4061
Epoch 3/10, Train Loss: 0.3755
Epoch 4/10, Train Loss: 0.3597
Epoch 5/10, Train Loss: 0.3499
Epoch 6/10, Train Loss: 0.3415
Epoch 7/10, Train Loss: 0.3345
Epoch 8/10, Train Loss: 0.3274
Epoch 9/10, Train Loss: 0.3201
Epoch 10/10, Train Loss: 0.3122


## Evaluation

In [7]:
def evaluate_model(model, data_loader):
    """Return the mean per-sample loss of ``model`` over ``data_loader``.

    Uses the module-level ``criterion`` and ``device``. Each batch loss is
    weighted by the number of samples in the batch, so a smaller final batch
    does not count as much as a full one. This assumes ``criterion`` returns
    the mean over the batch, which is the default for ``nn.MSELoss()``.

    Args:
        model: Module called as ``model(user, item, metadata)``.
        data_loader: Iterable of ``(user, item, rating, metadata)`` batches.

    Returns:
        The sample-weighted average loss as a Python float.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    model.eval()  # evaluation mode: dropout is disabled
    weighted_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for user, item, rating, metadata in data_loader:
            user, item, rating, metadata = user.to(device), item.to(device), rating.to(device), metadata.to(device)
            predictions = model(user, item, metadata)
            loss = criterion(predictions, rating)
            batch_size = len(rating)
            # Undo the per-batch mean so every sample carries equal weight
            weighted_loss += loss.item() * batch_size
            total_samples += batch_size
    return weighted_loss / total_samples

# Loss of the trained model on the held-out test batches
test_loss = evaluate_model(model=model, data_loader=test_loader)
print(f"Test Loss (MSE): {test_loss:.4f}")


Test Loss (MSE): 0.5191


## RMSE

In [8]:
import math

def compute_rmse(model, data_loader):
    """Root mean squared error of ``model`` over every sample in ``data_loader``.

    Squared errors are summed across all batches and divided by the total
    sample count before taking the square root. Batches are moved to the
    module-level ``device``.

    Args:
        model: Module called as ``model(user, item, metadata)``.
        data_loader: Iterable of ``(user, item, rating, metadata)`` batches.

    Returns:
        The RMSE as a Python float.
    """
    model.eval()
    squared_error_sum = 0
    n_seen = 0
    with torch.no_grad():
        for batch in data_loader:
            user, item, rating, metadata = (tensor.to(device) for tensor in batch)
            residual = model(user, item, metadata) - rating
            squared_error_sum += residual.pow(2).sum().item()
            n_seen += len(rating)
    return math.sqrt(squared_error_sum / n_seen)

# RMSE of the trained model on the held-out test batches
test_rmse = compute_rmse(model=model, data_loader=test_loader)
print(f"Test RMSE: {test_rmse:.4f}")


Test RMSE: 0.7205


## Export model

In [9]:
# Save only the learned parameters (the state_dict), not the module object itself
state = model.state_dict()
torch.save(state, "hybrid_model.pth")
print("Model saved as 'hybrid_model.pth'")


Model saved as 'hybrid_model.pth'
