In [37]:
import pandas as pd
import numpy as np

# Create interaction data
interactions = [
    {'user_id': 'U1', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U1', 'product_id': 'P2', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U2', 'product_id': 'P3', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U2', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U3', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U3', 'product_id': 'P2', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U3', 'product_id': 'P5', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U4', 'product_id': 'P5', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U4', 'product_id': 'P6', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U5', 'product_id': 'P3', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U5', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5}
]
interactions_df = pd.DataFrame(interactions)

# Create user features
users = [
    {'user_id': 'U1', 'age': 25, 'preferred_category': 'electronics'},
    {'user_id': 'U2', 'age': 35, 'preferred_category': 'clothing'},
    {'user_id': 'U3', 'age': 28, 'preferred_category': 'electronics'},
    {'user_id': 'U4', 'age': 40, 'preferred_category': 'home_goods'},
    {'user_id': 'U5', 'age': 22, 'preferred_category': 'clothing'}
]
users_df = pd.DataFrame(users)

# Create product features
products = [
    {'product_id': 'P1', 'name': 'Smartphone', 'category': 'electronics', 'price': 500},
    {'product_id': 'P2', 'name': 'Laptop', 'category': 'electronics', 'price': 1000},
    {'product_id': 'P3', 'name': 'T-shirt', 'category': 'clothing', 'price': 20},
    {'product_id': 'P4', 'name': 'Jeans', 'category': 'clothing', 'price': 50},
    {'product_id': 'P5', 'name': 'Lamp', 'category': 'home_goods', 'price': 40},
    {'product_id': 'P6', 'name': 'Blender', 'category': 'home_goods', 'price': 80}
]
products_df = pd.DataFrame(products)

# Normalize numerical features
products_df['price_normalized'] = products_df['price'] / products_df['price'].max()
users_df['age_normalized'] = users_df['age'] / users_df['age'].max()

# One-hot encode categorical features
user_categories = pd.get_dummies(users_df['preferred_category'], prefix='prefers')
users_df = pd.concat([users_df, user_categories], axis=1)

product_categories = pd.get_dummies(products_df['category'], prefix='category')
products_df = pd.concat([products_df, product_categories], axis=1)


In [38]:
import networkx as nx
import torch
from torch_geometric.data import Data, HeteroData

# Create a bipartite graph
G = nx.Graph()

# Add user nodes with features
for _, user in users_df.iterrows():
    G.add_node(user['user_id'], 
               type='user',
               age=user['age_normalized'],
               prefers_electronics=user['prefers_electronics'] if 'prefers_electronics' in user else 0,
               prefers_clothing=user['prefers_clothing'] if 'prefers_clothing' in user else 0,
               prefers_home_goods=user['prefers_home_goods'] if 'prefers_home_goods' in user else 0)

# Add product nodes with features
for _, product in products_df.iterrows():
    G.add_node(product['product_id'], 
               type='product',
               price=product['price_normalized'],
               category_electronics=product['category_electronics'],
               category_clothing=product['category_clothing'],
               category_home_goods=product['category_home_goods'])

# Add edges based on interactions
for _, interaction in interactions_df.iterrows():
    G.add_edge(interaction['user_id'], interaction['product_id'], weight=interaction['weight'])

# Preparing data for PyTorch Geometric
# Create mappings for user and product IDs
user_mapping = {user_id: idx for idx, user_id in enumerate(G.nodes()) if G.nodes[user_id]['type'] == 'user'}
product_mapping = {prod_id: idx for idx, prod_id in enumerate(G.nodes()) if G.nodes[prod_id]['type'] == 'product'}

# Create edge index
edge_index = []
edge_weights = []

for u, v, data in G.edges(data=True):
    if G.nodes[u]['type'] == 'user':
        edge_index.append([user_mapping[u], product_mapping[v]])
    else:
        edge_index.append([user_mapping[v], product_mapping[u]])
    edge_weights.append(data['weight'])

edge_index = torch.tensor(edge_index, dtype=torch.long).t()
edge_weights = torch.tensor(edge_weights, dtype=torch.float)

# Create node features
user_features = torch.tensor([[G.nodes[u]['age'], 
                              G.nodes[u]['prefers_electronics'],
                              G.nodes[u]['prefers_clothing'],
                              G.nodes[u]['prefers_home_goods']] 
                             for u in user_mapping], dtype=torch.float)

product_features = torch.tensor([[G.nodes[p]['price'],
                                 G.nodes[p]['category_electronics'],
                                 G.nodes[p]['category_clothing'],
                                 G.nodes[p]['category_home_goods']] 
                                for p in product_mapping], dtype=torch.float)

# Create PyTorch Geometric data object
data = HeteroData()
data['user'].x = user_features
data['product'].x = product_features
data['user', 'interacts', 'product'].edge_index = edge_index
data['user', 'interacts', 'product'].edge_attr = edge_weights


In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import HeteroConv, GCNConv

class GraphSAGERecommender(nn.Module):
    def __init__(self, user_features, product_features, embedding_dim=64, hidden_dim=32):
        super(GraphSAGERecommender, self).__init__()
        
        # Input feature dimensions
        self.user_feature_dim = user_features.shape[1]
        self.product_feature_dim = product_features.shape[1]
        
        # First layer: transform node features into embeddings
        self.user_encoder = nn.Linear(self.user_feature_dim, embedding_dim)
        self.product_encoder = nn.Linear(self.product_feature_dim, embedding_dim)
        
        # GraphSAGE convolutional layers
        self.conv1 = HeteroConv({
            ('user', 'interacts', 'product'): SAGEConv(embedding_dim, hidden_dim),
            ('product', 'rev_interacts', 'user'): SAGEConv(embedding_dim, hidden_dim)
        })
        
        self.conv2 = HeteroConv({
            ('user', 'interacts', 'product'): SAGEConv(hidden_dim, hidden_dim),
            ('product', 'rev_interacts', 'user'): SAGEConv(hidden_dim, hidden_dim)
        })
        
        # Final prediction layer
        self.predictor = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, x_dict, edge_indices_dict):
        # Initial embeddings from features
        x_dict['user'] = F.relu(self.user_encoder(x_dict['user']))
        x_dict['product'] = F.relu(self.product_encoder(x_dict['product']))
        
        # First message passing layer
        x_dict = self.conv1(x_dict, edge_indices_dict)
        x_dict = {key: F.relu(value) for key, value in x_dict.items()}
        
        # Second message passing layer
        x_dict = self.conv2(x_dict, edge_indices_dict)
        x_dict = {key: F.relu(value) for key, value in x_dict.items()}
        
        return x_dict
    
    def predict(self, user_emb, product_emb):
        # Concatenate user and product embeddings
        pair_emb = torch.cat([user_emb, product_emb], dim=1)
        # Predict interaction probability
        return torch.sigmoid(self.predictor(pair_emb))
# 

In [40]:
import torch.optim as optim
from torch_geometric.loader import NeighborLoader

def train_model(model, data, epochs=100, lr=0.01, batch_size=64):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    
    # Split data into train/validation
    # (For simplicity, we're using all data for training in this example)
    # In a real scenario, you would use proper train/val/test splits
    
    train_loader = NeighborLoader(
        data,
        num_neighbors={
            ('user', 'interacts', 'product'): [5],
            ('product', 'rev_interacts', 'user'): [5]
        },
        batch_size=batch_size,
        input_nodes=('user', None)
    )
    
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            embeddings = model(batch.x_dict, batch.edge_index_dict)
            
            # Get user-product pairs from batch
            user_idx, product_idx = batch.edge_index_dict[('user', 'interacts', 'product')]
            
            # Get embeddings for the connected pairs
            user_embs = embeddings['user'][user_idx]
            product_embs = embeddings['product'][product_idx]
            
            # Predict interactions for these pairs
            pred = model.predict(user_embs, product_embs).squeeze()
            
            # Ground truth - edge weights as target (1.0 for purchase, 0.5 for view)
            target = batch.edge_attr_dict[('user', 'interacts', 'product')].float()
            
            # Compute loss
            loss = criterion(pred, target)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')
    
    return model

In [41]:
def get_recommendations(model, data, user_id, user_mapping, product_mapping, interactions_df, top_k=5):
    # Set model to evaluation mode
    model.eval()
    
    # Get index for the specified user ID
    user_idx = list(user_mapping.keys()).index(user_id)
    
    # Run model forward pass to get embeddings
    with torch.no_grad():
        embeddings = model(data.x_dict, data.edge_index_dict)
    
    # Get the embedding for our target user
    user_embedding = embeddings['user'][user_idx].unsqueeze(0)
    
    # Calculate scores for all products
    product_embeddings = embeddings['product']
    
    # For each product, calculate prediction score
    scores = []
    for prod_idx in range(len(product_mapping)):
        prod_embedding = product_embeddings[prod_idx].unsqueeze(0)
        score = model.predict(user_embedding, prod_embedding).item()
        product_id = list(product_mapping.keys())[prod_idx]
        scores.append((product_id, score))
    
    # Sort by score (descending) and get top-k
    scores.sort(key=lambda x: x[1], reverse=True)
    
    # Filter out products the user has already interacted with
    existing_interactions = set(interactions_df[interactions_df['user_id'] == user_id]['product_id'])
    recommendations = [(prod_id, score) for prod_id, score in scores if prod_id not in existing_interactions][:top_k]
    
    return recommendations

In [42]:
def evaluate_model(model, data, test_interactions):
    model.eval()
    
    # Metrics
    precision_k = 0
    recall_k = 0
    ndcg_k = 0
    k = 10  # top-k recommendations
    
    # Get all user embeddings
    with torch.no_grad():
        embeddings = model(data.x_dict, data.edge_index_dict)
    
    # Calculate metrics for each user
    for user_id in user_mapping.keys():
        # Get ground truth interactions from test set
        true_interactions = set(test_interactions[test_interactions['user_id'] == user_id]['product_id'])
        
        if not true_interactions:
            continue  # Skip users with no test interactions
        
        # Get recommendations
        recommended_products = get_recommendations(model, data, user_id, top_k=k)
        
        # Calculate precision@k
        hits = len(set(recommended_products) & true_interactions)
        precision_k += hits / k
        
        # Calculate recall@k
        recall_k += hits / len(true_interactions)
        
        # NDCG calculation would go here
        
    # Average metrics
    num_users = len(user_mapping)
    precision_k /= num_users
    recall_k /= num_users
    
    return {
        f'precision@{k}': precision_k,
        f'recall@{k}': recall_k
    }


In [43]:
import pandas as pd

# Simulate a database with interactions, users, and products data
# Interactions table
interactions_data = [
    {'user_id': 'U1', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U1', 'product_id': 'P2', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U2', 'product_id': 'P3', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U2', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U3', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U3', 'product_id': 'P2', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U3', 'product_id': 'P5', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U4', 'product_id': 'P5', 'interaction_type': 'purchase', 'weight': 1.0},
    {'user_id': 'U4', 'product_id': 'P6', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U5', 'product_id': 'P3', 'interaction_type': 'view', 'weight': 0.5},
    {'user_id': 'U5', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5}
]
interactions_df = pd.DataFrame(interactions_data)

# Users table
users_data = [
    {'user_id': 'U1', 'age': 25, 'preferred_category': 'electronics'},
    {'user_id': 'U2', 'age': 35, 'preferred_category': 'clothing'},
    {'user_id': 'U3', 'age': 28, 'preferred_category': 'electronics'},
    {'user_id': 'U4', 'age': 40, 'preferred_category': 'home_goods'},
    {'user_id': 'U5', 'age': 22, 'preferred_category': 'clothing'}
]
users_df = pd.DataFrame(users_data)

# Products table
products_data = [
    {'product_id': 'P1', 'name': 'Smartphone', 'category': 'electronics', 'price': 500},
    {'product_id': 'P2', 'name': 'Laptop', 'category': 'electronics', 'price': 1000},
    {'product_id': 'P3', 'name': 'T-shirt', 'category': 'clothing', 'price': 20},
    {'product_id': 'P4', 'name': 'Jeans', 'category': 'clothing', 'price': 50},
    {'product_id': 'P5', 'name': 'Lamp', 'category': 'home_goods', 'price': 40},
    {'product_id': 'P6', 'name': 'Blender', 'category': 'home_goods', 'price': 80}
]
products_df = pd.DataFrame(products_data)

# Display the simulated database
print("Interactions DataFrame:")
print(interactions_df.head())
print("\nUsers DataFrame:")
print(users_df.head())
print("\nProducts DataFrame:")
print(products_df.head())


Interactions DataFrame:
  user_id product_id interaction_type  weight
0      U1         P1         purchase     1.0
1      U1         P2             view     0.5
2      U2         P3         purchase     1.0
3      U2         P4             view     0.5
4      U3         P1         purchase     1.0

Users DataFrame:
  user_id  age preferred_category
0      U1   25        electronics
1      U2   35           clothing
2      U3   28        electronics
3      U4   40         home_goods
4      U5   22           clothing

Products DataFrame:
  product_id        name     category  price
0         P1  Smartphone  electronics    500
1         P2      Laptop  electronics   1000
2         P3     T-shirt     clothing     20
3         P4       Jeans     clothing     50
4         P5        Lamp   home_goods     40


In [44]:
def prepare_data():
    import pandas as pd
    
    # Simulated database (already created in the previous step)
    interactions_data = [
        {'user_id': 'U1', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
        {'user_id': 'U1', 'product_id': 'P2', 'interaction_type': 'view', 'weight': 0.5},
        {'user_id': 'U2', 'product_id': 'P3', 'interaction_type': 'purchase', 'weight': 1.0},
        {'user_id': 'U2', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5},
        {'user_id': 'U3', 'product_id': 'P1', 'interaction_type': 'purchase', 'weight': 1.0},
        {'user_id': 'U3', 'product_id': 'P2', 'interaction_type': 'purchase', 'weight': 1.0},
        {'user_id': 'U3', 'product_id': 'P5', 'interaction_type': 'view', 'weight': 0.5},
        {'user_id': 'U4', 'product_id': 'P5', 'interaction_type': 'purchase', 'weight': 1.0},
        {'user_id': 'U4', 'product_id': 'P6', 'interaction_type': 'view', 'weight': 0.5},
        {'user_id': 'U5', 'product_id': 'P3', 'interaction_type': 'view', 'weight': 0.5},
        {'user_id': 'U5', 'product_id': 'P4', 'interaction_type': 'view', 'weight': 0.5}
    ]
    interactions_df = pd.DataFrame(interactions_data)

    users_data = [
        {'user_id': 'U1', 'age': 25, 'preferred_category': 'electronics'},
        {'user_id': 'U2', 'age': 35, 'preferred_category': 'clothing'},
        {'user_id': 'U3', 'age': 28, 'preferred_category': 'electronics'},
        {'user_id': 'U4', 'age': 40, 'preferred_category': 'home_goods'},
        {'user_id': 'U5', 'age': 22, 'preferred_category': 'clothing'}
    ]
    users_df = pd.DataFrame(users_data)

    products_data = [
        {'product_id': 'P1', 'name': 'Smartphone', 'category': 'electronics', 'price': 500},
        {'product_id': 'P2', 'name': 'Laptop', 'category': 'electronics', 'price': 1000},
        {'product_id': 'P3', 'name': 'T-shirt', 'category': 'clothing', 'price': 20},
        {'product_id': 'P4', 'name': 'Jeans', 'category': 'clothing', 'price': 50},
        {'product_id': 'P5', 'name': 'Lamp', 'category': 'home_goods', 'price': 40},
        {'product_id': 'P6', 'name': 'Blender', 'category': 'home_goods', 'price': 80}
    ]
    products_df = pd.DataFrame(products_data)

    # Normalize numerical features
    products_df['price_normalized'] = products_df['price'] / products_df['price'].max()
    users_df['age_normalized'] = users_df['age'] / users_df['age'].max()

    # One-hot encode categorical features
    user_categories = pd.get_dummies(users_df['preferred_category'], prefix='prefers')
    users_df = pd.concat([users_df, user_categories], axis=1)

    product_categories = pd.get_dummies(products_df['category'], prefix='category')
    products_df = pd.concat([products_df, product_categories], axis=1)

    return interactions_df, users_df, products_df

def build_graph(interactions_df, users_df, products_df):
    import networkx as nx
    import torch
    from torch_geometric.data import HeteroData
    
    # Create a bipartite graph
    G = nx.Graph()
    
    # Add user nodes with features
    for _, user in users_df.iterrows():
        G.add_node(user['user_id'], 
                   type='user',
                   age=user['age_normalized'],
                   prefers_clothing=user['prefers_clothing'],
                   prefers_electronics=user['prefers_electronics'],
                   prefers_home_goods=user['prefers_home_goods'])
    
    # Add product nodes with features
    for _, product in products_df.iterrows():
        G.add_node(product['product_id'], 
                   type='product',
                   price=product['price_normalized'],
                   category_clothing=product['category_clothing'],
                   category_electronics=product['category_electronics'],
                   category_home_goods=product['category_home_goods'])
    
    # Add edges based on interactions
    for _, interaction in interactions_df.iterrows():
        G.add_edge(interaction['user_id'], interaction['product_id'], weight=interaction['weight'])
    
    # Preparing data for PyTorch Geometric
    user_mapping = {user_id: idx for idx, user_id in enumerate([n for n, d in G.nodes(data=True) if d['type'] == 'user'])}
    product_mapping = {prod_id: idx for idx, prod_id in enumerate([n for n, d in G.nodes(data=True) if d['type'] == 'product'])}
    
    edge_index = []
    edge_weights = []
    
    for u, v, data in G.edges(data=True):
        if G.nodes[u]['type'] == 'user':
            edge_index.append([user_mapping[u], product_mapping[v]])
        else:
            edge_index.append([user_mapping[v], product_mapping[u]])
        edge_weights.append(data['weight'])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t()
    edge_weights = torch.tensor(edge_weights, dtype=torch.float)
    
    user_features = torch.tensor([[G.nodes[u]['age'], 
                                  G.nodes[u]['prefers_clothing'],
                                  G.nodes[u]['prefers_electronics'],
                                  G.nodes[u]['prefers_home_goods']] 
                                 for u in user_mapping], dtype=torch.float)
    
    product_features = torch.tensor([[G.nodes[p]['price'],
                                     G.nodes[p]['category_clothing'],
                                     G.nodes[p]['category_electronics'],
                                     G.nodes[p]['category_home_goods']] 
                                    for p in product_mapping], dtype=torch.float)
    
    data = HeteroData()
    data['user'].x = user_features
    data['product'].x = product_features
    
    # Add forward edges (user to product)
    data['user', 'interacts', 'product'].edge_index = edge_index
    data['user', 'interacts', 'product'].edge_attr = edge_weights
    
    # Add reverse edges (product to user) - THIS IS THE KEY FIX
    data['product', 'rev_interacts', 'user'].edge_index = torch.stack([edge_index[1], edge_index[0]], dim=0)
    data['product', 'rev_interacts', 'user'].edge_attr = edge_weights
    
    # Return all necessary objects
    return data, user_mapping, product_mapping, interactions_df

In [45]:
def main():
    # Step 1: Prepare data
    print("Preparing data...")
    interactions_df, users_df, products_df = prepare_data()
    
    # Step 2: Build graph
    print("Building graph...")
    data, user_mapping, product_mapping, interactions_df = build_graph(
        interactions_df, users_df, products_df
    )
    
    # Step 3: Create model
    print("Creating model...")
    model = GraphSAGERecommender(data['user'].x, data['product'].x)
    
    # Step 4: Train model
    print("Training model...")
    model = train_model(model, data, epochs=100)
    
    # Step 5: Generate recommendations for each user
    print("\nGenerating recommendations:")
    for user_id in user_mapping:
        recommendations = get_recommendations(
            model, data, user_id, user_mapping, product_mapping, interactions_df
        )
        
        # Display recommendations with product names
        print(f"\nRecommendations for {user_id}:")
        for i, (prod_id, score) in enumerate(recommendations, 1):
            product_name = products_df.loc[products_df['product_id'] == prod_id, 'name'].values[0]
            print(f"  {i}. {product_name} (score: {score:.4f})")

if __name__ == "__main__":
    main()

Preparing data...
Building graph...
Creating model...
Training model...
Epoch 10/100, Loss: nan
Epoch 20/100, Loss: nan
Epoch 30/100, Loss: nan
Epoch 40/100, Loss: nan
Epoch 50/100, Loss: nan
Epoch 60/100, Loss: nan
Epoch 70/100, Loss: nan
Epoch 80/100, Loss: nan
Epoch 90/100, Loss: nan
Epoch 100/100, Loss: nan

Generating recommendations:

Recommendations for U1:
  1. Lamp (score: 0.5032)
  2. Jeans (score: 0.5020)
  3. T-shirt (score: 0.5020)
  4. Blender (score: 0.5016)

Recommendations for U2:
  1. Lamp (score: 0.5009)
  2. Laptop (score: 0.5007)
  3. Blender (score: 0.5003)
  4. Smartphone (score: 0.5002)

Recommendations for U3:
  1. Jeans (score: 0.5027)
  2. T-shirt (score: 0.5027)
  3. Blender (score: 0.5022)

Recommendations for U4:
  1. Laptop (score: 0.5006)
  2. Smartphone (score: 0.5005)
  3. Jeans (score: 0.4995)
  4. T-shirt (score: 0.4995)

Recommendations for U5:
  1. Lamp (score: 0.5013)
  2. Laptop (score: 0.5010)
  3. Smartphone (score: 0.5006)
  4. Blender (score: