In [2]:
pip install torch

Collecting torch
  Using cached torch-2.3.1-cp312-cp312-win_amd64.whl.metadata (26 kB)
Collecting filelock (from torch)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.0-py3-none-any.whl.metadata (12 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading tbb-2021.13.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl.metadata (3.1 kB)
Coll

In [3]:
pip install torch torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric 

Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
     ---------------------------------------- 0.0/210.0 kB ? eta -:--:--
     -------------------------------------- 210.0/210.0 kB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch-cluster
  Downloading torch_cluster-1.6.3.tar.gz (54 kB)
     ---------------------------------------- 0.0/54.5 kB ? eta -:--:--
     ---------------------------------------- 54.5/54.5 kB 2.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch-spline-conv
  Downloading torch_spline_conv-1.2.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished 

In [5]:
pip install scikit-learn matplotlib pandas pyarrow networkx matplotlib  


Note: you may need to restart the kernel to use updated packages.



# 1. OVERVIEW 
## Graph Construction:

Nodes: represent customers, financial products

Edges: represent relationships or interactions between nodes, such as purchase transactions, 

Node Features: Customer demographics, purchase history, preferences, etc.

Edge Features: Transaction amounts, frequency of interactions, etc.

In previous noteboos of this repo (STEP 1), the graph was persisted in graphml format 

## Feature Engineering:

Utilize both node features and edge features to enrich the graph representation.
Apply normalization and encoding techniques to prepare the data for GNN processing.


## GNN Model Training:

Train a GNN to learn embeddings for nodes that capture both the node features and the graph structure.
Use these embeddings for clustering customers into segments.

## Clustering:

Apply clustering algorithms  on the learned node embeddings to identify distinct market segments.

# 2.  Load the graph

In [6]:
import networkx as nx
import matplotlib.pyplot as plt
G = nx.read_graphml("graph.graphml")

# 3. Feature Enginnering

As I allready have precomputed embeddings from Node2Vec, I will include these embeddings as additional features in my  GNN model. This approach leverages the structural information captured by Node2Vec and combines it with the GNN's ability to learn from node features and graph topology.

x (Node Features):

x is a matrix where each row corresponds to the feature vector of a node in the graph.
The shape of x is [num_nodes, num_node_features].
edge_index (Edge Index):

edge_index is a tensor that represents the edges of the graph.
It is a 2D tensor of shape [2, num_edges], where each column represents an edge. The first row contains the source nodes, and the second row contains the target nodes.
edge_attr (Edge Features):

edge_attr is an optional tensor that contains the features of the edges.
The shape of edge_attr is [num_edges, num_edge_features].

In [98]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch_geometric.data import Data
import numpy as np
import torch
from torch_geometric.loader import NeighborLoader

from gensim.models import KeyedVectors


# Load embeddings from file c reated by node2vec
embeddings = KeyedVectors.load("node2vec_embeddings.kv")


# Extract node IDs and features
node_ids = []
node_types = []

for node in G.nodes(data=True):
    node_ids.append(str(node[0]))  
    node_types.append(node[1]['Node_Type'])

node2vec_embeddings = np.array([embeddings[node] for node in node_ids])

# One-hot encode the categorical node features
encoder = OneHotEncoder(sparse_output=False)
node_types_encoded = encoder.fit_transform(np.array(node_types).reshape(-1, 1))

# Create tensor node_features with node2vec embeddings and one-hot encoded node types
node_features = np.hstack([node2vec_embeddings, node_types_encoded])
node_features = torch.tensor(node_features, dtype=torch.float)
node_features

# Step 3: Create edge index and edge features
# Extract node IDs and create a mapping to integer indices
node_ids = list(G.nodes())
node_id_map = {node_id: i for i, node_id in enumerate(node_ids)}

edge_index = []
edge_features = []

for edge in G.edges(data=True):
    source, target = edge[0], edge[1]
    edge_index.append([node_id_map[source], node_id_map[target]])
    edge_features.append([edge[2]['Frequency'], edge[2]['Total_Amount'], edge[2]['Average_Amount']])

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Normalize edge features before creating tensor because they have different scales
edge_features = np.array(edge_features)
scaler = StandardScaler()
edge_features_normalized = scaler.fit_transform(edge_features)
# Calculate the average of the normalized frequency and total_amount
edge_weights = np.mean(edge_features_normalized[:, :2], axis=1)  # Averaging frequency and total_amount
edge_features_normalized = torch.tensor(edge_features_normalized, dtype=torch.float)

# Ensure there are no NaN values in edge weights
edge_weights = np.nan_to_num(edge_weights)
# Convert to tensor
edge_weights = torch.tensor(edge_weights, dtype=torch.float)

# Check for NaNs in node features
if torch.isnan(node_features).any():
    print("NaN values found in node features")
if torch.isnan(edge_weights).any():
    print("NaN values found in edge weights")
# Data object with node features, edge index and edge features
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_features_normalized, edge_weight=edge_weights)
#It is not necessary to normalize the node features as  the embeddings are already normalized. =torch.float)  # Apply normalization back

# Create DataLoader for batch processing
loader = NeighborLoader(data, num_neighbors=[15, 10], batch_size=1024, shuffle=True)

data
edge_weights

tensor([ 0.2442, -0.1772, -0.2115,  ..., -0.2907, -0.1227, -0.1797])

# 4. GNN Architecture

## 4.1 Define reconstruction_loss



In [97]:
import torch
import torch.nn.functional as F


def reconstruction_loss(adj_pred, edge_index, num_nodes):
    # Create the actual adjacency matrix
    adj_true = torch.zeros((num_nodes, num_nodes), device=adj_pred.device)
    adj_true[edge_index[0], edge_index[1]] = 1.0

    # Compute the reconstruction loss
    loss = F.binary_cross_entropy(adj_pred.view(-1), adj_true.view(-1))
    return loss


## 4.2 Train GNN

In [101]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def encode(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

    def decode(self, z, edge_index):
        row, col = edge_index
        return torch.sigmoid((z[row] * z[col]).sum(dim=1))

    def forward(self, data):
        z = self.encode(data)
        adj_pred = self.decode(z, data.edge_index)
        return adj_pred
    
# Define the input, hidden and output dimensions
input_dim = node_features.shape[1]  # The number of features per node
hidden_dim = 128  # Number of hidden units in the first GCN layer
output_dim = 64  # Number of output units in the second GCN layer

# Create the model
model = GCN(input_dim, hidden_dim, output_dim)

# Generate pairs
num_nodes = node_features.shape[0]

# Training the GNN model
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_values = []
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        adj_pred = model(batch)
        loss = reconstruction_loss(adj_pred, batch.edge_index, batch.num_nodes)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(loader)
    loss_values.append(avg_loss)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

print("Training complete.")

# Plot the learning curve
plt.plot(loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Learning Curve')
plt.show()

ValueError: Using a target size (torch.Size([1073296])) that is different to the input size (torch.Size([15])) is deprecated. Please ensure they have the same size.