In [1]:
import pandas as pd
from IPython.display import display
# Ensure other common libraries like numpy, matplotlib, seaborn are imported
# in a common cell at the beginning of the notebook, as per previous discussions.
# For NetworkX, it will be imported when first used.

# Define the path to our custom transaction dataset
TRANSACTION_DATA_PATH = "fraud_transactions.csv" 

try:
    transactions_df = pd.read_csv(TRANSACTION_DATA_PATH)
    print(f"'{TRANSACTION_DATA_PATH}' loaded successfully.")
    print("--------------------------------------------------")
    print("First few rows of the transaction dataset:")
    display(transactions_df.head()) # Display more rows if needed, e.g., head(10)
    print("\n--------------------------------------------------")
    print(f"Dataset Shape: {transactions_df.shape}")
    print("--------------------------------------------------")
    # Basic info about data types and missing values
    print("Dataset Info:")
    transactions_df.info()
    print("--------------------------------------------------")
    if 'is_fraud' in transactions_df.columns:
        print("Distribution of 'is_fraud' label:")
        display(transactions_df['is_fraud'].value_counts(normalize=True).reset_index().rename(
            columns={'index': 'Is_Fraud', 'is_fraud': 'Proportion'}
        ))
    else:
        print("Note: 'is_fraud' column not found. Analysis will proceed without it for now.")
    print("--------------------------------------------------")
    
except FileNotFoundError:
    print(f"Error: '{TRANSACTION_DATA_PATH}' was not found.")
    print("Please ensure you have this file from the book's GitHub repository (chapter09/data folder)")
    print("and that it is in the correct path relative to your Jupyter Notebook.")
    transactions_df = None # Set to None if loading fails

'fraud_transactions.csv' loaded successfully.
--------------------------------------------------
First few rows of the transaction dataset:


Unnamed: 0,transaction_id,user_id,transaction_time,ip,device_id,phone_number,credit_card_number,order_item,amount,is_fraud
0,05987d33-9bcd-4455-9d7c-2f043b007b10,0,2024-05-07,219.20.206.172,8efe3214-ea7f-47ef-bb30-716e28f02600,(527)638-2221x33874,4908554227138584,Accessories,201631,False
1,013414be-3e37-48fb-aab2-5f42a4933db8,0,2024-05-09,219.20.206.172,8efe3214-ea7f-47ef-bb30-716e28f02600,(527)638-2221x33874,4908554227138584,Kids Products,281063,False
2,c6d1eed6-2c40-44f0-8a6f-fec0f8f7a83b,0,2024-05-12,219.20.206.172,8efe3214-ea7f-47ef-bb30-716e28f02600,(527)638-2221x33874,4908554227138584,Cosmetics,279119,False
3,90321501-3b4f-4cd3-92e9-ae1531b6b705,0,2024-05-01,219.20.206.172,8efe3214-ea7f-47ef-bb30-716e28f02600,(527)638-2221x33874,4908554227138584,Accessories,253019,False
4,2169fe4b-772a-45ef-9478-dbc48ee0e331,0,2024-05-09,219.20.206.172,8efe3214-ea7f-47ef-bb30-716e28f02600,(527)638-2221x33874,4908554227138584,Food,170733,False



--------------------------------------------------
Dataset Shape: (14813, 10)
--------------------------------------------------
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14813 entries, 0 to 14812
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   transaction_id      14813 non-null  object
 1   user_id             14813 non-null  int64 
 2   transaction_time    14813 non-null  object
 3   ip                  14813 non-null  object
 4   device_id           14813 non-null  object
 5   phone_number        14813 non-null  object
 6   credit_card_number  14813 non-null  int64 
 7   order_item          14813 non-null  object
 8   amount              14813 non-null  int64 
 9   is_fraud            14813 non-null  bool  
dtypes: bool(1), int64(3), object(6)
memory usage: 1.0+ MB
--------------------------------------------------
Distribution of 'is_fraud' label:


Unnamed: 0,Proportion,proportion
0,False,0.982245
1,True,0.017755


--------------------------------------------------


In [16]:
# Optimized GNN Demo - Fast Graph Generation
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix

# Simple GNN Model
class FraudGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=32):
        super(FraudGNN, self).__init__()
        self.conv1 = nn.Linear(input_dim, hidden_dim)
        self.conv2 = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, 2)  # Normal/Fraud
        
    def forward(self, x, adj):
        # First GNN layer: Aggregate neighbor information
        h1 = torch.relu(self.conv1(x))
        h1_agg = torch.mm(adj, h1)  # Average neighbors' information
        h1 = h1 + h1_agg
        
        # Second GNN layer
        h2 = torch.relu(self.conv2(h1))
        h2_agg = torch.mm(adj, h2)
        h2 = h2 + h2_agg
        
        # Classification
        out = self.classifier(h2)
        return out

def create_fast_graph(df, max_connections=50):
    """Fast graph generation using vectorized operations"""
    print("   📊 Creating node features...")
    n = len(df)
    
    # Node features: [amount, hour, weekday] - vectorized
    times = pd.to_datetime(df['transaction_time'])
    features = np.column_stack([
        df['amount'].values / 1000,
        times.dt.hour / 24,
        times.dt.weekday / 7
    ])
    
    print("   🔗 Computing connections...")
    # Efficient adjacency matrix generation
    row_indices = []
    col_indices = []
    
    # Connect transactions with same user_id
    for user_id in df['user_id'].unique():
        user_mask = df['user_id'] == user_id
        user_indices = np.where(user_mask)[0]
        
        if len(user_indices) > 1:
            # Generate all combinations (excluding self)
            for i in user_indices:
                for j in user_indices:
                    if i != j:
                        row_indices.append(i)
                        col_indices.append(j)
    
    # Connect transactions with same device_id
    for device_id in df['device_id'].unique():
        device_mask = df['device_id'] == device_id
        device_indices = np.where(device_mask)[0]
        
        if len(device_indices) > 1:
            for i in device_indices:
                for j in device_indices:
                    if i != j:
                        row_indices.append(i)
                        col_indices.append(j)
    
    print("   ⚡ Creating sparse matrix...")
    # Create sparse matrix (memory efficient)
    if row_indices:
        data = np.ones(len(row_indices))
        adj_sparse = csr_matrix((data, (row_indices, col_indices)), shape=(n, n))
        
        # Normalization
        row_sum = np.array(adj_sparse.sum(axis=1)).flatten()
        row_sum[row_sum == 0] = 1
        adj_sparse = adj_sparse.multiply(1.0 / row_sum.reshape(-1, 1))
        
        # Convert to dense matrix (only for small data)
        if n < 1000:
            adj_matrix = adj_sparse.toarray()
        else:
            # Keep sparse matrix for large data
            adj_matrix = adj_sparse
    else:
        adj_matrix = np.eye(n)  # Identity matrix if no connections
    
    return features, adj_matrix

def run_fast_gnn_demo():
    """Run optimized GNN demo"""
    print("🚀 Fast GNN Fraud Detection Demo")
    print("=" * 35)
    
    if 'transactions_df' not in globals():
        print("❌ transactions_df not found!")
        return
    
    df = transactions_df
    print(f"📊 Total data: {len(df)} transactions")
    
    # 1. Use full dataset (no sampling)
    print("📊 Using full dataset")
    features, adj_matrix = create_fast_graph(df)
    
    print(f"   Final nodes: {len(features)}")
    
    # Connection information
    if hasattr(adj_matrix, 'nnz'):  # Sparse matrix
        print(f"   Connections: {adj_matrix.nnz}")
        print(f"   Avg neighbors: {adj_matrix.nnz / len(features):.1f}")
        # Convert sparse to dense matrix
        adj_matrix = adj_matrix.toarray()
    else:
        print(f"   Connections: {(adj_matrix > 0).sum()}")
        print(f"   Avg neighbors: {(adj_matrix > 0).sum(axis=1).mean():.1f}")
    
    # 2. Convert to tensors
    X = torch.FloatTensor(features)
    A = torch.FloatTensor(adj_matrix)
    
    # 3. Create model
    model = FraudGNN(input_dim=3, hidden_dim=16)
    
    # 4. Train if labels exist, otherwise inference only
    if 'is_fraud' in df.columns:
        print("\n🎯 Training GNN...")
        y = torch.LongTensor(df['is_fraud'].values)
        
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
        criterion = nn.CrossEntropyLoss()
        
        # Improved training (more epochs and learning rate scheduling)
        best_loss = float('inf')
        patience = 0
        
        for epoch in range(100):
            optimizer.zero_grad()
            outputs = model(X, A)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            if epoch % 20 == 0:
                acc = (outputs.argmax(1) == y).float().mean()
                print(f"   Epoch {epoch}: Loss={loss:.3f}, Acc={acc:.3f}")
                
                # Early stopping check
                if loss < best_loss:
                    best_loss = loss
                    patience = 0
                else:
                    patience += 1
                    if patience > 3 and epoch > 40:
                        print(f"   Early stopping at epoch {epoch}")
                        break
    
    else:
        print("\n🔮 GNN inference mode...")
    
    # 5. Results analysis
    model.eval()
    with torch.no_grad():
        outputs = model(X, A)
        fraud_probs = F.softmax(outputs, dim=1)[:, 1]
    
    # Top suspicious transactions (TOP 5)
    top_indices = fraud_probs.argsort(descending=True)[:5]
    print(f"\n🚨 Most Suspicious Transactions TOP 5:")
    for i, idx in enumerate(top_indices):
        idx = int(idx)  # Convert tensor index to integer
        row = df.iloc[idx]
        prob = fraud_probs[idx]
        actual = f" (Actual: {'Fraud' if 'is_fraud' in df.columns and row['is_fraud'] else 'Normal'})" if 'is_fraud' in df.columns else ""
        print(f"   {i+1}. User:{row['user_id']}, Amount:{row['amount']}, Prob:{prob:.3f}{actual}")
    
    print(f"\n✅ Optimized GNN analysis complete!")
    
    return model, fraud_probs

# Run
run_fast_gnn_demo()

🚀 Fast GNN Fraud Detection Demo
📊 Total data: 14813 transactions
📊 Using full dataset
   📊 Creating node features...
   🔗 Computing connections...
   ⚡ Creating sparse matrix...
   Final nodes: 14813
   Connections: 60142
   Avg neighbors: 4.1

🎯 Training GNN...
   Epoch 0: Loss=1.111, Acc=0.982
   Epoch 20: Loss=0.110, Acc=0.982
   Epoch 40: Loss=0.106, Acc=0.982
   Epoch 60: Loss=0.106, Acc=0.982
   Epoch 80: Loss=0.098, Acc=0.982

🚨 Most Suspicious Transactions TOP 5:
   1. User:205, Amount:12168, Prob:0.492 (Actual: Fraud)
   2. User:1550, Amount:19086, Prob:0.482 (Actual: Fraud)
   3. User:163, Amount:57917, Prob:0.424 (Actual: Fraud)
   4. User:2498, Amount:65591, Prob:0.414 (Actual: Fraud)
   5. User:1364, Amount:134086, Prob:0.311 (Actual: Fraud)

✅ Optimized GNN analysis complete!


(FraudGNN(
   (conv1): Linear(in_features=3, out_features=16, bias=True)
   (conv2): Linear(in_features=16, out_features=16, bias=True)
   (classifier): Linear(in_features=16, out_features=2, bias=True)
 ),
 tensor([0.0022, 0.0016, 0.0016,  ..., 0.0173, 0.0249, 0.0127]))