<a href="https://colab.research.google.com/github/davidelgas/DataSciencePortfolio/blob/main/Neural%20Network%20with%20Knowledge%20Graph/KG_DNN_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

In this notebook I will be creating a small NN and a knowledge graph with an open source e-Commerce dataset.


My use case:<br>
A language interface (chatbot or search engine) that answers user queries.<br>
A knowledge graph to structure relationships between products, conversations, and recommendations.<br>
A PyTorch-based neural network for embeddings, retrieval, or ranking.<br>
Retrieval-Augmented Generation (RAG) to enhance responses with knowledge graph lookups.<br>



In [2]:
# Access to Google Drive
# This seems to propagate credentials better from its own cell

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Workfow


1.   Load dataset
2.   Preprocess and Clean the Data
3.   Convert Data into a Knowledge Graph Structure
4.   Generate Knowledge Graph Embeddings
5.   Build a Graph-Based Recommendation Model
6.   Integrate Conversational AI (Natural Language Model)
7.   Train & Optimize the Full System
8.   Implement Real-Time Inference for Recommendations
9.   Deploy as an Interactive API or Chatbot
10.  Apppendx of code and notes




# Load dataset

In [4]:
import pandas as pd
import numpy as np

# I tried the Grocerie dataset, but dial_gt_context_train is corrupt.

dial_gt_context_test = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_gt_context_test.npz", allow_pickle=True)

dial_gt_context_train = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_gt_context_train.npz", allow_pickle=True)

dial_gt_context_val = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_gt_context_val.npz", allow_pickle=True)

dial_utter_resp_test = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_utter_resp_test.npz", allow_pickle=True)

dial_utter_resp_train = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_utter_resp_train.npz", allow_pickle=True)

dial_utter_resp_val = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_utter_resp_val.npz", allow_pickle=True)

dial_word_embed = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/dial_word_embed.npz", allow_pickle=True)

rec_test_candidate100 = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/rec_test_candidate100.npz", allow_pickle=True)

rec_val_candidate100 = np.load("/content/drive/MyDrive/Colab Notebooks/Datasets/rec_val_candidate100.npz", allow_pickle=True)


# /content/vocab_and_embeddings.pkl

In [5]:
files = {
    "dial_gt_context_test": dial_gt_context_test,
    "dial_gt_context_train": dial_gt_context_train,
    "dial_gt_context_val": dial_gt_context_val,
    "dial_utter_resp_test": dial_utter_resp_test,
    "dial_utter_resp_train": dial_utter_resp_train,
    "dial_utter_resp_val": dial_utter_resp_val,
    "dial_word_embed": dial_word_embed,
    "rec_test_candidate100": rec_test_candidate100,
    "rec_val_candidate100": rec_val_candidate100
}


def look_at_files(files):
    for name, data in files.items():  # Loop through dictionary items
        print(f"\nInspecting {name}")
        print(f"Stored arrays: {data.files}")  # List arrays in each NPZ file

        for key in data.files:
            array = data[key]
            print(f"Array Name: {key}")
            print(f"   Type: {type(array)}")
            print(f"   Shape: {array.shape}")
            print(f"   Data Type: {array.dtype}")

            if array.dtype.names:  # This checks for structured NumPy arrays with named columns
                print(f"   Columns: {array.dtype.names}")
            print("-" * 50)  # Separator

look_at_files(files)



Inspecting dial_gt_context_test
Stored arrays: ['utter_gt', 'utter_context', 'resp_gt', 'resp_contexst', 'dial_length']
Array Name: utter_gt
   Type: <class 'numpy.ndarray'>
   Shape: (127712, 10, 1)
   Data Type: int64
--------------------------------------------------
Array Name: utter_context
   Type: <class 'numpy.ndarray'>
   Shape: (127712, 10, 24)
   Data Type: int64
--------------------------------------------------
Array Name: resp_gt
   Type: <class 'numpy.ndarray'>
   Shape: (1277120, 1)
   Data Type: int64
--------------------------------------------------
Array Name: resp_contexst
   Type: <class 'numpy.ndarray'>
   Shape: (1277120, 24)
   Data Type: int64
--------------------------------------------------
Array Name: dial_length
   Type: <class 'numpy.ndarray'>
   Shape: (127712,)
   Data Type: int64
--------------------------------------------------

Inspecting dial_gt_context_train
Stored arrays: ['utter_gt', 'utter_context', 'resp_gt', 'resp_context']
Array Name: utte

# Preprocess and Clean the Data

In [6]:
# Expose the arrays from each NPZ

import numpy as np

extracted_arrays = {}

# Iterate through each NPZ file and extract its arrays
for file_name, npz_obj in files.items():
    extracted_arrays[file_name] = {}
    for array_name in npz_obj.files:
        extracted_arrays[file_name][array_name] = npz_obj[array_name]
        print(f"Extracted {array_name} from {file_name}")


Extracted utter_gt from dial_gt_context_test
Extracted utter_context from dial_gt_context_test
Extracted resp_gt from dial_gt_context_test
Extracted resp_contexst from dial_gt_context_test
Extracted dial_length from dial_gt_context_test
Extracted utter_gt from dial_gt_context_train
Extracted utter_context from dial_gt_context_train
Extracted resp_gt from dial_gt_context_train
Extracted resp_context from dial_gt_context_train
Extracted utter_gt from dial_gt_context_val
Extracted utter_context from dial_gt_context_val
Extracted resp_gt from dial_gt_context_val
Extracted resp_context from dial_gt_context_val
Extracted utterance from dial_utter_resp_test
Extracted response from dial_utter_resp_test
Extracted label from dial_utter_resp_test
Extracted utterance from dial_utter_resp_train
Extracted response from dial_utter_resp_train
Extracted label from dial_utter_resp_train
Extracted utterance from dial_utter_resp_val
Extracted response from dial_utter_resp_val
Extracted label from dial_utt

In [7]:
# Check shapes and sample data for conversation context
print("utter_gt_train shape:", extracted_arrays["dial_gt_context_train"]["utter_gt"].shape)
print("Sample:", extracted_arrays["dial_gt_context_train"]["utter_gt"][:2])

print("resp_gt_train shape:", extracted_arrays["dial_gt_context_train"]["resp_gt"].shape)
print("Sample:", extracted_arrays["dial_gt_context_train"]["resp_gt"][:2])

print("utter_context_train shape:", extracted_arrays["dial_gt_context_train"]["utter_context"].shape)
print("Sample:", extracted_arrays["dial_gt_context_train"]["utter_context"][:2])

print("resp_context_train shape:", extracted_arrays["dial_gt_context_train"]["resp_context"].shape)
print("Sample:", extracted_arrays["dial_gt_context_train"]["resp_context"][:2])


utter_gt_train shape: (361590, 10, 1)
Sample: [[[169331]
  [169331]
  [169331]
  [169331]
  [169331]
  [169331]
  [135181]
  [169331]
  [128901]
  [169331]]

 [[139397]
  [169331]
  [168871]
  [169331]
  [163173]
  [169331]
  [160426]
  [169331]
  [115820]
  [169331]]]
resp_gt_train shape: (723180, 1)
Sample: [[155243]
 [153058]]
utter_context_train shape: (361590, 10, 24)
Sample: [[[169331     45 169331 169331     45 169331 169331     45 169331 169331
       45 169331 169331     45 169331 169331     45 169331 169331     45
   169331 169331     45 169331]
  [169331     45 169331 169331     45 169331 169331     45 169331 169331
       45 169331 169331     45 169331 169331     45 169331 169331     45
   169331 169331     45 169331]
  [169331     45 169331 169331     45 169331 169331     45 169331 169331
       45 169331 169331     45 169331 169331     45 169331 169331     45
   169331 169331     45 169331]
  [169331     45 169331 169331     45 169331 169331     45 169331 169331
       45

Well, these have already been encoded. I'll proceed with them as is, but then start with raw text, convert to tensors, encode, etc. afterwards.

In [12]:
word_dict = {}

with open("/content/drive/MyDrive/Colab Notebooks/Datasets/word_dict.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()  # Remove any leading or trailing spaces
        parts = line.rsplit(" ", 1)  # Split at the LAST space

        if len(parts) == 2 and parts[1].isdigit():  # Ensure the last part is an index
            word, idx = parts
            word_dict[int(idx)] = word  # Store as {index: word}

print(list(word_dict.items())[:10])

[(786, 'militaryshield'), (3464, '6.6ft'), (8135, 'circuitri'), (12798, 'c-pattern-4'), (15378, 'zedari'), (7476, '20000mah-black'), (16861, 'rvh-ac03'), (3592, 'hanlesi'), (709, 'yellow'), (5354, 'interchang')]


In [14]:
# Are these tripples ?
# Extract a small sample
utter_gt_sample = extracted_arrays["dial_gt_context_train"]["utter_gt"][:2]
resp_gt_sample = extracted_arrays["dial_gt_context_train"]["resp_gt"][:2]
utter_context_sample = extracted_arrays["dial_gt_context_train"]["utter_context"][:2]

# Decode using the word dictionary
decoded_utterances = [
    [word_dict.get(int(token[0]), "[UNK]") for token in utterance] for utterance in utter_gt_sample
]
decoded_responses = [
    [word_dict.get(int(token[0]), "[UNK]") for token in response] for response in resp_gt_sample
]
decoded_contexts = [
    [[word_dict.get(int(token), "[UNK]") for token in context] for context in utterance]
    for utterance in utter_context_sample
]

# Print a few examples
print("Decoded Utterances:", decoded_utterances)
print("Decoded Responses:", decoded_responses)
print("Decoded Contexts:", decoded_contexts[:1])  # Print only one for readability


IndexError: invalid index to scalar variable.

# Convert Data into a Knowledge Graph Structure

# Generate Knowledge Graph Embeddings

# Build a Graph-Based Recommendation Model

# Integrate Conversational AI (Natural Language Model)

# Train & Optimize the Full System

# Implement Real-Time Inference for Recommendations

# Deploy as an Interactive API or Chatbot

# Appendix

## Scoring Criteria for Selecting an Encoder


| **Factor**                 | **Description** |
|---------------------------|----------------|
| **Computational Efficiency** | How fast is the encoding on CPU/GPU? |
| **Memory Usage**          | How much memory does it require? |
| **Scalability**           | Can it handle large datasets like OpenBG500? |
| **Preserves Semantic Meaning** | Does the encoding capture relationships between entities? |
| **Compatibility with PyTorch** | How well does it integrate into PyTorch models? |
| **Ease of Implementation** | How difficult is it to set up? |

Each encoding method gets a **score from 1 to 5** for each factor.

---

## Scoring Different Encoding Methods

| Encoding Method  | Computational Efficiency | Memory Usage | Scalability | Semantic Meaning | PyTorch Compatibility | Ease of Implementation | **Total Score** |
|-----------------|------------------------|--------------|-------------|------------------|----------------------|--------------------|--------------|
| **Label Encoding** (Integer Mapping) | **5** (Very fast) | **5** (Very low) | **5** (Handles millions of nodes) | **1** (No meaning captured) | **5** (PyTorch works with integers easily) | **5** (Simple `map()`) | **26** |
| **One-Hot Encoding** | **2** (Slow for large datasets) | **1** (Consumes huge memory) | **1** (Bad for large graphs) | **3** (Some structure captured) | **3** (Can be used, but not ideal) | **3** (Easy but inefficient) | **13** |
| **BERT Embeddings** (Text-Based) | **2** (Slow on CPU) | **3** (Moderate) | **3** (Can use pre-trained models) | **5** (Captures meaning well) | **4** (PyTorch supports it, but needs preprocessing) | **2** (Requires NLP model) | **19** |
| **Word2Vec/FastText** | **3** (Faster than BERT) | **3** (Moderate) | **4** (Good for large datasets) | **4** (Captures word meaning) | **4** (PyTorch supports it) | **3** (Requires preprocessing) | **21** |
| **Knowledge Graph Embeddings (TransE, RotatE)** | **4** (Moderate) | **4** (Efficient for large graphs) | **5** (Scales well) | **5** (Captures graph meaning) | **5** (Designed for PyTorch models) | **3** (Requires model training) | **26** |



In [None]:
# Get all unique entities (from both head and tail)
all_entities = set(triples_df_train["head"]).union(set(triples_df_train["tail"]))

# Get all unique relations
all_relations = set(triples_df_train["relation"])

# Create mapping dictionaries
entity2id = {entity: idx for idx, entity in enumerate(all_entities)}
relation2id = {relation: idx for idx, relation in enumerate(all_relations)}

def encode_triples(df):
    df["head"] = df["head"].map(entity2id)
    df["relation"] = df["relation"].map(relation2id)
    df["tail"] = df["tail"].map(entity2id)
    return df

# Encode train, test, and validation sets
triples_df_train = encode_triples(triples_df_train)
triples_df_test = encode_triples(triples_df_test)
triples_df_val = encode_triples(triples_df_val)


In [None]:
import torch

# Convert to tensor format
train_tensor = torch.tensor(triples_df_train.values, dtype=torch.long)
test_tensor = torch.tensor(triples_df_test.values, dtype=torch.long)
val_tensor = torch.tensor(triples_df_val.values, dtype=torch.long)

In [None]:
import torch

# Check the shape of the tensors
print("Train Tensor Shape:", train_tensor.shape)
print("Test Tensor Shape:", test_tensor.shape)
print("Validation Tensor Shape:", val_tensor.shape)

# Access the first 5 samples
print("First 5 Training Samples:\n", train_tensor[:5])

# Get specific columns
heads = train_tensor[:, 0]  # Head entities
relations = train_tensor[:, 1]  # Relations
tails = train_tensor[:, 2]  # Tail entities

print("First 5 Head Entities:\n", heads[:5])
print("First 5 Relations:\n", relations[:5])
print("First 5 Tail Entities:\n", tails[:5])

# Perform simple operations
sum_tensor = heads + tails  # Example tensor addition
print("Sum of Head & Tail Entities:\n", sum_tensor[:5])

# Get unique values
unique_heads = torch.unique(heads)
print(f"Unique Head Entities Count: {unique_heads.shape[0]}")


Train Tensor Shape: torch.Size([1242550, 3])
Test Tensor Shape: torch.Size([5000, 3])
Validation Tensor Shape: torch.Size([5000, 3])
First 5 Training Samples:
 tensor([[158292,    282,  79197],
        [193190,    490, 184642],
        [243732,     56,  86323],
        [248311,    134,  78130],
        [ 34938,    253, 231834]])
First 5 Head Entities:
 tensor([158292, 193190, 243732, 248311,  34938])
First 5 Relations:
 tensor([282, 490,  56, 134, 253])
First 5 Tail Entities:
 tensor([ 79197, 184642,  86323,  78130, 231834])
Sum of Head & Tail Entities:
 tensor([237489, 377832, 330055, 326441, 266772])
Unique Head Entities Count: 116721


In [None]:
import torch

device = torch.device("cpu")  # Force CPU mode for now

print("Using Device:", device)


Using Device: cpu


In [None]:
import torch.nn as nn
import torch.optim as optim

# Define a simple MLP model
class SimpleMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)


# Three layer network
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set dimensions
input_dim = 3  # (head, relation, tail)
hidden_dim = 16
output_dim = 1  # Binary classification or regression

# Initialize model
model = SimpleMLP(input_dim, hidden_dim, output_dim).to(device)

# Define loss and optimizer
criterion = nn.MSELoss()  # Example: MSE loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Dummy training loop
for epoch in range(5):  # Short training example
    optimizer.zero_grad()
    outputs = model(train_tensor.float())  # Convert tensor to float for Linear layers
    loss = criterion(outputs, torch.rand_like(outputs))  # Dummy target values
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 1196703744.0
Epoch 2, Loss: 737774528.0
Epoch 3, Loss: 400586816.0
Epoch 4, Loss: 173411088.0
Epoch 5, Loss: 46432520.0
