Implementing a feedforward neural network (FNN) as a first step

In [3]:
from d2c.descriptors import D2C, DataLoader
from d2c.descriptors.utils import return_mb
import numpy as np

from tqdm import tqdm

In [4]:
dataloader = DataLoader()
dataloader.from_pickle('../example/synthetic_data.pkl')
observations = dataloader.get_observations()
dags = dataloader.get_dags()


In [5]:
len(observations)

7000

In [6]:
def pad_mb(before_padding_MB, size = 3):
    if before_padding_MB.shape[1] < size:
        padding = np.zeros((before_padding_MB.shape[0], size - before_padding_MB.shape[1]))
        before_padding_MB = np.hstack([before_padding_MB, padding])
    
    if before_padding_MB.shape[1] > size:
        before_padding_MB = before_padding_MB[:, :3]
    
    return before_padding_MB

In [7]:
def prepare_data_for_couple(observations, dag, source_name, target_name, is_causal):
    
    # Estimate Markov blankets
    MBca = return_mb(dag, source_name)
    MBef = return_mb(dag, target_name)
    
    padded_MBca = pad_mb(observations[MBca].values)
    padded_MBef = pad_mb(observations[MBef].values)

    # Gather features
    features = np.hstack([
        observations[source_name].values.reshape(-1,1) ,        # Observations of X
        observations[target_name].values.reshape(-1,1) ,        # Observations of Y
        padded_MBca,  # Observations of MB of X
        padded_MBef   # Observations of MB of Y
    ])
    
    # Create a dictionary for the sample
    sample = {
        'features': features,
        'is_causal': is_causal,
        'edge_source': source_name,
        'edge_dest': target_name
    }
    
    return sample

In [8]:
# from multiprocessing import Pool, cpu_count

# def process_dag(index):
#     current_dag = dags[index]
#     current_observations = observations[index]
#     local_samples = []
#     for edge in current_dag.edges:
#         s1 = prepare_data_for_couple(current_observations, current_dag, edge[0], edge[1], 1)
#         s2 = prepare_data_for_couple(current_observations, current_dag, edge[1], edge[0], 0)
#         local_samples.append(s1)
#         local_samples.append(s2)
#     return local_samples

# if __name__ == '__main__':
#     with Pool(cpu_count()) as pool:
#         results = pool.map(process_dag, range(len(dags)))
    
#     # Flatten the list of lists
#     samples = [item for sublist in results for item in sublist]
from tqdm import tqdm
samples = []

for index in tqdm(range(len(dags))):
    current_dag = dags[index]
    current_observations = observations[index]
    for edge in current_dag.edges:
        s1 = prepare_data_for_couple(current_observations, current_dag, edge[0], edge[1], 1)
        s2 = prepare_data_for_couple(current_observations, current_dag, edge[1], edge[0], 0)
        samples.append(s1)
        samples.append(s2)
    

100%|██████████| 7000/7000 [00:26<00:00, 259.45it/s]


In [9]:
# pickle samples
import pickle
with open('samples.pkl', 'wb') as f:
    pickle.dump(samples, f)

# # Load samples
# with open('samples.pkl', 'rb') as f:
#     samples = pickle.load(f)

In [10]:
import torch
from torch.utils.data import Dataset

class CausalDataset(Dataset):
    def __init__(self, samples):
        self.features = np.array([sample['features'] for sample in samples])
        self.labels = np.array([sample['is_causal'] for sample in samples])
        self.features = torch.tensor(self.features, dtype=torch.float32)
        self.labels = torch.tensor(self.labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [11]:
import torch.nn as nn

class CausalInferenceNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_classes=2):
        super(CausalInferenceNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim // 2, num_classes)
    
    def forward(self, x):

        x = x.view(x.size(0), -1)
        
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return x


In [12]:
from torch.utils.data import DataLoader, random_split

dataset = CausalDataset(samples)


In [43]:
len(train_dataset), len(val_dataset)

(57619, 14405)

In [13]:

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [14]:
dataset.features.shape

torch.Size([72024, 250, 8])

In [15]:
input_dim = dataset.features.shape[1]*dataset.features.shape[2]
model = CausalInferenceNN(input_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [16]:
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for features, labels in loader:
        features, labels = features.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(loader)


In [17]:
def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
    accuracy = correct / len(loader.dataset)
    return total_loss / len(loader), accuracy


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 20

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_accuracy = validate(model, val_loader, criterion, device)
    
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


Epoch 1/20
Train Loss: 1727232024836.2031
Validation Loss: 1657676313013.0603, Validation Accuracy: 0.8177
Epoch 2/20
Train Loss: 552210291922.3967
Validation Loss: 110159164826.4803, Validation Accuracy: 0.7915
Epoch 3/20
Train Loss: 1162573609672.7090
Validation Loss: 2848870140371.1919, Validation Accuracy: 0.8128
Epoch 4/20
Train Loss: 843222765007.9503
Validation Loss: 606193893982.1510, Validation Accuracy: 0.8078
Epoch 5/20
Train Loss: 196987102357.6695
Validation Loss: 216572935557.6734, Validation Accuracy: 0.8119
Epoch 6/20
Train Loss: 257478005058.0508
Validation Loss: 483694109393.2570, Validation Accuracy: 0.8108
Epoch 7/20
Train Loss: 133439017925.0666
Validation Loss: 277878479259.3223, Validation Accuracy: 0.8131
Epoch 8/20
Train Loss: 78694357053.2335
Validation Loss: 33309115956.4907, Validation Accuracy: 0.8085
Epoch 9/20
Train Loss: 3383004143.1971
Validation Loss: 122329813586.6343, Validation Accuracy: 0.8154
Epoch 10/20
Train Loss: 34625329722.4618
Validation Los

In [46]:
import torch.nn.functional as F

def test(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    all_predictions = []
    all_probabilities = []
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            
            # Compute probabilities using softmax
            probabilities = F.softmax(outputs, dim=1)
            all_probabilities.extend(probabilities.cpu().numpy())
            
            # Get predicted labels
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')
    
    return all_predictions, all_probabilities


In [27]:
x = np.array(val_dataset)
unique, counts = np.unique(x, return_counts=True)
print(np.asarray((unique, counts)).T)

[[    0 36012]
 [    1 36012]]


In [48]:
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
predictions, probabilities = test(model, test_loader, device)


Test Accuracy: 0.8314


In [49]:
probabilities

[array([-0.06711925, -0.07848471], dtype=float32),
 array([ 8.2306795, -1.1443149], dtype=float32),
 array([-8.81163 ,  7.511959], dtype=float32),
 array([0.13241565, 0.7659145 ], dtype=float32),
 array([-11407.085,  10077.365], dtype=float32),
 array([-0.0317541 ,  0.15514383], dtype=float32),
 array([32.69563   ,  0.44536906], dtype=float32),
 array([1.2359437, 0.7110414], dtype=float32),
 array([-0.32865036,  0.1479882 ], dtype=float32),
 array([4.700784  , 0.04450328], dtype=float32),
 array([ 8.542162  , -0.27089012], dtype=float32),
 array([-16.10503  ,  13.8095455], dtype=float32),
 array([-0.64252615,  1.067321  ], dtype=float32),
 array([-282.8842 ,  236.61456], dtype=float32),
 array([1.4413583 , 0.51307917], dtype=float32),
 array([-5.5171847,  4.663129 ], dtype=float32),
 array([-0.9254652 ,  0.78765225], dtype=float32),
 array([-0.38893798,  0.20019427], dtype=float32),
 array([8.749686  , 0.29989123], dtype=float32),
 array([-1.9512722,  3.0460813], dtype=float32),
 array

In [28]:
x = np.array(predictions)
unique, counts = np.unique(x, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0 7535]
 [   1 6870]]


In [45]:
val_labels = []

# Iterate over the validation loader
for batch in val_loader:
    _, labels = batch  # Since your __getitem__ method in CausalDataset returns features, labels
    val_labels.append(labels)

# Concatenate all validation labels into a single tensor
val_labels = torch.cat(val_labels, dim=0)


y_true = val_labels.numpy()
y_hat = np.array(predictions)

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

confusion = confusion_matrix(y_true, y_hat)

f1 = f1_score(y_true, y_hat)

precision = precision_score(y_true, y_hat)

recall = recall_score(y_true, y_hat)

print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print('Confusion Matrix:')
print(confusion)

F1 Score: 0.8274
Precision: 0.8473
Recall: 0.8084
Confusion Matrix:
[[6155 1049]
 [1380 5821]]
