### Discovering Causal Signals in Images

1. Train NCC classifier
    1. Create synthetic dataset with causal/anticausal label
    2. Train NCC classifier
2. Discovering causal signals in images
    1. Get image dataset
    2. Get bounding box of each category (train object detection model?)
    3. Compute feature score
    4. classify causal/anticausal//object/context

### 1. Train NCC classifier
#### 1. Create synthetic dataset

In [1]:
from sklearn.mixture import GaussianMixture
from scipy.interpolate import CubicHermiteSpline, PchipInterpolator, UnivariateSpline

def normalize(x):
    return (x-x.mean())/x.std()

In [2]:
n = 10000 # i=1,...n
m_i = 100 # j=1,...,m_i

In [3]:
# for reproducible results
np.random.seed(seed=0)

synthetic_data = np.empty((0,3))

## cause
# fix parameters for creating gaussian distributions
for i in tqdm(range(n)):
    m_i = 100

    k_i = np.random.randint(1, 5, size=1).item()
    r_i = np.random.uniform(0, 5, size=1)
    s_i = np.random.uniform(0, 5, size=1)
    # create gaussian distributions before mix
    mu = np.random.normal(0, r_i, size=k_i)
    sigma = np.absolute(np.random.normal(0, s_i, size=k_i))
    precision = np.reciprocal(sigma)
    mixture_coef = np.absolute(np.random.normal(0, 1, size=k_i))
    mixture_coef /= mixture_coef.sum()
    # mixture of Gaussians
    # sampling from mixture of Gaussians
    what_dist = np.random.choice(k_i, size=m_i, p=mixture_coef, replace=True)
    unique, counts = np.unique(what_dist, return_counts=True)
    x_ij = np.empty((0,))
    for idx,cnt in zip(unique, counts):
        x_ij = np.concatenate([x_ij, np.random.normal(mu[idx], sigma[idx], size=cnt)], axis=0)
    # GMM = GaussianMixture(
    #     n_components=k_i,
    #     weights_init=mixture_coef,
    #     means_init=mu,
    #     precisions_init=precision,
    #     random_state=0,
    # ).fit()
    # x_ij = GMM.sample(n_samples=m_i)
    
    ## mechanism
    # cubic Hermite spline with support [min-std,max+std]
    d_i = np.random.randint(4, 5, size=1).item()
    knots = np.random.normal(0, 1, size=d_i)
    knots2 = np.random.normal(0, 1, size=d_i)
    support = np.linspace(x_ij.min()-x_ij.std(), x_ij.max()+x_ij.std(), d_i)
    # f_i = CubicHermiteSpline(x_ij, y, dydx, axis)
    f_ij_spline = PchipInterpolator(sorted(knots), knots2)
    
    ## noise
    v_i = np.random.uniform(0, 5, size=1)
    e_ij = np.random.normal(0, v_i, size=m_i)
    knots = np.random.uniform(0, 5, size=d_i)
    v_ij_spline = UnivariateSpline(support, knots)
    v_ij = v_ij_spline(e_ij)  # value of smoothing spline with support
    hetero_noise = np.multiply(v_ij, e_ij)

    ## mechanism again
    f_ij = f_ij_spline(x_ij)
    f_ij = normalize(f_ij)
    
    ## noise effect y_ij
    y_ij = f_ij + hetero_noise
    y_ij = normalize(y_ij)
    
    ## sampling process (2n samples)
    index = np.ones((m_i,1), dtype=int)*i
    syn_data = np.concatenate([index, x_ij.reshape(-1,1), y_ij.reshape(-1,1)], axis=1)
    synthetic_data = np.concatenate([synthetic_data, syn_data])

100%|██████████| 10000/10000 [00:32<00:00, 304.11it/s]


In [12]:
print(synthetic_data.shape)
np.savetxt("data/synthetic_data.csv", synthetic_data, delimiter=",", fmt='%i,%2.8f,%2.8f')

(1000000, 3)


#### 2. Train NCC Classifier

- NCC: two embedding layers and two classification layers followed by a softmax output layer.
- each hidden layer is a composition of batch normalization, 100 hidden neurons, ReLU, 25% dropout.
- train for 10000 iterations using RMSProp with default params

In [1]:
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
batch_size = 32
num_iteration = 10000

In [3]:
class SyntheticDataset(object):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        X, Y = self.dataset[index,1], self.dataset[index,2]
        return ((torch.tensor((X,Y),dtype=float), 0), (torch.tensor((Y,X),dtype=float), 1))

    def __len__(self):
        return len(self.dataset)

In [4]:
synthetic_data = np.loadtxt('data/synthetic_data.csv', delimiter=',')
dataset = SyntheticDataset(synthetic_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [5]:
class NCC_Classifier(nn.Module):
    def __init__(self, num_classes=2, in_features=2, out_features=100):
        super(NCC_Classifier, self).__init__()
        
        self.emb_layers = nn.Sequential(*[
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(p=0.25)
        ])
        
        self.emb_layer2 = nn.Sequential(*[
            nn.Linear(out_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(p=0.25)
        ])
        
        self.classifier1 = nn.Sequential(*[
            nn.Linear(1, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(p=0.25)
        ])

        self.classifier2 = nn.Sequential(*[
            nn.Linear(out_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(p=0.25)
        ])

        self.classifier = nn.Linear(out_features, num_classes)
        
    def forward(self, x):
        x = self.emb_layer1(x)
        x = self.emb_layer2(x)
        x = x.mean()
        x = self.classifier1(x)
        x = self.classifier2(x)
        x = self.classifier(x)
        # softmax
        return x

In [6]:
device = 'cpu'
model = NCC_Classifier(in_features=2).to(device)
model.train()


NCC_Classifier(
  (emb_layer1): Sequential(
    (0): Linear(in_features=2, out_features=100, bias=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (emb_layer2): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (classifier1): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (classifier2): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (classifier): Linear(i

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters())
num_epoch = num_iteration // len(dataset)

for epoch in tqdm(range(2), ncols=80):
    train_loss, correct, total = 0., 0, 0

    for causal, anticausal in dataloader:
        inputs, NCC_label = causal
        N = NCC_label.size(0)
        
        optimizer.zero_grad()
        
        output = model(inputs.float())
        
        # loss = (1 - output1 + output2) / 2
        
        loss = criterion(output, NCC_label)
        loss.backward()
        optimizer.step()
        
        # metric
        train_loss += torch.sum(loss.detach()).cpu().item()
        
        _, preds = torch.max(output.data, 1)
        correct += preds.eq(NCC_label).sum().detach().cpu().data.numpy()
        total += N
        
    print('Epoch {}/{} TrainLoss {:.6f} TrainAcc {:.4f}'.format(epoch+1, num_epoch, train_loss, correct / total))

 50%|██████████████████████                      | 1/2 [04:11<04:11, 251.88s/it]

Epoch 1/10000 TrainLoss 0.633088 TrainAcc 1.0000


100%|████████████████████████████████████████████| 2/2 [08:49<00:00, 264.55s/it]

Epoch 2/10000 TrainLoss 0.000000 TrainAcc 1.0000





In [None]:
model.save()