In [1]:
import pickle 
import numpy 
import sys     
sys.path.append("../") # add parent directory to path
import preprocessing # this project
import typing 

In [2]:
# just load data so functions can get test inputs 
# no need to go into final code 

[X, Y] = preprocessing.main(
    use_feature_categories=["socio-demographics", "health", "social", "neighbourhood"],
    csv_file="../ML_social.csv", dump=False)

  df = pandas.read_csv(csv_file)
100%|██████████| 29648/29648 [00:07<00:00, 4025.47it/s]
100%|██████████| 26178/26178 [00:29<00:00, 875.60it/s]

(4, 18) (4, 4)





## prepare training data 

In [3]:
def slide_over_one_subject(subject_X, subject_Y, minimal_length):
    """Create samples from the data of one subject using a sliding window

    subject_X, subject_Y: 2D numpy array, rows for waves and columns for features
    subject_new_X, subject_new_Y: 3D numpy array, axis 0 for sliding window, axis 1 for waves, axis 2 for features

    """
    subject_new_X, subject_new_Y = [], []
    num_features = subject_X.shape[1]
    subject_X_windowed = numpy.lib.stride_tricks.sliding_window_view(
        subject_X, 
        window_shape=(minimal_length, num_features)
    ) 
    subject_new_X = subject_X_windowed[:, -1, :, :]

    num_targets = subject_Y.shape[1]
    subject_Y_windowed = numpy.lib.stride_tricks.sliding_window_view(
        subject_Y, 
        window_shape=(minimal_length, num_targets)
    ) 
    subject_new_Y = subject_Y_windowed[:, -1, :, :]

    return subject_new_X, subject_new_Y

def sampling_fixed_length(X, Y, minimal_length):
    new_X, new_Y = [], []
    for subject_X, subject_Y in zip(X, Y):
        if len(subject_Y)>= minimal_length:
            subject_new_X, subject_new_Y = slide_over_one_subject(subject_X, subject_Y, minimal_length=minimal_length)
            new_X += subject_new_X.tolist()
            new_Y += subject_new_Y.tolist()

    new_X, new_Y = numpy.array(new_X), numpy.array(new_Y)

    return new_X, new_Y 


In [4]:

# test the work on Y 
_, new_Y = sampling_fixed_length(X[:2], Y[:2], minimal_length=4) 
print (Y[:2])
print (new_Y)
print (new_Y[:, [0, -1], 1])
numpy.heaviside(new_Y[:, -1, 1]-new_Y[:, 0, 1], 0 )

[array([[0.44444444, 0.3       , 0.8       , 1.        ],
       [0.66666667, 0.6       , 0.8       , 1.        ],
       [0.40740741, 0.35      , 0.4       , 1.        ],
       [0.25925926, 0.2       , 0.2       , 1.        ]]), array([[0.66666667, 0.55      , 1.        , 1.        ],
       [0.59259259, 0.5       , 0.8       , 1.        ],
       [0.62962963, 0.5       , 1.        , 1.        ],
       [0.51851852, 0.4       , 0.8       , 1.        ],
       [0.62962963, 0.5       , 1.        , 1.        ]])]
[[[0.44444444 0.3        0.8        1.        ]
  [0.66666667 0.6        0.8        1.        ]
  [0.40740741 0.35       0.4        1.        ]
  [0.25925926 0.2        0.2        1.        ]]

 [[0.66666667 0.55       1.         1.        ]
  [0.59259259 0.5        0.8        1.        ]
  [0.62962963 0.5        1.         1.        ]
  [0.51851852 0.4        0.8        1.        ]]

 [[0.59259259 0.5        0.8        1.        ]
  [0.62962963 0.5        1.         1.        

array([0., 0., 0.])

In [5]:
def generate_labels(Y, target_index: int):
    """Extract the training label for a particular target dimension 

    Y: 3D ndarray, axis 0 is sample, axis 1 is wave, axis 2 is score 
    """
    # y_of_interest = Y[:, :, target_index] # 2D array 
    begin_score = Y[:, 0, target_index]
    end_score = Y[:, -1, target_index]

    labels = end_score - begin_score
    labels = numpy.heaviside(labels, 0)

    return labels.reshape((-1,1)) # 2d arrary, Nx1

# top level function for training data preparation 
# need to rerun for each target index rangeing from 0 to 2 
def prepare_training_data(X, Y, target_index: int, minimal_length):
    new_X, new_Y = sampling_fixed_length(X, Y, minimal_length=minimal_length) 
    new_Y = generate_labels(new_Y, target_index=target_index)
    return new_X, new_Y 


In [6]:
# Generate training data so we can test the model
# No need to go into final code 
train_X, train_y = prepare_training_data(X, Y, target_index=0, minimal_length=4)

## The networks

In [7]:
import torch.nn as nn
import torch

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers):
      super().__init__()
      self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
    #   self.fc1 = torch.nn.Linear(hidden_size, 3)
      self.fc1 = torch.nn.Linear(hidden_size, 1)

  def forward(self, x):
      output, hn = self.rnn(x)
      x = self.fc1(hn[0])
    #   x = self.fc2(x)
      x = torch.sigmoid(x)
      return x

class FC(nn.Module):
  def __init__(self, input_size):
      super().__init__()
      self.fc1 = torch.nn.Linear(input_size, 50)
      self.fc2 = torch.nn.Linear(50, 20)
      self.fc3 = torch.nn.Linear(20, 10)
      self.fc4 = torch.nn.Linear(10, 1)

  def forward(self, x):
      x = torch.flatten(x, start_dim=1) # first dimension is batch 
      x = self.fc1(x)
      x = torch.sigmoid(x)
      x = self.fc2(x)
      x = torch.sigmoid(x)
      x = self.fc3(x)
      x = torch.sigmoid(x)
      x = self.fc4(x)
      x = torch.sigmoid(x)
    
      return x

In [8]:
# test 
# USING VARIABLES PREVIOUSLY LOADED INTO THE MEMORY

train_X_tensor = torch.from_numpy(train_X.astype("float32"))
train_y_tensor = torch.from_numpy(train_y.astype("float32"))

input_dimension = train_X.shape[2]
net = RNN(input_dimension, 5, 2)
net(train_X_tensor[10:20]) 

tensor([[0.7076],
        [0.7131],
        [0.7114],
        [0.7085],
        [0.6459],
        [0.6243],
        [0.6957],
        [0.7096],
        [0.7107],
        [0.7075]], grad_fn=<SigmoidBackward0>)

# The trainer 

In [9]:
def train(net, X, y, learning_rate, momentum, batch_size, print_batch_size):
    """

    net: a torch.nn instance
    X: 3D PyTorch Tensor, [sample, wave, feature]
    y: 2D PyTorch Tensor, Nx1, [sample, binary label]
    """
    import torch.optim as optim

    criterion = nn.BCELoss() 
    # criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    batch_size = batch_size
    print_batch_size = print_batch_size

    loss_log = [] 

    for epoch in range(3):  # loop over the dataset multiple times

        running_loss = 0.0
        for i in range(0, X.shape[0], batch_size):

            optimizer.zero_grad()

            batch_X, batch_y = X[i:i+batch_size], y[i: i+batch_size]

            # forward + backward + optimize
            prediction = net.forward(batch_X)
            loss = criterion(prediction, batch_y)
            
            # update weights at the end of each batch
            loss.backward()
            optimizer.step() 

            # log loss
            loss_batch = loss.item()
            running_loss += loss_batch # accumulated loss of many batches until print 
            loss_log.append(loss_batch)

            # print statistics
            if i % print_batch_size == 0 and i / print_batch_size > 1:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / (print_batch_size/batch_size):.7f}')
                running_loss = 0.0

    return loss_log

In [10]:
# Test the network training 
# USING VARIABLES PREVIOUSLY PLACE IN THE NOTEBOOK

input_dimension = train_X_tensor.shape[2]
net = RNN(input_dimension, 5, 2)

loss_log = train(net, train_X_tensor, train_y_tensor, 
                    learning_rate=0.01, 
                    momentum=0.9,
                    batch_size=32, print_batch_size=32*400)

[1, 25601] loss: 1.2685060
[1, 38401] loss: 0.6194438
[2, 25601] loss: 1.2657085
[2, 38401] loss: 0.6191720
[3, 25601] loss: 1.2645060
[3, 38401] loss: 0.6185363


## Experiments

In [None]:
def exp_one_target_one_arch(use_feature_categories: typing.List[str], csv_path:str, 
                            target_index:int, minimal_length:int, arch:str, 
                            learning_rate:float, momentum:float, 
                            batch_size:int, print_batch_size:int):

    [X, Y] = preprocessing.main(use_feature_categories= use_feature_categories,
                csv_file=csv_path, dump=False)    

    train_X, train_y = prepare_training_data(X, Y, 
                                target_index=target_index, 
                                minimal_length=4)

    # numpy float 64 for PyTorch float 32
    train_X = torch.from_numpy(train_X.astype("float32"))
    train_y = torch.from_numpy(train_y.astype("float32"))

    input_dimension = train_X.shape[2]
    if arch == "RNN":
        net = RNN(input_dimension, 5, 2)
    elif arch == "FC":
        net = FC(input_dimension)

    loss_log = train(net, train_X, train_y, 
                     learning_rate=learning_rate, 
                     momentum=momentum,
                     batch_size=batch_size, print_batch_size=print_batch_size)

In [12]:
# Test using one condition 

feature_combination = ["socio-demographics", "health", "social", "neighbourhood"]
CSV_raw = "../ML_social.csv"
    
loss_log = exp_one_target_one_arch(use_feature_categories=feature_combination, 
                        csv_path=CSV_raw,
                        target_index=0, minimal_length=4, 
                        arch="RNN", learning_rate=0.001, momentum=0.9,
                        batch_size=32, print_batch_size=32*10)

  df = pandas.read_csv(csv_file)
100%|██████████| 29648/29648 [00:07<00:00, 4183.83it/s]
100%|██████████| 26178/26178 [00:30<00:00, 848.66it/s]


(4, 18) (4, 4)
[1,   641] loss: 1.4987940
[1,   961] loss: 0.6816145
[1,  1281] loss: 0.6550148
[1,  1601] loss: 0.6518805
[1,  1921] loss: 0.6256426
[1,  2241] loss: 0.6523841
[1,  2561] loss: 0.6342107
[1,  2881] loss: 0.6464762
[1,  3201] loss: 0.6457524
[1,  3521] loss: 0.6384335
[1,  3841] loss: 0.6361469
[1,  4161] loss: 0.6024196
[1,  4481] loss: 0.6382607
[1,  4801] loss: 0.6364564
[1,  5121] loss: 0.6012394
[1,  5441] loss: 0.6619647
[1,  5761] loss: 0.6275776
[1,  6081] loss: 0.6146175
[1,  6401] loss: 0.6561692
[1,  6721] loss: 0.6373615
[1,  7041] loss: 0.6207815
[1,  7361] loss: 0.6103640
[1,  7681] loss: 0.6402642
[1,  8001] loss: 0.6514095
[1,  8321] loss: 0.6123548
[1,  8641] loss: 0.6341721
[1,  8961] loss: 0.6271838
[1,  9281] loss: 0.6134559
[1,  9601] loss: 0.6192672
[1,  9921] loss: 0.6035549
[1, 10241] loss: 0.6277377
[1, 10561] loss: 0.6197450
[1, 10881] loss: 0.5947810
[1, 11201] loss: 0.6086776
[1, 11521] loss: 0.6133608
[1, 11841] loss: 0.6254957
[1, 12161] lo

### Complete loop

In [14]:
feature_combinations = [
    ["socio-demographics"],
    ["health"],
    ["social"],
    ["neighbourhood"],
    # ["socio-demographics", "health", "social", "neighbourhood"]
    ["health", "social"],
    ["social", "neighbourhood"],
    ["socio-demographics", "health"]
]

CSV_raw = "../ML_social.csv"
for use_feature_categories in feature_combinations:
    for target_index in range(4):
        print(f"Training for {use_feature_categories} on target {target_index}")

        # loss_log = exp_one_target_one_arch(use_feature_categories=feature_combination, 
        #                 csv_path=CSV_raw,
        #                 target_index=0, minimal_length=4, 
        #                 arch="RNN", learning_rate=0.001, momentum=0.9,
        #                 batch_size=32, print_batch_size=32*10)

        loss_log = exp_one_target_one_arch(use_feature_categories=use_feature_categories, 
                        csv_path=CSV_raw,
                        target_index=0, minimal_length=4, 
                        arch="RNN", learning_rate=0.01, momentum=0.9,
                        batch_size=32, print_batch_size=32*300)

NameError: name 'target_index' is not defined