In [1]:
import pandas as pd
import numpy as np
import torch
torch.manual_seed(0) # make sure the initial hidden state of RNN part keeps the same
from torch.utils.data import Dataset, DataLoader
import os
from numpy.linalg import norm
from matching.games import HospitalResident
from sklearn.metrics import pairwise
from sklearn.model_selection import KFold
from torch.utils.data import SubsetRandomSampler
from torch import optim

In [2]:
# Load the first EHR table from the CSV next to the notebook and preview its first rows.
data_EHR1 = pd.read_csv("./data_EHR1.csv")
data_EHR1.head()

Unnamed: 0,f1_1,f2_1,f3_1,f4_1,f5_1,f6_1,f7_1,f8_1,f9_1,f10_1,f11_1,f12_1,f13_1,f14_1,f15_1,f16_1,f17_1,f18_1,f19_1,f20_1
0,1.696794,-0.621645,-2.350322,0.857568,0.18348,0.929362,1.66271,3.002824,-0.952821,0.780778,-2.149981,-0.096125,0.686564,0.932175,1.035598,0.100794,0.472664,0.961984,-1.581007,0.418345
1,4.490176,-1.369651,-6.381362,2.446793,0.648104,5.127069,3.632748,6.645584,-1.694836,1.961207,-5.39369,-1.725572,1.115977,2.599284,1.849126,0.149453,1.651858,2.380088,-2.98952,1.523643
2,6.195791,-1.803062,-8.885239,1.678346,1.484633,7.669556,5.218378,9.897765,-2.370407,2.332558,-7.979431,-2.123378,1.262524,3.922189,2.477547,1.246901,1.795737,2.639566,-4.948484,1.801274
3,5.943051,0.117324,-11.007327,0.045042,2.02657,7.370177,6.339163,11.226333,-2.77823,3.80461,-8.785526,-1.515712,0.418063,2.784484,2.132019,1.920771,1.803917,2.866873,-4.924506,1.716181
4,1.79269,2.802358,-5.212722,-1.270905,0.897933,4.450223,2.796456,7.54319,-1.245164,1.26586,-4.883092,-0.436753,-0.691122,2.223674,-0.473904,0.735597,1.970611,0.547733,-3.001847,0.29912


In [3]:
# Check the table size; it must hold 2500 * 24 rows and 20 columns for the later reshape.
data_EHR1.shape

(60000, 20)

In [4]:
# Load the second EHR table from the CSV next to the notebook and preview its first rows.
data_EHR2 = pd.read_csv("./data_EHR2.csv")
data_EHR2.head()

Unnamed: 0,f1_2,f2_2,f3_2,f4_2,f5_2,f15_2,f14_2,f6_2,f17_2,f9_2,f7_2,f12_2,f20_2,f19_2,f11_2,f18_2,f10_2,f16_2,f8_2,f13_2
0,1.002464,-0.05477,-1.607723,0.413153,0.52649,0.808553,0.521133,1.489252,1.09387,-0.143866,1.521181,-1.214374,-0.054487,-0.620618,-0.16542,0.877329,0.684882,-0.041809,2.285389,-0.688339
1,-0.208255,-0.430312,3.54231,0.273496,0.523185,1.403259,0.807507,-0.765867,0.872294,0.744647,-0.662414,0.211199,-0.622928,0.033095,1.369378,1.72168,-1.527514,-0.234272,0.20016,-0.147486
2,-3.177253,-0.359707,7.92868,0.673136,-0.53966,0.083743,0.46775,-3.756303,-0.516528,1.122243,-4.087978,-0.085167,-1.435108,-0.203887,3.23739,1.130163,-3.310383,-1.228359,-3.529958,0.776517
3,-3.293687,-0.708697,6.471826,0.595019,-2.137115,-0.089151,-1.102489,-3.33602,-0.365526,0.826688,-4.684949,-0.328677,-1.541318,-0.260947,2.497054,-0.693162,-2.074722,-1.027206,-3.263632,0.990452
4,0.941395,-1.305723,3.126918,-0.427142,-2.480852,-0.253852,-1.456013,-1.420847,0.261801,0.014554,-1.940401,-0.648343,-1.986704,-0.346978,-8.1e-05,-1.558701,-0.040872,-0.74887,-0.606107,1.572


In [5]:
# Check the table size; it must hold 2500 * 24 rows and 20 columns for the later reshape.
data_EHR2.shape

(60000, 20)

# Get the gold-standard matching (as a permutation matrix) for the unmapped features in the 2 EHRs

In [6]:
# Names of the unmapped features in each EHR: every column from position 5 onward.
ump_f_EHR1 = data_EHR1.columns[5:].tolist()
ump_f_EHR2 = data_EHR2.columns[5:].tolist()

for ump_names in (ump_f_EHR1, ump_f_EHR2):
    print(ump_names)

['f6_1', 'f7_1', 'f8_1', 'f9_1', 'f10_1', 'f11_1', 'f12_1', 'f13_1', 'f14_1', 'f15_1', 'f16_1', 'f17_1', 'f18_1', 'f19_1', 'f20_1']
['f15_2', 'f14_2', 'f6_2', 'f17_2', 'f9_2', 'f7_2', 'f12_2', 'f20_2', 'f19_2', 'f11_2', 'f18_2', 'f10_2', 'f16_2', 'f8_2', 'f13_2']


In [7]:
# Drop the leading "f" and the trailing "_<EHR id>" so only the feature number is compared.
feature_ids_1 = np.array([name[1:-2] for name in ump_f_EHR1], dtype=str)
feature_ids_2 = np.array([name[1:-2] for name in ump_f_EHR2], dtype=str)

# p[i, j] is 1 exactly when unmapped feature i of EHR1 and unmapped feature j of EHR2
# carry the same feature number, and 0 otherwise.
p = (feature_ids_1[:, None] == feature_ids_2[None, :]).astype(float)
p

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

# Train seq-to-seq models and use each learned transformation function as the feature's fingerprint

In [8]:
# Reshape each table from (60000, 20) to (2500, 24, 20).
# NOTE(review): this assumes the rows are stored so that every consecutive run of 24 rows
# belongs to one patient (2500 patients x 24 time steps) — confirm against how the CSVs were generated.
data_1 = data_EHR1.to_numpy().reshape((2500, 24, 20))
data_2 = data_EHR2.to_numpy().reshape((2500, 24, 20))

### Define Dataset
* input: unmapped feature i in 2 EHRS (totally 15 unmapped features in 2 EHRs, the last 15 columns in data_1/data_2)
* target: pre-mapped features in 2 EHRS (the first 5 columns)


In [9]:
class Seq2seq_Dataset(Dataset):
    """Per-patient dataset of (mapped features, unmapped features) tensor pairs.

    Args:
        data: array indexed as ``data[patient, time_step, feature]``. The first
            5 features are treated as the mapped (target) features and the
            remaining ones as the unmapped (input) features.
        EHR_version: tag used only to name the on-disk tensor caches
            ``./mp_f_tensors_<EHR_version>.pt`` and ``./ump_f_tensors_<EHR_version>.pt``.

    The converted float tensors are cached on disk. A cache is reused only when
    its number of patients and per-patient shape agree with ``data``; otherwise
    it is rebuilt from ``data`` and overwritten. The values themselves are not
    compared, so delete the cache files when the data changes but its shape
    does not.
    """

    def __init__(self, data, EHR_version):
        self.patients_num = data.shape[0]

        # load mp_features (first 5 columns)
        self.mp_features_list = self._load_or_build(
            data[:, :, :5], "./mp_f_tensors_" + str(EHR_version) + ".pt", "mapped")

        # load ump_features (remaining columns)
        self.ump_features_list = self._load_or_build(
            data[:, :, 5:], "./ump_f_tensors_" + str(EHR_version) + ".pt", "unmapped")

    @staticmethod
    def _load_or_build(feature_block, cache_path, label):
        """Return one float tensor per patient, read from ``cache_path`` when it is usable."""
        patients_num = feature_block.shape[0]
        if os.path.exists(cache_path):
            # NOTE: torch.load unpickles the file; only point this at caches written by this class.
            cached = torch.load(cache_path)
            same_length = len(cached) == patients_num
            same_shape = patients_num == 0 or (
                same_length and tuple(cached[0].shape) == tuple(feature_block.shape[1:]))
            if same_length and same_shape:
                print("Finish loading " + label + " features' tensors!")
                return cached
            # A cache written for a different dataset would silently serve wrong samples.
            print("Cached " + label + " features' tensors do not match the data; rebuilding.")

        tensors = [torch.tensor(feature_block[i]).float() for i in range(patients_num)]
        torch.save(tensors, cache_path)
        print("Finish transforming " + label + " features' tensors!")
        return tensors

    def __len__(self):
        return self.patients_num

    def __getitem__(self, idx):
        """Return ``(mapped, unmapped)`` for patient ``idx``.

        With the notebook's data the shapes are [24, 5] and [24, 15].
        """
        return self.mp_features_list[idx], self.ump_features_list[idx]
        
        

### Define utility functions used in below validation process ( function: val(val_loader, model, batch_size) )

* Get_ump_mp_matrix() : get the ump-predicted_mp matrix

In [10]:
def Get_ump_mp_matrix(num_ump_f, num_mp_f, seq_len, models_list_ump, val_dataloader):
    """Build the fingerprint matrix: one averaged, flattened prediction per unmapped feature.

    For every unmapped feature ``ump_id`` the model ``models_list_ump[ump_id]``
    is put in eval mode and run over ``val_dataloader``. Its predictions (one
    ``[seq_len, num_mp_f]`` block per sample) are flattened and averaged over
    all samples the loader yields.

    Args:
        num_ump_f: number of unmapped features (rows of the result).
        num_mp_f: number of mapped features predicted per time step.
        seq_len: number of time steps per sample.
        models_list_ump: models indexed by unmapped-feature id; each is called
            on a batch and must return a dict with a ``"predicts"`` tensor.
        val_dataloader: iterable of batches.

    Returns:
        ``np.ndarray`` of shape ``[num_ump_f, seq_len * num_mp_f]``. A row stays
        all zeros when the loader yields no samples.
    """
    flat_dim = seq_len * num_mp_f
    ump_mp_matrix = np.zeros((num_ump_f, flat_dim))

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for ump_id in range(num_ump_f):
            cur_model = models_list_ump[ump_id]
            cur_model.eval()
            samples_seen = 0
            for data in val_dataloader:
                cur_pred_map = cur_model(data)["predicts"]
                # -1 lets the batch dimension vary (partial last batch, other batch sizes).
                flat_preds = cur_pred_map.detach().cpu().numpy().reshape((-1, flat_dim))
                ump_mp_matrix[ump_id] += flat_preds.sum(axis=0)
                samples_seen += flat_preds.shape[0]

            # Average over the samples actually iterated. len(val_dataloader.dataset)
            # would be the whole dataset even when a sampler restricts the loader to a subset.
            if samples_seen > 0:
                ump_mp_matrix[ump_id] /= samples_seen

    return ump_mp_matrix




* Matching_via_HRM(): Based on above cos_sim_matrix, apply Gale-Shapley Algorithm to get correct_matched number and match_matrix

In [52]:
def Matching_via_HRM(C_X1_train, C_X2_train, P_x1_O_to_R, num_mapped_axis):
    """Match unmapped features of two datasets by solving two hospital-resident games.

    Rows of a similarity matrix are labelled "R1", "R2", ... and columns
    "C1", "C2", ...; every player ranks the other side by decreasing similarity.
    In the game built from ``C_X1_train`` the rows are the hospitals (capacity 1)
    and the columns the residents. In the game built from ``C_X2_train`` the
    columns are the hospitals (capacity 1) and the rows the residents.

    Args:
        C_X1_train: similarity matrix, rows = unmapped features of dataset 1,
            columns = unmapped features of dataset 2.
        C_X2_train: similarity matrix, rows = unmapped features of dataset 2,
            columns = unmapped features of dataset 1.
        P_x1_O_to_R: ground-truth 0/1 matrix, compared row by row with both
            match matrices.
        num_mapped_axis: not used by this function.

    Returns:
        ``(num_correct_from_x1, num_correct_from_x2, matching_x1_train_matrix,
        matching_x2_train_matrix)``. Both matrices are 0/1 with rows for
        dataset-1 features and columns for dataset-2 features.

    NOTE(review): the values of each solved matching are read positionally, so
    this presumably relies on ``solve()`` listing hospitals in the order their
    preference dictionaries were built — confirm against the ``matching`` package.
    """

    def ranked_labels(scores, prefix):
        # 1-based partner labels, ordered from the highest to the lowest score.
        return [prefix + str(position + 1) for position in np.argsort(-scores)]

    def first_partner_numbers(matching):
        # For every hospital: the number in its first resident's label, or None if unmatched.
        numbers = []
        for partners in matching.values():
            numbers.append(int(str(partners[0])[1:]) if partners else None)
        return numbers

    # ---------- game built from C_X1_train: rows are hospitals ----------
    row_count_1, col_count_1 = C_X1_train.shape[0], C_X1_train.shape[1]
    true_features_pref_X1_train = {
        "R" + str(row + 1): ranked_labels(C_X1_train[row, :], "C") for row in range(row_count_1)
    }
    capacities_X1_train = {"R" + str(row + 1): 1 for row in range(row_count_1)}
    cross_recon_features_pref_X1_train = {
        "C" + str(col + 1): ranked_labels(C_X1_train[:, col], "R") for col in range(col_count_1)
    }
    # create_from_dictionaries(resident_prefs, hospital_prefs, capacities)
    game_X1_train = HospitalResident.create_from_dictionaries(
        cross_recon_features_pref_X1_train, true_features_pref_X1_train, capacities_X1_train
    )

    # ---------- game built from C_X2_train: columns are hospitals ----------
    row_count_2, col_count_2 = C_X2_train.shape[0], C_X2_train.shape[1]
    true_features_pref_X2_train = {
        "R" + str(row + 1): ranked_labels(C_X2_train[row, :], "C") for row in range(row_count_2)
    }
    cross_recon_features_pref_X2_train = {
        "C" + str(col + 1): ranked_labels(C_X2_train[:, col], "R") for col in range(col_count_2)
    }
    capacities_X2_train = {"C" + str(col + 1): 1 for col in range(col_count_2)}
    game_X2_train = HospitalResident.create_from_dictionaries(
        true_features_pref_X2_train, cross_recon_features_pref_X2_train, capacities_X2_train
    )

    # ---------- solve both games ----------
    matching_x1_train = game_X1_train.solve()
    matching_x2_train = game_X2_train.solve()
    x1_train_y = first_partner_numbers(matching_x1_train)
    x2_train_y = first_partner_numbers(matching_x2_train)

    # ---------- 0/1 match matrices, both [# ump features in d1, # ump features in d2] ----------
    matching_x1_train_matrix = np.zeros(C_X1_train.shape)
    matching_x2_train_matrix = np.zeros(np.transpose(C_X2_train).shape)
    for match_matrix, partner_numbers in (
        (matching_x1_train_matrix, x1_train_y),
        (matching_x2_train_matrix, x2_train_y),
    ):
        for row in range(match_matrix.shape[0]):
            partner = partner_numbers[row]
            if partner is not None:
                # labels are 1-based, matrix columns are 0-based
                match_matrix[row, partner - 1] = 1

    # ---------- a row counts as correct only if it equals the ground-truth row exactly ----------
    num_correct_from_x1 = 0
    num_correct_from_x2 = 0
    for row in range(P_x1_O_to_R.shape[0]):
        num_correct_from_x1 += int(np.all(P_x1_O_to_R[row] == matching_x1_train_matrix[row]))
        num_correct_from_x2 += int(np.all(P_x1_O_to_R[row] == matching_x2_train_matrix[row]))

    return num_correct_from_x1, num_correct_from_x2, matching_x1_train_matrix, matching_x2_train_matrix


* F1_score(): get the performance metric for validation process

In [12]:
def F1_score(p, x1_match_matrix_test, x2_match_matrix_test):
    """F1 score of each predicted 0/1 match matrix against the ground truth ``p``.

    An entry is a true positive when both matrices hold 1, a false negative when
    ``p`` holds 1 and the prediction 0, and a false positive when ``p`` holds 0
    and the prediction 1. F1 = 2*TP / (2*TP + FN + FP).

    Args:
        p: ground-truth 0/1 matrix.
        x1_match_matrix_test: first predicted 0/1 matrix, at least as large as ``p``.
        x2_match_matrix_test: second predicted 0/1 matrix, at least as large as ``p``.

    Returns:
        ``(F1_fromx1, F1_fromx2)`` as Python floats. A score is 0.0 when there
        are no true positives, false negatives or false positives at all, so
        the ratio would be 0/0.
    """

    def _f1(truth, predicted):
        # Score one prediction; only the region covered by `truth` is compared.
        truth = np.asarray(truth)
        predicted = np.asarray(predicted)[:truth.shape[0], :truth.shape[1]]
        true_pos = int(np.sum((truth == 1) & (predicted == 1)))
        false_neg = int(np.sum((truth == 1) & (predicted == 0)))
        false_pos = int(np.sum((truth == 0) & (predicted == 1)))
        denominator = 2 * true_pos + false_neg + false_pos
        if denominator == 0:
            return 0.0
        return (2 * true_pos) / denominator

    F1_fromx1 = _f1(p, x1_match_matrix_test)
    F1_fromx2 = _f1(p, x2_match_matrix_test)
    return F1_fromx1, F1_fromx2

### Define train() val() function in k-fold cross validation:
* train()
* val(): return the F1 score of predicting matched pairs

In [53]:
def train(train_loader, model, optimizer):
    """Run one pass over ``train_loader``, taking one optimizer step per batch.

    The model is switched to training mode first. For each batch the gradients
    are cleared, the model's ``"loss"`` output is back-propagated and the
    optimizer is stepped.
    """
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        model_outputs = model(batch)
        model_outputs["loss"].backward()
        optimizer.step()


def val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, models_list_1, models_list_2, true_p):
    """Score how well the trained models recover the true feature matching.

    Steps: build one fingerprint matrix per EHR from its models and validation
    loader, compute the cosine similarity between the fingerprints in both
    directions, match the features with ``Matching_via_HRM`` and score both
    match matrices against ``true_p`` with ``F1_score``.

    Returns:
        ``(f1_fromx1, f1_fromx2)``.
    """
    fingerprints_1 = Get_ump_mp_matrix(num_ump_f_1, num_mp_f, seq_len, models_list_1, val_loader_1)
    fingerprints_2 = Get_ump_mp_matrix(num_ump_f_2, num_mp_f, seq_len, models_list_2, val_loader_2)

    similarity_1_to_2 = pairwise.cosine_similarity(fingerprints_1, fingerprints_2)
    similarity_2_to_1 = pairwise.cosine_similarity(fingerprints_2, fingerprints_1)

    # Only the two match matrices are needed here; the correct-match counts are ignored.
    matching_outputs = Matching_via_HRM(similarity_1_to_2, similarity_2_to_1, true_p, num_mp_f)
    x1_match_matrix, x2_match_matrix = matching_outputs[2], matching_outputs[3]

    return F1_score(true_p, x1_match_matrix, x2_match_matrix)
            

### Use all the data in 5-fold cross-validation to get each ablation model's average performance, then select the best model and train it on the whole dataset to slightly improve the final model's performance

In [14]:
from torch.utils.data import random_split, DataLoader


# Wrap both reshaped EHR arrays. The second argument only tags the on-disk tensor cache
# files, which are loaded instead of the array whenever they already exist.
dataset_1_all = Seq2seq_Dataset(data_1, 1)
dataset_2_all = Seq2seq_Dataset(data_2, 2)

Finish loading mapped features' tensors!
Finish loading unmapped features' tensors!
Finish loading mapped features' tensors!
Finish loading unmapped features' tensors!


### Create kfold_cv_dl_list_1, kfold_cv_dl_list_2 to store train_idx and val_idx for each fold of 2 EHRs respectively

In [15]:
k_folds = 5
splits = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# One {"train_idx", "val_idx"} dict per fold and per EHR. Both lists come from the same
# seeded splitter, each applied to the sample positions of its own dataset.
kfold_cv_dl_list_1 = [
    {"train_idx": train_idx, "val_idx": val_idx}
    for train_idx, val_idx in splits.split(np.arange(len(dataset_1_all)))
]
kfold_cv_dl_list_2 = [
    {"train_idx": train_idx, "val_idx": val_idx}
    for train_idx, val_idx in splits.split(np.arange(len(dataset_2_all)))
]

# Define model_2: feed the output(the final hidden state of the last layer on each time stamp, with shape: [batch_size, seq_len(24), hidden_dim]) to MLP to predict mapped time series data

In [17]:
import torch.nn as nn
class Seq2seq_model_pass_final_hs_each_time(nn.Module):
    """Single-layer RNN followed by a per-time-step MLP, trained on ONE unmapped feature.

    The RNN reads the time series of unmapped feature ``ump_feature_id``; its
    hidden state at every time step is passed through a 4-layer MLP (ReLU and
    dropout 0.2 after each of the first three layers) to predict the
    ``num_mp_f`` mapped features at that time step.

    Args:
        ump_feature_id: index of the unmapped feature this model reads.
        batch_size: largest batch the model will see; sizes the fixed initial hidden state.
        input_dim: RNN input size (1 when a single feature is fed).
        hidden_dim: RNN hidden size.
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features to predict.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(Seq2seq_model_pass_final_hs_each_time, self).__init__()
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed (not trained) random initial hidden state, one row per batch position.
        # It is a plain attribute, so it is neither saved in state_dict nor moved by .to().
        self.h_0 = torch.randn(1, self.batch_size, hidden_dim)
        self.RNN = nn.RNN(input_dim, hidden_dim, batch_first=True)

        self.target_dim = num_mp_f

        # MLP applied independently at every time step.
        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.MLP_drop1 = nn.Dropout(p=0.2)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.MLP_drop2 = nn.Dropout(p=0.2)
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.MLP_drop3 = nn.Dropout(p=0.2)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: pair ``(mapped, unmapped)`` of tensors shaped
                ``[batch, seq_len, num_mp_f]`` and ``[batch, seq_len, n_unmapped]``.
                ``batch`` may be smaller than the constructor's ``batch_size``
                (for example the last, partial batch of a loader).

        Returns:
            dict with ``"predicts"`` (``[batch, seq_len, num_mp_f]``) and ``"loss"`` (scalar).

        Raises:
            RuntimeError: if the batch is larger than the constructor's ``batch_size``.
        """
        true_mapped_features = input_data[0]
        feature_series = input_data[1][:, :, self.ump_id]
        cur_batch = feature_series.shape[0]
        if cur_batch > self.batch_size:
            raise RuntimeError(
                "batch of %d samples exceeds the batch_size of %d the model was built for"
                % (cur_batch, self.batch_size))

        # [batch, seq_len, 1]: only the selected unmapped feature
        unmapped_features = feature_series.reshape((cur_batch, self.seq_len, 1))
        # Use as many initial-state rows as there are samples in this batch.
        h_0 = self.h_0[:, :cur_batch, :].contiguous()
        output, _ = self.RNN(unmapped_features, h_0)
        # output: [batch, seq_len, hidden_dim]

        map_predict = self.MLP_drop1(torch.relu(self.dense1(output)))
        map_predict = self.MLP_drop2(torch.relu(self.dense2(map_predict)))
        map_predict = self.MLP_drop3(torch.relu(self.dense3(map_predict)))
        map_predict = self.dense4(map_predict)  # [batch, seq_len, num_mp_f]

        loss = nn.functional.mse_loss(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}
        
        
        

# K-fold cross validation on the second model

In [55]:
# Experiment configuration; the ablation cells further down reuse these names.
num_epochs = 10
batch_size = 10
input_dim = 1 # each unmapped feature
hidden_dim_list = [5, 10, 15] # hidden dim of RNN model
num_ump_f_1 = 15
num_ump_f_2 = 15
seq_len = 24
num_mp_f = 5
lr = 0.002

for tune_time, hidden_dim in enumerate(hidden_dim_list):
    kfold_results_1_model2 = []
    kfold_results_2_model2 = []
    for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
        train_idx_1 = idx_list[0]["train_idx"]
        val_idx_1 = idx_list[0]["val_idx"]
        train_idx_2 = idx_list[1]["train_idx"]
        val_idx_2 = idx_list[1]["val_idx"]

        # Samplers restrict each loader to this fold's indices of the full dataset.
        train_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_1))
        val_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_1))

        train_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_2))
        val_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_2))

        trained_models_list_1 = []
        trained_models_list_2 = []

        # One model per unmapped feature of EHR 1.
        for ump_id in range(num_ump_f_1):
            model = Seq2seq_model_pass_final_hs_each_time(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_1, model, optimizer)
            trained_models_list_1.append(model)

        # One model per unmapped feature of EHR 2.
        for ump_id in range(num_ump_f_2):
            model = Seq2seq_model_pass_final_hs_each_time(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_2, model, optimizer)
            trained_models_list_2.append(model)

        f1_fromx1, f1_fromx2 = val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, trained_models_list_1, trained_models_list_2, p)
        kfold_results_1_model2.append(f1_fromx1)
        kfold_results_2_model2.append(f1_fromx2)
    print("\n ------- When hidden_dim equals to: ", hidden_dim, "--------- \n")
    print("avg_f1_fromx1: ", np.mean(kfold_results_1_model2), " std_f1_fromx1: ", np.std(kfold_results_1_model2))
    # Fixed label: this line reports the x2 results (it used to print "std_f1_fromx1").
    print("avg_f1_fromx2: ", np.mean(kfold_results_2_model2), " std_f1_fromx2: ", np.std(kfold_results_2_model2))
    
    


 ------- When hidden_dim equals to:  5 --------- 

avg_f1_fromx1:  0.10666666666666666  std_f1_fromx1:  0.05333333333333334
avg_f1_fromx2:  0.10666666666666666  std_f1_fromx1:  0.05333333333333334

 ------- When hidden_dim equals to:  10 --------- 

avg_f1_fromx1:  0.12  std_f1_fromx1:  0.11469767022723501
avg_f1_fromx2:  0.12  std_f1_fromx1:  0.11469767022723501

 ------- When hidden_dim equals to:  15 --------- 

avg_f1_fromx1:  0.06666666666666668  std_f1_fromx1:  0.05962847939999439
avg_f1_fromx2:  0.06666666666666668  std_f1_fromx1:  0.05962847939999439


# Ablation study 1: Add more RNN layers(tested on 3 stacking layers first) based on model2's structure

In [24]:
class ablation_model_more_rnn_layers(nn.Module):
    """Ablation 1: three stacked RNN layers followed by a per-time-step MLP.

    The RNN (3 layers, dropout 0.2 between them) reads the time series of
    unmapped feature ``ump_feature_id``; the top layer's hidden state at every
    time step is passed through a 4-layer MLP (ReLU and dropout 0.2 after each
    of the first three layers) to predict the ``num_mp_f`` mapped features.

    Args:
        ump_feature_id: index of the unmapped feature this model reads.
        batch_size: largest batch the model will see; sizes the fixed initial hidden state.
        input_dim: RNN input size (1 when a single feature is fed).
        hidden_dim: RNN hidden size.
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features to predict.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(ablation_model_more_rnn_layers, self).__init__()
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed (not trained) random initial hidden state; 3 is the number of stacked layers.
        # It is a plain attribute, so it is neither saved in state_dict nor moved by .to().
        self.h_0 = torch.randn(3, self.batch_size, hidden_dim)
        self.RNN = nn.RNN(input_dim, hidden_dim, num_layers=3, batch_first=True, dropout=0.2)

        self.target_dim = num_mp_f

        # MLP applied independently at every time step.
        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.MLP_drop1 = nn.Dropout(p=0.2)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.MLP_drop2 = nn.Dropout(p=0.2)
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.MLP_drop3 = nn.Dropout(p=0.2)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: pair ``(mapped, unmapped)`` of tensors shaped
                ``[batch, seq_len, num_mp_f]`` and ``[batch, seq_len, n_unmapped]``.
                ``batch`` may be smaller than the constructor's ``batch_size``
                (for example the last, partial batch of a loader).

        Returns:
            dict with ``"predicts"`` (``[batch, seq_len, num_mp_f]``) and ``"loss"`` (scalar).

        Raises:
            RuntimeError: if the batch is larger than the constructor's ``batch_size``.
        """
        true_mapped_features = input_data[0]
        feature_series = input_data[1][:, :, self.ump_id]
        cur_batch = feature_series.shape[0]
        if cur_batch > self.batch_size:
            raise RuntimeError(
                "batch of %d samples exceeds the batch_size of %d the model was built for"
                % (cur_batch, self.batch_size))

        # [batch, seq_len, 1]: only the selected unmapped feature
        unmapped_features = feature_series.reshape((cur_batch, self.seq_len, 1))
        # Use as many initial-state rows as there are samples in this batch.
        h_0 = self.h_0[:, :cur_batch, :].contiguous()
        output, _ = self.RNN(unmapped_features, h_0)
        # output: [batch, seq_len, hidden_dim]

        map_predict = self.MLP_drop1(torch.relu(self.dense1(output)))
        map_predict = self.MLP_drop2(torch.relu(self.dense2(map_predict)))
        map_predict = self.MLP_drop3(torch.relu(self.dense3(map_predict)))
        map_predict = self.dense4(map_predict)  # [batch, seq_len, num_mp_f]

        loss = nn.functional.mse_loss(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}
        
        
        

In [56]:


# 5-fold cross-validation of ablation model 1 (three stacked RNN layers) for every hidden size.
for tune_time, hidden_dim in enumerate(hidden_dim_list):
    kfold_results_1_abla1 = []
    kfold_results_2_abla1 = []
    for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
        print("current fold: ", fold)
        train_idx_1 = idx_list[0]["train_idx"]
        val_idx_1 = idx_list[0]["val_idx"]
        train_idx_2 = idx_list[1]["train_idx"]
        val_idx_2 = idx_list[1]["val_idx"]

        # Samplers restrict each loader to this fold's indices of the full dataset.
        train_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_1))
        val_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_1))

        train_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_2))
        val_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_2))

        trained_models_list_1 = []
        trained_models_list_2 = []

        # One model per unmapped feature of EHR 1.
        for ump_id in range(num_ump_f_1):
            model = ablation_model_more_rnn_layers(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_1, model, optimizer)
            trained_models_list_1.append(model)

        # One model per unmapped feature of EHR 2.
        for ump_id in range(num_ump_f_2):
            model = ablation_model_more_rnn_layers(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_2, model, optimizer)
            trained_models_list_2.append(model)

        f1_fromx1, f1_fromx2 = val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, trained_models_list_1, trained_models_list_2, p)
        kfold_results_1_abla1.append(f1_fromx1)
        kfold_results_2_abla1.append(f1_fromx2)

    print("\n ------- When hidden_dim equals to: ", hidden_dim, "--------- \n")
    print("avg_f1_fromx1: ", np.mean(kfold_results_1_abla1), " std_f1_fromx1: ", np.std(kfold_results_1_abla1))
    # Fixed label: this line reports the x2 results (it used to print "std_f1_fromx1").
    print("avg_f1_fromx2: ", np.mean(kfold_results_2_abla1), " std_f1_fromx2: ", np.std(kfold_results_2_abla1))
    

current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  5 --------- 

avg_f1_fromx1:  0.04  std_f1_fromx1:  0.05333333333333333
avg_f1_fromx2:  0.04  std_f1_fromx1:  0.05333333333333333
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  10 --------- 

avg_f1_fromx1:  0.08  std_f1_fromx1:  0.049888765156985884
avg_f1_fromx2:  0.08  std_f1_fromx1:  0.049888765156985884
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  15 --------- 

avg_f1_fromx1:  0.08  std_f1_fromx1:  0.049888765156985884
avg_f1_fromx2:  0.08  std_f1_fromx1:  0.049888765156985884


# Ablation study 2: Only add one Dropout layer after the third linear layer (the layer before the last linear layer)

In [31]:
class ablation_model_only_one_dp(nn.Module):
    """Ablation 2: single-layer RNN and per-time-step MLP with ONE dropout layer.

    The RNN reads the time series of unmapped feature ``ump_feature_id``; its
    hidden state at every time step is passed through a 4-layer MLP with ReLU
    after each of the first three layers and a single dropout (0.2) right
    before the last linear layer.

    Args:
        ump_feature_id: index of the unmapped feature this model reads.
        batch_size: largest batch the model will see; sizes the fixed initial hidden state.
        input_dim: RNN input size (1 when a single feature is fed).
        hidden_dim: RNN hidden size.
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features to predict.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(ablation_model_only_one_dp, self).__init__()
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed (not trained) random initial hidden state, one row per batch position.
        # It is a plain attribute, so it is neither saved in state_dict nor moved by .to().
        self.h_0 = torch.randn(1, self.batch_size, hidden_dim)
        self.RNN = nn.RNN(input_dim, hidden_dim, batch_first=True)

        self.target_dim = num_mp_f

        # MLP applied independently at every time step.
        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.MLP_drop3 = nn.Dropout(p=0.2)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: pair ``(mapped, unmapped)`` of tensors shaped
                ``[batch, seq_len, num_mp_f]`` and ``[batch, seq_len, n_unmapped]``.
                ``batch`` may be smaller than the constructor's ``batch_size``
                (for example the last, partial batch of a loader).

        Returns:
            dict with ``"predicts"`` (``[batch, seq_len, num_mp_f]``) and ``"loss"`` (scalar).

        Raises:
            RuntimeError: if the batch is larger than the constructor's ``batch_size``.
        """
        true_mapped_features = input_data[0]
        feature_series = input_data[1][:, :, self.ump_id]
        cur_batch = feature_series.shape[0]
        if cur_batch > self.batch_size:
            raise RuntimeError(
                "batch of %d samples exceeds the batch_size of %d the model was built for"
                % (cur_batch, self.batch_size))

        # [batch, seq_len, 1]: only the selected unmapped feature
        unmapped_features = feature_series.reshape((cur_batch, self.seq_len, 1))
        # Use as many initial-state rows as there are samples in this batch.
        h_0 = self.h_0[:, :cur_batch, :].contiguous()
        output, _ = self.RNN(unmapped_features, h_0)
        # output: [batch, seq_len, hidden_dim]

        map_predict = torch.relu(self.dense1(output))
        map_predict = torch.relu(self.dense2(map_predict))
        map_predict = self.MLP_drop3(torch.relu(self.dense3(map_predict)))
        map_predict = self.dense4(map_predict)  # [batch, seq_len, num_mp_f]

        loss = nn.functional.mse_loss(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}

In [57]:


# 5-fold cross-validation of ablation model 2 (a single dropout layer) for every hidden size.
for tune_time, hidden_dim in enumerate(hidden_dim_list):
    kfold_results_1_abla2 = []
    kfold_results_2_abla2 = []
    for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
        print("current fold: ", fold)
        train_idx_1 = idx_list[0]["train_idx"]
        val_idx_1 = idx_list[0]["val_idx"]
        train_idx_2 = idx_list[1]["train_idx"]
        val_idx_2 = idx_list[1]["val_idx"]

        # Samplers restrict each loader to this fold's indices of the full dataset.
        train_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_1))
        val_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_1))

        train_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx_2))
        val_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx_2))

        trained_models_list_1 = []
        trained_models_list_2 = []

        # One model per unmapped feature of EHR 1.
        for ump_id in range(num_ump_f_1):
            model = ablation_model_only_one_dp(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_1, model, optimizer)
            trained_models_list_1.append(model)

        # One model per unmapped feature of EHR 2.
        for ump_id in range(num_ump_f_2):
            model = ablation_model_only_one_dp(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_2, model, optimizer)
            trained_models_list_2.append(model)

        f1_fromx1, f1_fromx2 = val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, trained_models_list_1, trained_models_list_2, p)
        kfold_results_1_abla2.append(f1_fromx1)
        kfold_results_2_abla2.append(f1_fromx2)

    print("\n ------- When hidden_dim equals to: ", hidden_dim, "--------- \n")
    print("avg_f1_fromx1: ", np.mean(kfold_results_1_abla2), " std_f1_fromx1: ", np.std(kfold_results_1_abla2))
    # Fixed label: this line reports the x2 results (it used to print "std_f1_fromx1").
    print("avg_f1_fromx2: ", np.mean(kfold_results_2_abla2), " std_f1_fromx2: ", np.std(kfold_results_2_abla2))
    
    
    

current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  5 --------- 

avg_f1_fromx1:  0.09333333333333334  std_f1_fromx1:  0.0679869268479038
avg_f1_fromx2:  0.09333333333333334  std_f1_fromx1:  0.0679869268479038
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  10 --------- 

avg_f1_fromx1:  0.12000000000000002  std_f1_fromx1:  0.07774602526460402
avg_f1_fromx2:  0.12000000000000002  std_f1_fromx1:  0.07774602526460402
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  15 --------- 

avg_f1_fromx1:  0.04  std_f1_fromx1:  0.05333333333333333
avg_f1_fromx2:  0.04  std_f1_fromx1:  0.05333333333333333


# Ablation study 3: Delete all the Dropout layers (only keep ReLU activation function) based on model2's structure

In [35]:
class ablation_model_no_dp_layers(nn.Module):
    """Ablation 3: one-layer RNN followed by a four-layer ReLU MLP without Dropout.

    Each instance reads a single unmapped feature (column ``ump_feature_id`` of
    ``input_data[1]``) and regresses the mapped features in ``input_data[0]`` at
    every time step, using an MSE loss.

    Args:
        ump_feature_id: index (last axis of ``input_data[1]``) of the unmapped
            feature this model is trained on.
        batch_size: largest batch size the model will be called with; it fixes
            the size of the stored initial hidden state ``h_0``.
        input_dim: RNN input size. ``forward`` feeds one feature per time step,
            so this has to be 1.
        hidden_dim: RNN hidden size (also the width of the first dense layer).
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features, i.e. the width of the prediction.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(ablation_model_no_dp_layers, self).__init__()
        # each model is trained on only one unmapped feature: ump_feature_id
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed, non-trainable random initial hidden state. It is drawn before the
        # layers are created so the global RNG is consumed in the same order as before.
        self.h_0 = torch.randn(1, self.batch_size, hidden_dim)
        # RNN producing one hidden state per time step
        self.RNN = nn.RNN(input_dim, hidden_dim, batch_first=True)

        # MLP mapping each hidden state to the num_mp_f mapped features
        self.target_dim = num_mp_f

        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

        # Parameter-free activation, built once instead of on every forward call.
        self.activation = nn.ReLU()

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: indexable pair ``(targets, unmapped)``. ``unmapped`` is a
                3-D tensor indexed as ``[batch, time, feature]``; ``targets`` must
                be compatible with a ``[batch, seq_len, num_mp_f]`` prediction.

        Returns:
            dict with ``"predicts"`` (tensor ``[batch, seq_len, num_mp_f]``) and
            ``"loss"`` (scalar MSE between prediction and targets).

        Raises:
            ValueError: if the batch is larger than the ``batch_size`` given at
                construction (no initial hidden state exists for the extra rows).
        """
        true_mapped_features = input_data[0]
        all_unmapped = input_data[1]

        # Use the size of the batch actually received: the last batch of a
        # DataLoader without drop_last may be smaller than self.batch_size.
        cur_batch = all_unmapped.shape[0]
        if cur_batch > self.batch_size:
            raise ValueError(
                "batch of size {} exceeds the configured batch_size {}".format(cur_batch, self.batch_size)
            )

        # keep only this model's unmapped feature: [cur_batch, seq_len, 1]
        unmapped_features = all_unmapped[:, :, self.ump_id].reshape((cur_batch, self.seq_len, 1))
        # matching slice of the stored initial hidden state: [1, cur_batch, hidden_dim]
        h_0 = self.h_0[:, :cur_batch, :].contiguous().to(unmapped_features.device)

        output, _ = self.RNN(unmapped_features, h_0)
        # output: [cur_batch, seq_len, hidden_dim]
        map_predict = self.activation(self.dense1(output))
        map_predict = self.activation(self.dense2(map_predict))
        map_predict = self.activation(self.dense3(map_predict))
        map_predict = self.dense4(map_predict)  # shape [cur_batch, seq_len, num_mp_f]

        criterion = nn.MSELoss()
        loss = criterion(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}
        

In [58]:


# Ablation 3 sweep: for every hidden size in hidden_dim_list, run the k-fold
# procedure with ablation_model_no_dp_layers and report mean/std of the two
# values returned by val() over the folds.
# Note: hidden_dim stays bound to the last swept value after this loop.
for tune_time, hidden_dim in enumerate(hidden_dim_list):
    kfold_results_1_abla3 = []
    kfold_results_2_abla3 = []
    for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
        print("current fold: ", fold)
        # idx_list[0] / idx_list[1] hold this fold's split for dataset 1 / dataset 2
        train_idx_1 = idx_list[0]["train_idx"]
        val_idx_1 = idx_list[0]["val_idx"]
        train_idx_2 = idx_list[1]["train_idx"]
        val_idx_2 = idx_list[1]["val_idx"]

        train_sampler_1 = SubsetRandomSampler(train_idx_1)
        val_sampler_1 = SubsetRandomSampler(val_idx_1)

        train_sampler_2 = SubsetRandomSampler(train_idx_2)
        val_sampler_2 = SubsetRandomSampler(val_idx_2)

        train_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=train_sampler_1)
        val_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=val_sampler_1)

        train_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=train_sampler_2)
        val_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=val_sampler_2)

        trained_models_list_1 = []
        trained_models_list_2 = []

        # one freshly initialised model per unmapped feature of dataset 1
        for ump_id in range(num_ump_f_1):
            model = ablation_model_no_dp_layers(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_1, model, optimizer)
            trained_models_list_1.append(model)

        # one freshly initialised model per unmapped feature of dataset 2
        for ump_id in range(num_ump_f_2):
            model = ablation_model_no_dp_layers(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_2, model, optimizer)
            trained_models_list_2.append(model)

        f1_fromx1, f1_fromx2 = val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, trained_models_list_1, trained_models_list_2, p)
        kfold_results_1_abla3.append(f1_fromx1)
        kfold_results_2_abla3.append(f1_fromx2)
        
    print("\n ------- When hidden_dim equals to: ", hidden_dim, "--------- \n")
    print("avg_f1_fromx1: ", np.mean(kfold_results_1_abla3), " std_f1_fromx1: ", np.std(kfold_results_1_abla3))
    # label fixed: this line reports the X2 statistics (it used to say std_f1_fromx1)
    print("avg_f1_fromx2: ", np.mean(kfold_results_2_abla3), " std_f1_fromx2: ", np.std(kfold_results_2_abla3))
    
    

current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  5 --------- 

avg_f1_fromx1:  0.05333333333333333  std_f1_fromx1:  0.02666666666666667
avg_f1_fromx2:  0.05333333333333333  std_f1_fromx1:  0.02666666666666667
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  10 --------- 

avg_f1_fromx1:  0.08  std_f1_fromx1:  0.049888765156985884
avg_f1_fromx2:  0.08  std_f1_fromx1:  0.049888765156985884
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  15 --------- 

avg_f1_fromx1:  0.08  std_f1_fromx1:  0.049888765156985884
avg_f1_fromx2:  0.08  std_f1_fromx1:  0.049888765156985884


# Ablation study 4: change the activation function in the MLP part from ReLU to LeakyReLU

In [39]:
class ablation_model_leakyReLU(nn.Module):
    """Ablation 4: one-layer RNN followed by a four-layer MLP using LeakyReLU(0.1).

    Each of the first three dense layers is followed by LeakyReLU and
    Dropout(p=0.2). Each instance reads a single unmapped feature (column
    ``ump_feature_id`` of ``input_data[1]``) and regresses the mapped features
    in ``input_data[0]`` at every time step, using an MSE loss.

    Args:
        ump_feature_id: index (last axis of ``input_data[1]``) of the unmapped
            feature this model is trained on.
        batch_size: largest batch size the model will be called with; it fixes
            the size of the stored initial hidden state ``h_0``.
        input_dim: RNN input size. ``forward`` feeds one feature per time step,
            so this has to be 1.
        hidden_dim: RNN hidden size (also the width of the first dense layer).
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features, i.e. the width of the prediction.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(ablation_model_leakyReLU, self).__init__()
        # each model is trained on only one unmapped feature: ump_feature_id
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed, non-trainable random initial hidden state. It is drawn before the
        # layers are created so the global RNG is consumed in the same order as before.
        self.h_0 = torch.randn(1, self.batch_size, hidden_dim)
        # RNN producing one hidden state per time step
        self.RNN = nn.RNN(input_dim, hidden_dim, batch_first=True)

        # MLP mapping each hidden state to the num_mp_f mapped features
        self.target_dim = num_mp_f

        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.MLP_drop1 = nn.Dropout(p=0.2)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.MLP_drop2 = nn.Dropout(p=0.2)
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.MLP_drop3 = nn.Dropout(p=0.2)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

        # Parameter-free activation, built once instead of on every forward call.
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: indexable pair ``(targets, unmapped)``. ``unmapped`` is a
                3-D tensor indexed as ``[batch, time, feature]``; ``targets`` must
                be compatible with a ``[batch, seq_len, num_mp_f]`` prediction.

        Returns:
            dict with ``"predicts"`` (tensor ``[batch, seq_len, num_mp_f]``) and
            ``"loss"`` (scalar MSE between prediction and targets).

        Raises:
            ValueError: if the batch is larger than the ``batch_size`` given at
                construction (no initial hidden state exists for the extra rows).
        """
        true_mapped_features = input_data[0]
        all_unmapped = input_data[1]

        # Use the size of the batch actually received: the last batch of a
        # DataLoader without drop_last may be smaller than self.batch_size.
        cur_batch = all_unmapped.shape[0]
        if cur_batch > self.batch_size:
            raise ValueError(
                "batch of size {} exceeds the configured batch_size {}".format(cur_batch, self.batch_size)
            )

        # keep only this model's unmapped feature: [cur_batch, seq_len, 1]
        unmapped_features = all_unmapped[:, :, self.ump_id].reshape((cur_batch, self.seq_len, 1))
        # matching slice of the stored initial hidden state: [1, cur_batch, hidden_dim]
        h_0 = self.h_0[:, :cur_batch, :].contiguous().to(unmapped_features.device)

        output, _ = self.RNN(unmapped_features, h_0)
        # output: [cur_batch, seq_len, hidden_dim]
        map_predict = self.dense1(output)
        map_predict = self.activation(map_predict)
        map_predict = self.MLP_drop1(map_predict)

        map_predict = self.dense2(map_predict)
        map_predict = self.activation(map_predict)
        map_predict = self.MLP_drop2(map_predict)

        map_predict = self.dense3(map_predict)
        map_predict = self.activation(map_predict)
        map_predict = self.MLP_drop3(map_predict)

        map_predict = self.dense4(map_predict)  # shape [cur_batch, seq_len, num_mp_f]

        criterion = nn.MSELoss()
        loss = criterion(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}
        
        
        

In [59]:


# Ablation 4 sweep: for every hidden size in hidden_dim_list, run the k-fold
# procedure with ablation_model_leakyReLU and report mean/std of the two
# values returned by val() over the folds.
# Note: hidden_dim stays bound to the last swept value after this loop.
for tune_time, hidden_dim in enumerate(hidden_dim_list):
    kfold_results_1_abla4 = []
    kfold_results_2_abla4 = []

    for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
        print("current fold: ", fold)
        # idx_list[0] / idx_list[1] hold this fold's split for dataset 1 / dataset 2
        train_idx_1 = idx_list[0]["train_idx"]
        val_idx_1 = idx_list[0]["val_idx"]
        train_idx_2 = idx_list[1]["train_idx"]
        val_idx_2 = idx_list[1]["val_idx"]

        train_sampler_1 = SubsetRandomSampler(train_idx_1)
        val_sampler_1 = SubsetRandomSampler(val_idx_1)

        train_sampler_2 = SubsetRandomSampler(train_idx_2)
        val_sampler_2 = SubsetRandomSampler(val_idx_2)

        train_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=train_sampler_1)
        val_loader_1 = DataLoader(dataset_1_all, batch_size=batch_size, sampler=val_sampler_1)

        train_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=train_sampler_2)
        val_loader_2 = DataLoader(dataset_2_all, batch_size=batch_size, sampler=val_sampler_2)

        trained_models_list_1 = []
        trained_models_list_2 = []

        # one freshly initialised model per unmapped feature of dataset 1
        for ump_id in range(num_ump_f_1):
            model = ablation_model_leakyReLU(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_1, model, optimizer)
            trained_models_list_1.append(model)

        # one freshly initialised model per unmapped feature of dataset 2
        for ump_id in range(num_ump_f_2):
            model = ablation_model_leakyReLU(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr = lr)
            for epoch in range(num_epochs):
                train(train_loader_2, model, optimizer)
            trained_models_list_2.append(model)

        f1_fromx1, f1_fromx2 = val(val_loader_1, val_loader_2, num_ump_f_1, num_ump_f_2, num_mp_f, seq_len, trained_models_list_1, trained_models_list_2, p)
        kfold_results_1_abla4.append(f1_fromx1)
        kfold_results_2_abla4.append(f1_fromx2)
        
    print("\n ------- When hidden_dim equals to: ", hidden_dim, "--------- \n")
    print("avg_f1_fromx1: ", np.mean(kfold_results_1_abla4), " std_f1_fromx1: ", np.std(kfold_results_1_abla4))
    # label fixed: this line reports the X2 statistics (it used to say std_f1_fromx1)
    print("avg_f1_fromx2: ", np.mean(kfold_results_2_abla4), " std_f1_fromx2: ", np.std(kfold_results_2_abla4))


current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  5 --------- 

avg_f1_fromx1:  0.06666666666666668  std_f1_fromx1:  0.05962847939999439
avg_f1_fromx2:  0.06666666666666668  std_f1_fromx1:  0.05962847939999439
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  10 --------- 

avg_f1_fromx1:  0.09333333333333334  std_f1_fromx1:  0.0679869268479038
avg_f1_fromx2:  0.09333333333333334  std_f1_fromx1:  0.0679869268479038
current fold:  0
current fold:  1
current fold:  2
current fold:  3
current fold:  4

 ------- When hidden_dim equals to:  15 --------- 

avg_f1_fromx1:  0.13333333333333336  std_f1_fromx1:  0.0596284793999944
avg_f1_fromx2:  0.13333333333333336  std_f1_fromx1:  0.0596284793999944


# Based on the ablation studies above, seq_to_seq_model_2.0 should have the following structure:
* a 1-layer RNN structure
* a single Dropout layer, placed after the Linear layer that precedes the last Linear layer (in the current case, nn.Dropout after the third Linear layer)
* ReLU() as the activation function


In [43]:
class seq_to_seq_model_2(nn.Module):
    """One-layer RNN followed by a four-layer ReLU MLP with a single Dropout layer.

    The only Dropout(p=0.2) sits after the third dense layer's activation. Each
    instance reads a single unmapped feature (column ``ump_feature_id`` of
    ``input_data[1]``) and regresses the mapped features in ``input_data[0]``
    at every time step, using an MSE loss.

    Args:
        ump_feature_id: index (last axis of ``input_data[1]``) of the unmapped
            feature this model is trained on.
        batch_size: largest batch size the model will be called with; it fixes
            the size of the stored initial hidden state ``h_0``.
        input_dim: RNN input size. ``forward`` feeds one feature per time step,
            so this has to be 1.
        hidden_dim: RNN hidden size (also the width of the first dense layer).
        seq_len: number of time steps per sample.
        num_mp_f: number of mapped features, i.e. the width of the prediction.
    """

    def __init__(self, ump_feature_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f):
        super(seq_to_seq_model_2, self).__init__()
        # each model is trained on only one unmapped feature: ump_feature_id
        self.ump_id = ump_feature_id
        self.batch_size = batch_size
        self.seq_len = seq_len
        # Fixed, non-trainable random initial hidden state. It is drawn before the
        # layers are created so the global RNG is consumed in the same order as before.
        self.h_0 = torch.randn(1, self.batch_size, hidden_dim)
        # RNN producing one hidden state per time step
        self.RNN = nn.RNN(input_dim, hidden_dim, batch_first=True)

        # MLP mapping each hidden state to the num_mp_f mapped features
        self.target_dim = num_mp_f

        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, int(self.target_dim * 2))
        self.dense3 = nn.Linear(int(self.target_dim * 2), self.target_dim)
        self.MLP_drop3 = nn.Dropout(p=0.2)
        self.dense4 = nn.Linear(self.target_dim, self.target_dim)

        # Parameter-free activation, built once instead of on every forward call.
        self.activation = nn.ReLU()

    def forward(self, input_data):
        """Predict the mapped features and compute the MSE loss.

        Args:
            input_data: indexable pair ``(targets, unmapped)``. ``unmapped`` is a
                3-D tensor indexed as ``[batch, time, feature]``; ``targets`` must
                be compatible with a ``[batch, seq_len, num_mp_f]`` prediction.

        Returns:
            dict with ``"predicts"`` (tensor ``[batch, seq_len, num_mp_f]``) and
            ``"loss"`` (scalar MSE between prediction and targets).

        Raises:
            ValueError: if the batch is larger than the ``batch_size`` given at
                construction (no initial hidden state exists for the extra rows).
        """
        true_mapped_features = input_data[0]
        all_unmapped = input_data[1]

        # Use the size of the batch actually received: the last batch of a
        # DataLoader without drop_last may be smaller than self.batch_size.
        cur_batch = all_unmapped.shape[0]
        if cur_batch > self.batch_size:
            raise ValueError(
                "batch of size {} exceeds the configured batch_size {}".format(cur_batch, self.batch_size)
            )

        # keep only this model's unmapped feature: [cur_batch, seq_len, 1]
        unmapped_features = all_unmapped[:, :, self.ump_id].reshape((cur_batch, self.seq_len, 1))
        # matching slice of the stored initial hidden state: [1, cur_batch, hidden_dim]
        h_0 = self.h_0[:, :cur_batch, :].contiguous().to(unmapped_features.device)

        output, _ = self.RNN(unmapped_features, h_0)
        # output: [cur_batch, seq_len, hidden_dim]
        map_predict = self.activation(self.dense1(output))
        map_predict = self.activation(self.dense2(map_predict))

        map_predict = self.activation(self.dense3(map_predict))
        map_predict = self.MLP_drop3(map_predict)

        map_predict = self.dense4(map_predict)  # shape [cur_batch, seq_len, num_mp_f]

        criterion = nn.MSELoss()
        loss = criterion(map_predict, true_mapped_features)
        return {"predicts": map_predict, "loss": loss}
        
        

In [44]:
# k-fold run of the final architecture (seq_to_seq_model_2): the two lists
# collect, fold by fold, the pair of values returned by val().
kfold_results_1_seq2seq_2, kfold_results_2_seq2seq_2 = [], []

for fold, idx_list in enumerate(zip(kfold_cv_dl_list_1, kfold_cv_dl_list_2)):
    print("current fold: ", fold)

    # this fold's train/validation indices for dataset 1 and dataset 2
    train_idx_1, val_idx_1 = idx_list[0]["train_idx"], idx_list[0]["val_idx"]
    train_idx_2, val_idx_2 = idx_list[1]["train_idx"], idx_list[1]["val_idx"]

    train_sampler_1, val_sampler_1 = SubsetRandomSampler(train_idx_1), SubsetRandomSampler(val_idx_1)
    train_sampler_2, val_sampler_2 = SubsetRandomSampler(train_idx_2), SubsetRandomSampler(val_idx_2)

    train_loader_1 = DataLoader(dataset_1_all, sampler=train_sampler_1, batch_size=batch_size)
    val_loader_1 = DataLoader(dataset_1_all, sampler=val_sampler_1, batch_size=batch_size)
    train_loader_2 = DataLoader(dataset_2_all, sampler=train_sampler_2, batch_size=batch_size)
    val_loader_2 = DataLoader(dataset_2_all, sampler=val_sampler_2, batch_size=batch_size)

    trained_models_list_1, trained_models_list_2 = [], []

    # Train one model per unmapped feature: all of dataset 1 first, then all of
    # dataset 2, so models are created and trained in the same order as before.
    for n_features, loader, trained in (
        (num_ump_f_1, train_loader_1, trained_models_list_1),
        (num_ump_f_2, train_loader_2, trained_models_list_2),
    ):
        for ump_id in range(n_features):
            model = seq_to_seq_model_2(ump_id, batch_size, input_dim, hidden_dim, seq_len, num_mp_f)
            optimizer = optim.Adam(model.parameters(), lr=lr)
            for epoch in range(num_epochs):
                train(loader, model, optimizer)
            trained.append(model)

    f1_fromx1, f1_fromx2 = val(
        val_loader_1, val_loader_2,
        num_ump_f_1, num_ump_f_2, num_mp_f, seq_len,
        trained_models_list_1, trained_models_list_2, p,
    )
    kfold_results_1_seq2seq_2.append(f1_fromx1)
    kfold_results_2_seq2seq_2.append(f1_fromx2)
    

current fold:  0

 ------- Matching from X1_train  --------- 

{R1: [C12], R2: [C10], R3: [C3], R4: [C14], R5: [C1], R6: [C5], R7: [C7], R8: [C11], R9: [C2], R10: [C9], R11: [C4], R12: [C8], R13: [C15], R14: [C6], R15: [C13]}

 ------- Matching from X2_train  --------- 

{C1: [R12], C2: [R10], C3: [R3], C4: [R14], C5: [R1], C6: [R5], C7: [R7], C8: [R11], C9: [R2], C10: [R9], C11: [R4], C12: [R8], C13: [R15], C14: [R6], C15: [R13]}
current fold:  1

 ------- Matching from X1_train  --------- 

{R1: [C9], R2: [C10], R3: [C14], R4: [C5], R5: [C4], R6: [C7], R7: [C8], R8: [C1], R9: [C3], R10: [C12], R11: [C11], R12: [C15], R13: [C13], R14: [C2], R15: [C6]}

 ------- Matching from X2_train  --------- 

{C1: [R9], C2: [R10], C3: [R14], C4: [R5], C5: [R4], C6: [R7], C7: [R8], C8: [R1], C9: [R3], C10: [R12], C11: [R11], C12: [R15], C13: [R13], C14: [R2], C15: [R6]}
current fold:  2

 ------- Matching from X1_train  --------- 

{R1: [C5], R2: [C15], R3: [C2], R4: [C12], R5: [C14], R6: [C7], R7:

In [45]:
# Mean over the folds of the first value returned by val() for seq_to_seq_model_2;
# the bare name on the last line displays it as the cell output.
avg_f1_fromx1 = np.mean(kfold_results_1_seq2seq_2)
avg_f1_fromx1

0.10666666666666666

In [46]:
# Mean over the folds of the second value returned by val() for seq_to_seq_model_2;
# the bare name on the last line displays it as the cell output.
avg_f1_fromx2 = np.mean(kfold_results_2_seq2seq_2)
avg_f1_fromx2

0.10666666666666666