# **FEATURE ENGINEERING AND DATA PREPARATION** 

In [1]:
import warnings
warnings.simplefilter(action = 'ignore')
import numpy as np 
import pandas as pd 
import gc

In [2]:
from scipy.sparse import load_npz
train = load_npz("./Data/train_multi_inputs_values.sparse.npz")
train_index = np.load("./Data/train_multi_inputs_idxcol.npz", allow_pickle = True)
targets = load_npz("./Data/train_multi_targets_values.sparse.npz")
test = load_npz("./Data/test_multi_inputs_values.sparse.npz")
test_index = np.load("./Data/test_multi_inputs_idxcol.npz", allow_pickle = True)

In [3]:
meta_df = pd.read_csv('/kaggle/input/open-problems-multimodal/metadata.csv', index_col = 'cell_id')
meta_df = meta_df[meta_df.technology == 'multiome'].drop(columns = ['cell_type','technology'])
meta_df['gender'] = meta_df.donor.apply(lambda x : 1 if x != 13176 else 0)
meta_df

Unnamed: 0_level_0,day,donor,gender
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
458c2ae2c9b1,2,27678,1
01a0659b0710,2,27678,1
028a8bc3f2ba,2,27678,1
7ec0ca8bb863,2,27678,1
caa0b0022cdc,2,27678,1
...,...,...,...
96a60b026659,10,31800,1
d493e546991e,10,31800,1
05666c99aa48,10,31800,1
121f946642b5,10,31800,1


In [4]:
from sklearn.decomposition import TruncatedSVD as tSVD
row_indices = np.arange(train.shape[0])
np.random.seed(42)
np.random.shuffle(row_indices)
row_indices = row_indices[:50000] # Selecting Only 50000 For Our Analysis
def transform(data, index, dimensions, filter = False):
    global meta_df, row_indices
    svd = tSVD(n_components = dimensions, random_state = 1)
    data = svd.fit_transform(data)
    print(f'- Reduced To {dimensions} Dimensions') 

    meta_data = meta_df.reindex(index)
    data = np.column_stack((data, meta_data))
    print('- Important Columns Added')
    
    if filter : data = data[row_indices]
    print('- 50000 Rows Selected Randomly')
    print(f"- Final Shape Of Data is {data.shape}")
    print("-"* 65)
    print()
    return data

In [5]:
DIMENSION = 15

print("-"*22, 'For Train Data', "-"*22)
train = transform(train, list(train_index['index']), DIMENSION, True)

print("-"*22, 'For Target Data', "-"*22)
targets = targets[row_indices]
print('- 50000 Rows Selected Randomly')
targets = targets.toarray()
print(f"- Final Shape Of Data is {targets.shape}")
print("-"* 65)
print()

test_index = list(test_index['index'])
meta_test = meta_df.reindex(test_index)

print("-"*22, 'For Donor Test Data', "-"*22)
donor_test_index = meta_test[(meta_test.day != 10) & (meta_test.donor == 27678)].index
indices = [i for i in range(len(donor_test_index)) if donor_test_index[i] in test_index]
donor_test = test[indices , : ]
donor_test = transform(donor_test, donor_test_index, DIMENSION)

print("-"*22, 'For Day Test Data', "-"*22)
day_test_index = meta_test[meta_test.day == 10].index
indices = [i for i in range(len(day_test_index)) if day_test_index[i] in test_index]
day_test = test[indices , : ]
day_test = transform(day_test, day_test_index, DIMENSION)

del train_index, test, test_index, donor_test_index, day_test_index, meta_df

gc.collect()

---------------------- For Train Data ----------------------
- Reduced To 15 Dimensions
- Important Columns Added
- 50000 Rows Selected Randomly
- Final Shape Of Data is (50000, 18)
-----------------------------------------------------------------

---------------------- For Target Data ----------------------
- 50000 Rows Selected Randomly
- Final Shape Of Data is (50000, 23418)
-----------------------------------------------------------------

---------------------- For Donor Test Data ----------------------
- Reduced To 15 Dimensions
- Important Columns Added
- 50000 Rows Selected Randomly
- Final Shape Of Data is (24162, 18)
-----------------------------------------------------------------

---------------------- For Day Test Data ----------------------
- Reduced To 15 Dimensions
- Important Columns Added
- 50000 Rows Selected Randomly
- Final Shape Of Data is (31773, 18)
-----------------------------------------------------------------



22

In [6]:
# Saving The Clean Data
np.save('train', train)
np.save('targets', targets)
np.save('donor_test', donor_test)
np.save('day_test', day_test)

# **MODELLING**

In [7]:
# # Trun On The GPU And Run This (Not required if running locally)
# import warnings
# warnings.simplefilter(action = 'ignore')
# import numpy as np 
# import pandas as pd 
# import gc

# # Load The Data
# train = np.load('/kaggle/input/del-later/train.npy', allow_pickle = True)
# targets = np.load('/kaggle/input/del-later/targets.npy', allow_pickle = True)
# donor_test = np.load('/kaggle/input/del-later/donor_test.npy', allow_pickle = True)
# day_test = np.load('/kaggle/input/del-later/day_test.npy', allow_pickle = True)
# print(train.shape, targets.shape)

In [8]:
def correlation_score(y_true, y_hat):
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_hat) == pd.DataFrame: y_hat = y_hat.values
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_hat[i])[1, 0]
    return corrsum / len(y_true)

In [9]:
from keras.models import Sequential
from keras.layers import Dense

net = Sequential()
net.add(Dense(50, input_dim = train.shape[1], activation='relu'))
net.add(Dense(50, activation='relu'))
net.add(Dense(targets.shape[1], activation='linear'))
net.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

EPOCHS = 20
BATCHES = 100

In [10]:
# KFold Grouped By Donor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from colorama import Fore, Style


y_cols = targets.shape[1]
groups = train[ : , -2]

kf = GroupKFold(n_splits = 3)
score_list = []
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups = groups)):
    print(Fore.CYAN + f'Training Fold {fold} with Donors {list(map(int, set(train[idx_tr][: , -2])))} for Donor {list(map(int, set(train[idx_va][: , -2])))}'+ Style.RESET_ALL)
    X_tr = train[idx_tr]
    y_tr = targets[:,: y_cols][idx_tr]
    X_va = train[idx_va]
    y_va = targets[:,: y_cols][idx_va]
   
    net.fit(X_tr, y_tr.copy(), epochs = EPOCHS, batch_size = BATCHES, validation_data = (X_va, y_va))
    y_hat = net.predict(X_va)
        
    del X_tr, y_tr
    gc.collect()

    # We validate the model (with mse and correlation)
    mse = mean_squared_error(y_va, y_hat)
    corrscore = correlation_score(y_va, y_hat)

    del X_va, y_va
    gc.collect()

    print(Fore.CYAN + f"Fold {fold} {train.shape[1]:4}: mse = {mse:.4f}, corr =  {corrscore:.4f}"+ Style.RESET_ALL)
    print()
    score_list.append((mse, corrscore))

# Averaging Scores From All The Folds
if len(score_list) > 1:
    result_df = pd.DataFrame(score_list, columns=['mse', 'corrscore'])
    print(Fore.YELLOW + f"Average MSE : {result_df.mse.mean():.4f}, Average Correlation : {result_df.corrscore.mean():.4f}"+ Style.RESET_ALL)


[36mTraining Fold 0 with Donors [13176, 32606] for Donor [31800][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[36mFold 0   18: mse = 2.1812, corr =  0.6334[0m

[36mTraining Fold 1 with Donors [31800, 32606] for Donor [13176][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[36mFold 1   18: mse = 2.2892, corr =  0.6531[0m

[36mTraining Fold 2 with Donors [31800, 13176] for Donor [32606][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch

In [11]:
# KFold Grouped By Day
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

y_cols = targets.shape[1]
groups = train[ : , -3]

kf = GroupKFold(n_splits = 4)
score_list = []
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups = groups)):
    print(Fore.CYAN + f'Training Fold {fold} with Day {list(map(int, set(train[idx_tr][: , -3])))} for Day {list(map(int, set(train[idx_va][: , -3])))}'+ Style.RESET_ALL)
    X_tr = train[idx_tr]
    y_tr = targets[:,: y_cols][idx_tr]
    X_va = train[idx_va]
    y_va = targets[:,: y_cols][idx_va]
   
    net.fit(X_tr, y_tr.copy(), epochs = EPOCHS, batch_size = BATCHES, validation_data = (X_va, y_va))
    y_hat = net.predict(X_va)
        
    del X_tr, y_tr
    gc.collect()

    # We validate the model (with mse and correlation)
    mse = mean_squared_error(y_va, y_hat)
    corrscore = correlation_score(y_va, y_hat)

    del X_va, y_va
    gc.collect()

    print(Fore.CYAN + f"Fold {fold} {train.shape[1]:4}: mse = {mse:.4f}, corr =  {corrscore:.4f}"+ Style.RESET_ALL)
    print()
    score_list.append((mse, corrscore))

# Averaging Scores From All The Folds
if len(score_list) > 1:
    result_df = pd.DataFrame(score_list, columns=['mse', 'corrscore'])
    print(Fore.YELLOW + f"Average MSE : {result_df.mse.mean():.4f}, Average Correlation : {result_df.corrscore.mean():.4f}"+ Style.RESET_ALL)


[36mTraining Fold 0 with Day [2, 3, 7] for Day [4][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[36mFold 0   18: mse = 2.1479, corr =  0.6649[0m

[36mTraining Fold 1 with Day [2, 4, 7] for Day [3][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[36mFold 1   18: mse = 2.1262, corr =  0.6481[0m

[36mTraining Fold 2 with Day [3, 4, 7] for Day [2][0m
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[36mFold 2   18: mse = 2.1087, cor

In [12]:
# Retraining On Whole Data
net.fit(train, targets, epochs = EPOCHS, batch_size = BATCHES)
print(mean_squared_error(targets, net.predict(train)), correlation_score(targets, net.predict(train)))
print('Donor Test Prediction :/n', net.predict(donor_test))
print('Day Test Prediction :/n', net.predict(day_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
2.1087084 0.6460690431040537
Donor Test Prediction :/n [[0.60391665 0.3637505  0.24038602 ... 1.2372355  1.3559195  2.1369023 ]
 [0.5108135  0.34656546 0.30472034 ... 1.1135321  1.2038752  2.0147128 ]
 [0.5569416  0.35507983 0.27284575 ... 1.1748211  1.2792058  2.075252  ]
 ...
 [0.5384964  0.35167518 0.28559142 ... 1.1503135  1.2490835  2.0510442 ]
 [0.37314346 0.32115424 0.39985037 ... 0.93061405 0.9790502  1.8340335 ]
 [0.68102175 0.37798262 0.18710636 ... 1.3396828  1.4818376  2.238096  ]]
Day Test Prediction :/n [[0.3855572  0.32344556 0.3912725  ... 0.9471078  0.9993227  1.8503253 ]
 [0.36262494 0.3192127  0.40711868 ... 0.9166384  0.9618727  1.8202288 ]
 [0.36262494 0.3192127  0.40711868 ... 0.9166384  0.9618727  1.8202288 ]
 ...
 [0.472293   0.3394553

----------------