## Regressors

In [1]:
import torch
import pandas as pd
import numpy as np

from Liu2019_data_loader import train_test_loader, encode_data
from CNNx1_regressor import CNN_regressor
from DNN_regressor import DNN_regressor
from CNNx2_regressor import CNNx2_regressor

In [2]:
## load data
# Read the enrichment table and keep only rows whose 'enriched' label is determined.
traindat = pd.read_csv('../Benchmarks/Liu2019_enrichment/cdr3s.table_Feb10.csv')
keep_mask = traindat['enriched'] != 'not_determined'
dat = traindat.loc[keep_mask]

# Inputs come from the 'cdr3' column; the regression target is 'log10(R3/R2)'.
x = dat['cdr3'].values
y_reg = dat['log10(R3/R2)'].values

# Standardise the target (zero mean, unit std); the statistics stay available as variables.
y_reg_mean, y_reg_std = np.mean(y_reg), np.std(y_reg)
y_reg_new = (y_reg - y_reg_mean) / y_reg_std

In [3]:
# Encode every entry of x with encode_data (gapped, seq_len 18) and stack into one array.
encoded = [encode_data(seq, gapped=True, seq_len=18) for seq in x]
X_dat = np.array(encoded)

# Build the train/test loaders from the encoded inputs and the standardised targets.
train_loader, test_loader = train_test_loader(
    X_dat,
    np.array(y_reg_new),
    batch_size=100,
)

In [4]:
# Show the encoded array's shape together with the mean and std used to scale y_reg.
X_dat.shape, y_reg_mean, y_reg_std

((54402, 18, 21), -0.6794162469900554, 0.6218747369373429)

### 1 DNN model

In [None]:
# Name of the single DNN regressor and the learning rates it is trained with.
model_names = ['Seq_32_32_reg']
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]

# Base configuration handed to the DNN regressor; the training loop overrides
# 'learning_rate' and 'model_name' for every run.
para_dict = dict(
    seq_len=18,
    batch_size=100,
    model_name='Seq_32_32_reg',
    epoch=70,
    learning_rate=0.01,
    fc_hidden_dim=32,
    dropout_rate=0.3,
)

In [None]:
# Train the DNN regressor once per learning rate and record train/test R2 and MSE.
# Fixes relative to the previous version of this cell:
#   * it read `para_dict_DNN`, which is never defined; the config cell defines `para_dict`;
#   * the *_store lists were used before being created, so the cell failed on a fresh kernel.
train_r2_store = []
train_mse_store = []
test_r2_store = []
test_mse_store = []

for model_name in model_names:
    for lr in lrs:
        # Fresh dict per run so the base configuration is never mutated in place.
        params = dict(
            para_dict,
            learning_rate=lr,
            model_name='-'.join([model_name, str(lr)]),
        )
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = DNN_regressor(params)
        model.fit(train_loader)

        # NOTE(review): labels are gathered in a second pass over the loader; this assumes
        # the loader yields batches in the same order as during predict() - confirm.
        print('-- Train set results --')
        output = model.predict(train_loader)
        labels = np.concatenate([batch_y for _, batch_y in train_loader])
        r2, mse = model.evaluate(output, labels)
        train_r2_store.append(r2)
        train_mse_store.append(mse)

        print('-- Test set results --')
        output = model.predict(test_loader)
        labels = np.concatenate([batch_y for _, batch_y in test_loader])
        r2, mse = model.evaluate(output, labels)
        test_r2_store.append(r2)
        test_mse_store.append(mse)

In [None]:
# Re-score the most recently trained model on the test loader.
output = model.predict(test_loader)
labels = np.concatenate([batch_y for _, batch_y in test_loader])
# evaluate() is unpacked as (r2, mse) everywhere else in this notebook; binding the whole
# return value to `r2` would leave a tuple under a scalar-looking name.
r2, mse = model.evaluate(output, labels)

In [None]:
# Display the repr of the most recently trained model object.
model

### 3 CNN models

In [5]:
# Three CNN regressor variants; n_filters[k] and filter_sizes[k] belong to model_names[k].
model_names = ['Seq_32x1_16', 'Seq_64x1_16_reg', 'Seq_32x1_16_filt3_reg']
n_filters = [32, 64, 32]
filter_sizes = [5, 5, 3]

# Learning rates tried for every variant.
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]

# Base configuration; the training loop overrides 'n_filter', 'filter_size',
# 'learning_rate' and adds 'model_name' for each run.
para_dict_CNN = dict(
    batch_size=100,
    seq_len=18,
    epoch=70,
    learning_rate=0.01,
    step_size=20,
    n_filter=32,
    filter_size=5,
    fc_hidden_dim=16,
    stride=2,
    dropout_rate=0.3,
)

In [7]:
# Metric accumulators: one entry per (model, learning rate) run, in loop order.
train_r2_store, train_mse_store = [], []
test_r2_store, test_mse_store = [], []

for idx, base_name in enumerate(model_names):
    # Alias, not a copy: the shared config dict is updated in place for each run.
    params = para_dict_CNN
    params.update(n_filter=n_filters[idx], filter_size=filter_sizes[idx])
    for lr in lrs:
        params.update(learning_rate=lr, model_name='-'.join([base_name, str(lr)]))
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = CNN_regressor(params)
        model.fit(train_loader)

        # Score on the train loader first, then the test loader, storing R2 and MSE.
        splits = (
            ('-- Train set results --', train_loader, train_r2_store, train_mse_store),
            ('-- Test set results --', test_loader, test_r2_store, test_mse_store),
        )
        for header, loader, r2_store, mse_store in splits:
            print(header)
            output = model.predict(loader)
            labels = np.concatenate([batch_y for _, batch_y in loader])
            r2, mse = model.evaluate(output, labels)
            r2_store.append(r2)
            mse_store.append(mse)

----------------------------
start training model Seq_32x1_16-0.005
-- Train set results --
R2 score = 0.543
MSE = 0.460
-- Test set results --
R2 score = 0.497
MSE = 0.490
----------------------------
start training model Seq_32x1_16-0.0005
Epoch: 10: Loss=226.463
R2 score = 0.484
MSE = 0.520
Epoch: 20: Loss=215.932
R2 score = 0.507
MSE = 0.496
Epoch: 30: Loss=209.950
R2 score = 0.521
MSE = 0.482
Epoch: 40: Loss=206.382
R2 score = 0.529
MSE = 0.474
Epoch: 50: Loss=203.412
R2 score = 0.536
MSE = 0.467
Epoch: 60: Loss=201.464
R2 score = 0.540
MSE = 0.463
Epoch: 70: Loss=199.618
R2 score = 0.545
MSE = 0.458
-- Train set results --
R2 score = 0.545
MSE = 0.458
-- Test set results --
R2 score = 0.485
MSE = 0.502
----------------------------
start training model Seq_32x1_16-0.001
Epoch: 10: Loss=225.806
R2 score = 0.485
MSE = 0.518
Epoch: 20: Loss=215.717
R2 score = 0.508
MSE = 0.495
Epoch: 30: Loss=209.992
R2 score = 0.521
MSE = 0.482
Epoch: 40: Loss=204.760
R2 score = 0.533
MSE = 0.470
Ep

In [21]:
# Collect the CNN regressor metrics into one table and write it to CSV.
# `pd` comes from the notebook's first import cell; the unused `combinations` import
# and the redundant pandas re-import were dropped.
from itertools import product

model_names = ['Seq_32x1_16', 'Seq_64x1_16_reg', 'Seq_32x1_16_filt3_reg']
# product() iterates model-major / learning-rate-minor, the same order in which the
# training loop appended to the *_store lists, so names and metrics line up row by row.
names = ['-'.join([model_name, str(lr)]) for model_name, lr in product(model_names, lrs)]
result_df = pd.DataFrame({'model_name': names,
                          'train_r2': train_r2_store, 'train_mse': train_mse_store,
                          'test_r2': test_r2_store, 'test_mse': test_mse_store})
# `index` is documented as a bool; False states the intent (no index column) explicitly.
result_df.to_csv('regressor_log_CNN.csv', index=False)

### 2 CNNx2 models

In [22]:
# Two CNNx2 regressor variants; position k of each list below belongs to model_names[k].
model_names = ['Seq_32x2_16_reg', 'Seq_embed_32x1_16_reg']
n_filter1s = [32, 8]
n_filter2s = [64, 32]
filter_size1s = [5, 1]
filter_size2s = [5, 5]

# Learning rates tried for every variant.
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]

# Base configuration; the training loop overrides the filter settings,
# 'learning_rate' and 'model_name' for each run.
para_dict_CNNx2 = dict(
    seq_len=18,
    batch_size=100,
    model_name='Seq_32x2_16_reg',
    epoch=70,
    learning_rate=0.001,
    step_size=5,
    n_filter1=32,
    n_filter2=64,
    filter_size1=5,
    filter_size2=5,
    fc_hidden_dim=16,
    dropout_rate=0.5,
    stride=2,
)

In [None]:
# Metric accumulators: one entry per (model, learning rate) run, in loop order.
train_r2_store, train_mse_store = [], []
test_r2_store, test_mse_store = [], []

for idx, base_name in enumerate(model_names):
    # Alias, not a copy: the shared config dict is updated in place for each run.
    params = para_dict_CNNx2
    params.update(
        n_filter1=n_filter1s[idx],
        filter_size1=filter_size1s[idx],
        n_filter2=n_filter2s[idx],
        filter_size2=filter_size2s[idx],
    )
    for lr in lrs:
        params.update(learning_rate=lr, model_name='-'.join([base_name, str(lr)]))
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = CNNx2_regressor(params)
        model.fit(train_loader)

        # Score on the train loader first, then the test loader, storing R2 and MSE.
        splits = (
            ('-- Train set results --', train_loader, train_r2_store, train_mse_store),
            ('-- Test set results --', test_loader, test_r2_store, test_mse_store),
        )
        for header, loader, r2_store, mse_store in splits:
            print(header)
            output = model.predict(loader)
            labels = np.concatenate([batch_y for _, batch_y in loader])
            r2, mse = model.evaluate(output, labels)
            r2_store.append(r2)
            mse_store.append(mse)

# Extra CNNx2 fit outside the sweep. This previously passed `para_dict`, which is the DNN
# configuration and has none of the n_filter1/n_filter2/filter_size1/filter_size2 keys
# that the CNNx2 config carries; use the CNNx2 configuration instead.
# NOTE(review): para_dict_CNNx2 is updated in place by the sweep above, so at this point
# it holds the last configuration that was trained - confirm this extra fit is still wanted.
model = CNNx2_regressor(para_dict_CNNx2)
model.fit(train_loader)

----------------------------
start training model Seq_32x2_16_reg-0.005
Epoch: 10: Loss=240.349
R2 score = 0.452
MSE = 0.551
Epoch: 20: Loss=219.929
R2 score = 0.498
MSE = 0.505


## Classifiers

In [None]:
import torch
import pandas as pd
import numpy as np

from Liu2019_data_loader import train_test_loader, encode_data
from CNNx1_classifier import CNN_classifier
from DNN_classifier import DNN_classifier
from CNNx2_classifier import CNNx2_classfier

In [None]:
## load data
# NOTE(review): this path differs from the one used in the regressor section
# ('../Benchmarks/Liu2019_enrichment/cdr3s.table_Feb10.csv') - confirm it is the same table.
traindat = pd.read_csv('cdr3s.table 2.csv')
# exclude not_determined
dat = traindat.loc[traindat['enriched'] != 'not_determined']
x = dat['cdr3'].values
# Binary target: 1 where 'enriched' is 'positive', 0 for every other remaining label.
y_class = [int(xx == 'positive') for xx in dat['enriched'].values]

# Rebuild the encoded inputs and the loaders with the class labels. Without this step
# y_class is never used and the classifiers below would be fitted on whatever
# train_loader/test_loader were built earlier (the standardised regression targets).
# Same encoding settings as the regressor section.
# NOTE(review): assumes train_test_loader accepts integer labels - confirm the dtype
# the classifiers expect.
X_dat = np.array([encode_data(item, gapped = True, seq_len = 18) for item in x])
train_loader, test_loader = train_test_loader(X_dat, np.array(y_class), batch_size=100)

### 1 DNN model

In [None]:
# Name of the single DNN classifier and the learning rates it is trained with.
model_names = ['Seq_32_32_class']
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]

# Base configuration handed to the DNN classifier; the training loop overrides
# 'learning_rate' and 'model_name' for every run.
para_dict = dict(
    seq_len=18,
    batch_size=100,
    model_name='Seq_32_32_class',
    epoch=70,
    learning_rate=0.01,
    fc_hidden_dim=32,
    dropout_rate=0.3,
)

In [None]:
# Train the DNN classifier once per learning rate and record train/test accuracy and MCC.
# Fix: the loop read `para_dict_DNN`, which is never defined; the config cell defines
# `para_dict`.
train_acc_store = []
train_mcc_store = []
test_acc_store = []
test_mcc_store = []

for model_name in model_names:
    for lr in lrs:
        # Fresh dict per run so the base configuration is never mutated in place.
        params = dict(
            para_dict,
            learning_rate=lr,
            model_name='-'.join([model_name, str(lr)]),
        )
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = DNN_classifier(params)
        model.fit(train_loader)

        # evaluate() is unpacked as (mat, acc, mcc); only acc and mcc are stored.
        print('-- Train set results --')
        output = model.predict(train_loader)
        labels = np.concatenate([batch_y for _, batch_y in train_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        train_acc_store.append(acc)
        train_mcc_store.append(mcc)

        print('-- Test set results --')
        output = model.predict(test_loader)
        labels = np.concatenate([batch_y for _, batch_y in test_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        test_acc_store.append(acc)
        test_mcc_store.append(mcc)

### 3 CNN models

In [5]:
# Three CNN classifier variants; n_filters[k] and filter_sizes[k] belong to model_names[k].
model_names = ['Seq_32x1_16_class', 'Seq_64x1_16_class', 'Seq_32x1_16_filt3_class']
n_filters = [32, 64, 32]
filter_sizes = [5, 5, 3]

# Learning rates tried for every variant.
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]

# Base configuration; the training loop overrides 'n_filter', 'filter_size',
# 'learning_rate' and adds 'model_name' for each run.
para_dict_CNN = dict(
    batch_size=100,
    seq_len=18,
    epoch=70,
    learning_rate=0.01,
    step_size=20,
    n_filter=32,
    filter_size=5,
    fc_hidden_dim=16,
    stride=2,
    dropout_rate=0.3,
)

In [None]:

# Train each CNN classifier variant at every learning rate and record accuracy and MCC.
# Fix: the stores are reset here, as the regressor sweeps do per model family. Previously
# this cell appended to whatever lists an earlier cell had left behind, or raised
# NameError if none had been created.
train_acc_store = []
train_mcc_store = []
test_acc_store = []
test_mcc_store = []

for i in range(len(model_names)):
    # Alias, not a copy: the shared config dict is updated in place for each run.
    params = para_dict_CNN
    params['n_filter'] = n_filters[i]
    params['filter_size'] = filter_sizes[i]
    for lr in lrs:
        params['learning_rate'] = lr
        params['model_name'] = '-'.join([model_names[i], str(lr)])
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = CNN_classifier(params)
        model.fit(train_loader)

        # evaluate() is unpacked as (mat, acc, mcc); only acc and mcc are stored.
        print('-- Train set results --')
        output = model.predict(train_loader)
        labels = np.concatenate([batch_y for _, batch_y in train_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        train_acc_store.append(acc)
        train_mcc_store.append(mcc)

        print('-- Test set results --')
        output = model.predict(test_loader)
        labels = np.concatenate([batch_y for _, batch_y in test_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        test_acc_store.append(acc)
        test_mcc_store.append(mcc)

----------------------------
start training model Seq_32x1_16-0.005
-- Train set results --
R2 score = 0.543
MSE = 0.460
-- Test set results --
R2 score = 0.497
MSE = 0.490
----------------------------
start training model Seq_32x1_16-0.0005
Epoch: 10: Loss=226.463
R2 score = 0.484
MSE = 0.520
Epoch: 20: Loss=215.932
R2 score = 0.507
MSE = 0.496
Epoch: 30: Loss=209.950
R2 score = 0.521
MSE = 0.482
Epoch: 40: Loss=206.382
R2 score = 0.529
MSE = 0.474
Epoch: 50: Loss=203.412
R2 score = 0.536
MSE = 0.467
Epoch: 60: Loss=201.464
R2 score = 0.540
MSE = 0.463
Epoch: 70: Loss=199.618
R2 score = 0.545
MSE = 0.458
-- Train set results --
R2 score = 0.545
MSE = 0.458
-- Test set results --
R2 score = 0.485
MSE = 0.502
----------------------------
start training model Seq_32x1_16-0.001
Epoch: 10: Loss=225.806
R2 score = 0.485
MSE = 0.518
Epoch: 20: Loss=215.717
R2 score = 0.508
MSE = 0.495
Epoch: 30: Loss=209.992
R2 score = 0.521
MSE = 0.482
Epoch: 40: Loss=204.760
R2 score = 0.533
MSE = 0.470
Ep

### 2 CNNx2 models

In [None]:
# Two CNNx2 classifier variants; position k of each list below belongs to model_names[k].
model_names = ['Seq_32x2_16_class', 'Seq_embed_32x1_16_class']
n_filter1s = [32, 8]
n_filter2s = [64, 32]
filter_size1s = [5,1]
filter_size2s = [5,5]
lrs = [0.005, 0.0005, 0.001, 5e-5, 0.0001]
# Base configuration; the training loop overrides the filter settings, 'learning_rate'
# and 'model_name' for each run.
# Fix: the default 'model_name' was the copy-pasted regressor name 'Seq_32x2_16_reg';
# it now matches the first classifier variant.
para_dict_CNNx2 = {'seq_len':18,
              'batch_size':100,
              'model_name':'Seq_32x2_16_class',
              'epoch':70,
              'learning_rate':0.001,
              'step_size':5,
              'n_filter1':32,
              'n_filter2':64,
              'filter_size1':5,
              'filter_size2':5,
              'fc_hidden_dim':16,
              'dropout_rate':0.5,
              'stride':2}

# Train each CNNx2 classifier variant at every learning rate and record accuracy and MCC.
# Fixes relative to the previous version of this loop:
#   * it instantiated CNNx2_regressor, whose evaluate() is unpacked as (r2, mse) in the
#     regressor section, while this loop unpacks (mat, acc, mcc); the classifier imported
#     at the top of this section (spelled `CNNx2_classfier` in that import) is used instead;
#   * the stores are reset so results from earlier model families are not mixed in.
train_acc_store = []
train_mcc_store = []
test_acc_store = []
test_mcc_store = []

for i in range(len(model_names)):
    # Alias, not a copy: the shared config dict is updated in place for each run.
    params = para_dict_CNNx2
    params['n_filter1'] = n_filter1s[i]
    params['filter_size1'] = filter_size1s[i]
    params['n_filter2'] = n_filter2s[i]
    params['filter_size2'] = filter_size2s[i]
    for lr in lrs:
        params['learning_rate'] = lr
        params['model_name'] = '-'.join([model_names[i], str(lr)])
        print('----------------------------')
        print('start training model %s' % params['model_name'])
        model = CNNx2_classfier(params)
        model.fit(train_loader)

        print('-- Train set results --')
        output = model.predict(train_loader)
        labels = np.concatenate([batch_y for _, batch_y in train_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        train_acc_store.append(acc)
        train_mcc_store.append(mcc)

        print('-- Test set results --')
        output = model.predict(test_loader)
        labels = np.concatenate([batch_y for _, batch_y in test_loader])
        mat, acc, mcc = model.evaluate(output, labels)
        test_acc_store.append(acc)
        test_mcc_store.append(mcc)
