In [1]:
import json
import os
import pickle as pkl
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score, roc_curve

# NOTE(review): this suppresses every warning for the rest of the session, which also
# hides pandas' SettingWithCopyWarning; consider filtering specific categories instead.
warnings.filterwarnings("ignore")

import AIPT.Models.Beshnova2020.CNN
import AIPT.Utils.logging

print("current working directory:")
%pwd

File: /home/ec2-user/SageMaker/.persisted_conda/aipt/lib/python3.6/site-packages/AIPT-0.0.1-py3.6.egg/AIPT/Models/Beshnova2020/CNN.py 
 Last modified: 2020-10-15 00:03:59 UTC
current working directory:


'/home/ec2-user/SageMaker/antibody-in-pytorch/AIPT/Models/Beshnova2020'

In [2]:
import AIPT.Utils.Dev.dev_utils as dev_utils

# Root of the local antibody-in-pytorch checkout (absolute, machine-specific path).
aipt_path = '/home/ec2-user/SageMaker/antibody-in-pytorch/'
# Helper bound to that checkout; it is called later as aipt_reload(<module>).
# NOTE(review): presumably re-imports the module from the checkout — confirm in dev_utils.
aipt_reload = dev_utils.get_aipt_reload_fn(aipt_path)

In [3]:
# Set up paths.
# The package directory is derived from the checkout root (aipt_path) rather than
# repeating the absolute path, so there is a single place to edit when the checkout moves.
aipt_dir = os.path.join(aipt_path, 'AIPT')
# Sanity check: list the package contents. os.listdir returns entries in arbitrary
# order, so sort them to keep the printed output stable between runs.
print("\t".join(sorted(os.listdir(aipt_dir))))

entry_point.py	__main__.py	__pycache__	Benchmarks	.ipynb_checkpoints	__init__.py	Utils	Models	.gitignore


## Data Processing

In [4]:
# TODO: PCA-based sequence encoding (the paper uses 15-dim PCA features)

In [5]:
from AIPT.Benchmarks.OAS_dataset import OAS_data_loader as oas

In [6]:
# Directory holding the per-study sequence files, inside the AIPT package tree.
seq_dir = os.path.join(aipt_dir, "Benchmarks/OAS_dataset/data/seq_db")

# NOTE(review): relative path — it is resolved against the kernel's current working directory.
index_path = "OAS_index.txt"
input_seq_type = "CDR3"
output_field = "BType"
cell_types = [
    "Naive-B-Cells",
    "Memory-B-Cells",
]  # TODO: confusing - these are BType values and do not refer to "Species"

# NOTE(review): train_loader is rebound further down by get_data_loader(train_data);
# train_eval_loader, test_eval_loader and seq_len are not used again in the visible cells.
train_loader, train_eval_loader, test_eval_loader, seq_len = oas.OAS_data_loader(
    index_path,
    output_field,
    input_seq_type,
    cell_types,
    seq_dir=seq_dir,
    gapped=True,
    pad=False,
    batch_size=20,
    model_name="Beshnova2020",
)

Training data                                            file_name Longitudinal  Chain  \
0  Ellebedy_2016_Day-0_memory-B-cell_IGHA_memory-...        Day-0  Heavy   
1  Ellebedy_2016_Day-0_memory-B-cell_IGHA_memory-...        Day-0  Heavy   
2  Ellebedy_2016_Day-0_memory-B-cell_IGHG_memory-...        Day-0  Heavy   

                    Author Isotype    Age  Size_igblastn Disease  \
0  Ellebedy et al., (2016)    IGHA  18-49          84333    None   
1  Ellebedy et al., (2016)    IGHA  18-49          59091    None   
2  Ellebedy et al., (2016)    IGHG  18-49          52891    None   

                                      Link BSource           BType   Size  \
0  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  61336   
1  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  43137   
2  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  41221   

  Species Vaccine  Subject  valid_entry_num  
0   human    None  Donor-6             2019  
1   hum

In [7]:
import pandas as pd
# Tab-separated index table; its 'file_name' and 'BType' columns are read row by row
# when the per-file sequence tables are loaded.
index_df = pd.read_csv(index_path, sep="\t")

# NOTE(review): file_names is not referenced again in the visible cells.
file_names = index_df['file_name']

In [8]:

# Build the modelling table: one row per CDR3 sequence of the target length,
# labelled with the B-cell type of the index entry (sequence file) it came from.
cdr3_len = 11  # keep in sync with the model's seq_len

data_dfs = []
for _, index_row in index_df.iterrows():
    file_name = index_row['file_name']
    df = pd.read_csv(os.path.join(seq_dir, f'{file_name}.txt'), sep='\t')
    # Vectorised length filter; a missing CDR3 value yields a NaN length and is dropped.
    has_target_len = df['CDR3_aa'].str.len() == cdr3_len
    # .loc + .assign produce a new frame, so the label is never written into a
    # slice of df (the chained assignment pandas warns about).
    data_df = df.loc[has_target_len, ['CDR3_aa']].assign(BType=index_row['BType'])
    data_dfs.append(data_df)

# Each file's original row index is kept, so index values repeat across files.
data = pd.concat(data_dfs)
data

Unnamed: 0,CDR3_aa,BType
13,ARVEGWVLFDK,Memory-B-Cells
28,TREWGRRTFDV,Memory-B-Cells
29,AHKSVYREVDY,Memory-B-Cells
35,AHRSTNREVDY,Memory-B-Cells
39,AHRSTNREVDY,Memory-B-Cells
...,...,...
1503,ARDRFDYGFDY,Naive-B-Cells
1505,AREANTLGMDV,Naive-B-Cells
1516,AGLRTTDAFDI,Naive-B-Cells
1520,ARQEDTAMVDY,Naive-B-Cells


In [9]:
# Integer class ids follow the position of each type in cell_types.
cell_type_to_ix = dict((cell_type, ix) for ix, cell_type in enumerate(cell_types))
print('cell_type_to_ix:', cell_type_to_ix)

# Translate every row's BType into its class id; an unmapped type raises KeyError.
data['label'] = data.apply(lambda record: cell_type_to_ix[record['BType']], axis=1)
data

cell_type_to_ix: {'Naive-B-Cells': 0, 'Memory-B-Cells': 1}


Unnamed: 0,CDR3_aa,BType,label
13,ARVEGWVLFDK,Memory-B-Cells,1
28,TREWGRRTFDV,Memory-B-Cells,1
29,AHKSVYREVDY,Memory-B-Cells,1
35,AHRSTNREVDY,Memory-B-Cells,1
39,AHRSTNREVDY,Memory-B-Cells,1
...,...,...,...
1503,ARDRFDYGFDY,Naive-B-Cells,0
1505,AREANTLGMDV,Naive-B-Cells,0
1516,AGLRTTDAFDI,Naive-B-Cells,0
1520,ARQEDTAMVDY,Naive-B-Cells,0


In [10]:
from AIPT.Benchmarks.OAS_dataset import OAS_data_loader
from sklearn.model_selection import train_test_split

# train_data = data[['CDR3_aa', 'label']]
# seq_encodings = OAS_data_loader.encode_index(data=train_data['CDR3_aa'])
# btypes = train_data['label'].values
# train_loader = torch.utils.data.DataLoader(list(zip(seq_encodings, btypes)), shuffle=True, batch_size=32)

def get_data_loader(data, batch_size=32):
    """Wrap a labelled sequence table in a shuffling torch DataLoader.

    Args:
        data: DataFrame with a 'CDR3_aa' column (sequences) and a 'label'
            column (class ids).
        batch_size: number of (encoding, label) pairs per batch. The argument
            used to be ignored in favour of a hard-coded 32; it is honoured now,
            and the default is 32 so calls that omit it behave as before.

    Returns:
        torch.utils.data.DataLoader over (encoded sequence, label) pairs,
        created with shuffle=True.
    """
    seq_encodings = OAS_data_loader.encode_index(data=data['CDR3_aa'])
    btypes = data['label'].values
    # A plain list of pairs is indexable and sized, which is all DataLoader needs.
    dataset = list(zip(seq_encodings, btypes))
    return torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Fixed seed so the same rows land in the train and test splits on every run;
# without it the 80/20 split changes each time the cell is executed.
split_seed = 0
train_data, test_data = train_test_split(data, train_size=0.8, random_state=split_seed)
train_loader = get_data_loader(train_data)
test_loader = get_data_loader(test_data)

## Load Model

In [11]:
# Refresh the two modules under development before importing names from them.
# NOTE(review): assumes aipt_reload re-imports the module from source — confirm in dev_utils.
aipt_reload(AIPT.Models.Beshnova2020.CNN)
aipt_reload(AIPT.Utils.logging)
from AIPT.Models.Beshnova2020.CNN import CNN
from AIPT.Utils.logging import today, current_time


File: /home/ec2-user/SageMaker/.persisted_conda/aipt/lib/python3.6/site-packages/AIPT-0.0.1-py3.6.egg/AIPT/Models/Beshnova2020/CNN.py 
 Last modified: 2020-10-15 00:04:49 UTC



In [12]:
import os

# Where TensorBoard event files are written and how this run is labelled.
log_root_dir = '/home/ec2-user/SageMaker/logs/tensorboard'
run_name = 'test'
timezone = 'EST'

# Model / training settings handed to CNN.
para_dict = {
    'seq_len': 11,
    'embedding_dim': 15,  # paper uses dim 15 PCA features
    'epoch': 2000,
    'classes': cell_types,
    'learning_rate': 10**-4,
    'run_name': run_name,
}
# Log directory layout: <root>/<date>/<run_name>/<time>
para_dict['log_dir'] = os.path.join(
    log_root_dir,
    today(tz=timezone),
    run_name,
    current_time(tz=timezone),
)

# Embedding table with 21 entries, each of size embedding_dim.
embedding_fn = nn.Embedding(21, para_dict['embedding_dim'])

model = CNN(para_dict, embedding_fn)
print('LOG DIR:', para_dict['log_dir'])

LOG DIR: /home/ec2-user/SageMaker/logs/tensorboard/2020-10-14/test/19.04.50


## Tensorboard

In [13]:
import subprocess as sp

# Set to False to skip launching a TensorBoard server from the notebook.
start_tensorboard = True

if start_tensorboard:
    # Seconds between TensorBoard re-scans of the log directory. This value used to
    # be assigned but never passed to the command; it is now sent as --reload_interval.
    reload_interval = "15"
    # NOTE(review): stdout/stderr are piped but nothing reads them yet. If TensorBoard
    # writes more than the OS pipe buffer holds it will block; drain the pipes or
    # redirect them to a file / sp.DEVNULL if the output is not needed.
    tensorboard_proc = sp.Popen(
        [
            "tensorboard",
            "--logdir",
            para_dict["log_dir"],
            "--reload_interval",
            reload_interval,
        ],
        universal_newlines=True,
        stdout=sp.PIPE,
        stderr=sp.PIPE,
    )

## Train

In [14]:
# Train the model. test_loader is passed as well; the captured log shows both a train
# and a test loss/confusion-matrix report every 5 epochs.
outputs = model.fit(train_loader, test_loader=test_loader)
# print('train outputs', outputs)
# Leaving the model as the last expression displays its layer summary.
model

fit called
No saved model found.
Epoch: 5: Train Loss=27.690
Test: 
[[   0  238]
 [   0 1597]]
Accuracy = 0.870 ,MCC = 0.000

Epoch: 5: Test Loss=7.192
Test: 
[[  0  64]
 [  0 395]]
Accuracy = 0.861 ,MCC = 0.000


Epoch: 10: Train Loss=22.778
Test: 
[[   0  238]
 [   0 1597]]
Accuracy = 0.870 ,MCC = 0.000

Epoch: 10: Test Loss=6.200
Test: 
[[  0  64]
 [  0 395]]
Accuracy = 0.861 ,MCC = 0.000


Epoch: 15: Train Loss=22.408
Test: 
[[   0  238]
 [   0 1597]]
Accuracy = 0.870 ,MCC = 0.000

Epoch: 15: Test Loss=6.050
Test: 
[[  0  64]
 [  0 395]]
Accuracy = 0.861 ,MCC = 0.000


Epoch: 20: Train Loss=22.195
Test: 
[[   0  238]
 [   0 1597]]
Accuracy = 0.870 ,MCC = 0.000

Epoch: 20: Test Loss=6.074
Test: 
[[  0  64]
 [  0 395]]
Accuracy = 0.861 ,MCC = 0.000


Epoch: 25: Train Loss=22.032
Test: 
[[   0  238]
 [   0 1597]]
Accuracy = 0.870 ,MCC = 0.000

Epoch: 25: Test Loss=6.199
Test: 
[[  0  64]
 [  0 395]]
Accuracy = 0.861 ,MCC = 0.000


Epoch: 30: Train Loss=21.739
Test: 
[[   0  238]
 [   

CNN(
  (embedding_fn): Embedding(21, 15)
  (conv1): Conv1d(15, 8, kernel_size=(2,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(8, 16, kernel_size=(2,), stride=(1,))
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=32, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
import collections
# For every training batch, record the two output values the model produces for the
# batch's first sample, then count how often each distinct value occurs.
xset, yset = [], []
with torch.no_grad():
    for x, y in train_loader:
        first_sample_out = model.forward(x)[0]
        xset.append(first_sample_out[0].item())
        yset.append(first_sample_out[1].item())
for collected in (xset, yset):
    print(collections.Counter(collected))

Counter({-35.9437370300293: 4, -12.330662727355957: 2, -19.6767520904541: 2, -18.843257904052734: 1, -3.314357280731201: 1, 2.075023651123047: 1, -8.54932975769043: 1, -19.760047912597656: 1, -12.331697463989258: 1, -26.413986206054688: 1, -10.48336124420166: 1, -3.7856030464172363: 1, -20.98637580871582: 1, 2.5148520469665527: 1, 4.150931358337402: 1, -8.435462951660156: 1, -15.886244773864746: 1, -3.3637285232543945: 1, -14.169770240783691: 1, -3.4067111015319824: 1, -9.680033683776855: 1, -16.079030990600586: 1, -8.941361427307129: 1, -11.4566011428833: 1, -23.233081817626953: 1, -20.640533447265625: 1, -17.890214920043945: 1, -27.72584342956543: 1, -21.942256927490234: 1, 10.448698043823242: 1, -10.570355415344238: 1, -11.41996955871582: 1, 4.4610514640808105: 1, -3.009575366973877: 1, -13.567416191101074: 1, -4.919668674468994: 1, -15.95495319366455: 1, -14.5397310256958: 1, -3.3030295372009277: 1, -5.956918716430664: 1, -16.45050621032715: 1, -3.6610493659973145: 1, -22.903678894

In [16]:
# Re-score the training set with the fitted model.
output, labels, loss = model.predict(train_loader)
print(loss)
# evaluate prints the confusion matrix, accuracy and MCC and returns them as a tuple;
# its printed header reads "Test:" even though this is the training loader.
model.evaluate(output, labels)

# train_data['label'].values

tensor(0.4737)
Test: 
[[ 233    5]
 [   1 1596]]
Accuracy = 0.997 ,MCC = 0.985


(array([[ 233,    5],
        [   1, 1596]]), 0.9967302452316076, 0.9854587213469699)

## Test

In [17]:
# Score the held-out split; compare against the training-set metrics to gauge overfitting.
output, labels, loss = model.predict(test_loader)
print(loss)
# Prints and returns the confusion matrix, accuracy and MCC for the test predictions.
model.evaluate(output, labels)

tensor(38.2729)
Test: 
[[ 27  37]
 [ 32 363]]
Accuracy = 0.850 ,MCC = 0.353


(array([[ 27,  37],
        [ 32, 363]]), 0.8496732026143791, 0.35278617419838787)

## Tensorboard Output

In [18]:
# for line in iter(tensorboard_proc.stdout):
#     print(line)
# print(tensorboard_proc.stdout)
# print(tensorboard_proc.stderr)
# TODO: stream TensorBoard's output here - tensorboard_proc is started with
# stdout/stderr=PIPE, but nothing reads from those pipes yet.