In [1]:
import json
import os
import sys
import warnings
import pickle as pkl
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
warnings.filterwarnings("ignore")

import AIPT.Models.Beshnova2020.CNN

%pwd


File: /home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/AIPT-0.0.1-py3.6.egg/AIPT/Models/Beshnova2020/CNN.py 
 Last modified: 2020-10-06 21:41:14 UTC


'/home/ec2-user/SageMaker/antibody-in-pytorch/AIPT/Models/Beshnova2020'

In [2]:
import AIPT.Utils.Dev.dev_utils as dev_utils

aipt_path = '/home/ec2-user/SageMaker/antibody-in-pytorch/'
aipt_reload = dev_utils.get_aipt_reload_fn(aipt_path)

In [3]:
'''
set up paths
'''

aipt_dir = '/home/ec2-user/SageMaker/antibody-in-pytorch/AIPT'
print("\t".join(os.listdir(aipt_dir)))

entry_point.py	__main__.py	__pycache__	Benchmarks	.ipynb_checkpoints	__init__.py	Utils	Models	.gitignore


## Data Processing

In [4]:
# todo: pca

In [5]:
from AIPT.Benchmarks.OAS_dataset import OAS_data_loader as oas

In [6]:
seq_dir = os.path.join(aipt_dir, 'Benchmarks/OAS_dataset/data/seq_db')

index_path = 'OAS_index.txt'
input_seq_type = 'CDR3'
output_field = 'BType'
cell_types = ['Naive-B-Cells', 'Memory-B-Cells'] # todo: this is confusing - doesn't refer to "Species"


train_loader, train_eval_loader, test_eval_loader, seq_len = oas.OAS_data_loader(index_path, output_field, input_seq_type, cell_types, seq_dir=seq_dir,
                                                                                 gapped=True, 
                                                                                 pad=False, 
                                                                                 batch_size=16, 
                                                                                 model_name='Beshnova2020') 

Training data                                            file_name Longitudinal  Chain  \
0  Ellebedy_2016_Day-0_memory-B-cell_IGHA_memory-...        Day-0  Heavy   
1  Ellebedy_2016_Day-0_memory-B-cell_IGHA_memory-...        Day-0  Heavy   
2  Ellebedy_2016_Day-0_memory-B-cell_IGHG_memory-...        Day-0  Heavy   

                    Author Isotype    Age  Size_igblastn Disease  \
0  Ellebedy et al., (2016)    IGHA  18-49          84333    None   
1  Ellebedy et al., (2016)    IGHA  18-49          59091    None   
2  Ellebedy et al., (2016)    IGHG  18-49          52891    None   

                                      Link BSource           BType   Size  \
0  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  61336   
1  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  43137   
2  https://www.nature.com/articles/ni.3533    PBMC  Memory-B-Cells  41221   

  Species Vaccine  Subject  valid_entry_num  
0   human    None  Donor-6             2019  
1   hum

In [7]:
yset = set()
for x,y in train_loader:
    yset.add(y[0].item())
print(yset)

{0}


In [8]:
import pandas as pd
index_df = pd.read_csv(index_path, sep="\t")

file_names = index_df['file_name']

In [9]:

data_dfs = []

for index, row in index_df.iterrows():
    file_name = row['file_name']
    df = pd.read_csv(os.path.join(seq_dir, f'{file_name}.txt'), sep='\t')
    length_df = df.apply(lambda row: len(row['CDR3_aa']), axis=1)
    data_df = df[length_df == 11]
    data_df['BType'] = row['BType']
    data_df = data_df[['CDR3_aa', 'BType']]
    data_dfs.append(data_df)
    
data = pd.concat(data_dfs)
data

Unnamed: 0,CDR3_aa,BType
13,ARVEGWVLFDK,Memory-B-Cells
28,TREWGRRTFDV,Memory-B-Cells
29,AHKSVYREVDY,Memory-B-Cells
35,AHRSTNREVDY,Memory-B-Cells
39,AHRSTNREVDY,Memory-B-Cells
...,...,...
1503,ARDRFDYGFDY,Naive-B-Cells
1505,AREANTLGMDV,Naive-B-Cells
1516,AGLRTTDAFDI,Naive-B-Cells
1520,ARQEDTAMVDY,Naive-B-Cells


In [10]:
cell_type_to_ix = {cell_type: ix for ix, cell_type in enumerate(cell_types)}
print('cell_type_to_ix:', cell_type_to_ix)

data['label'] = data.apply(lambda row: cell_type_to_ix[row['BType']], axis=1)
data

cell_type_to_ix: {'Naive-B-Cells': 0, 'Memory-B-Cells': 1}


Unnamed: 0,CDR3_aa,BType,label
13,ARVEGWVLFDK,Memory-B-Cells,1
28,TREWGRRTFDV,Memory-B-Cells,1
29,AHKSVYREVDY,Memory-B-Cells,1
35,AHRSTNREVDY,Memory-B-Cells,1
39,AHRSTNREVDY,Memory-B-Cells,1
...,...,...,...
1503,ARDRFDYGFDY,Naive-B-Cells,0
1505,AREANTLGMDV,Naive-B-Cells,0
1516,AGLRTTDAFDI,Naive-B-Cells,0
1520,ARQEDTAMVDY,Naive-B-Cells,0


In [11]:
from AIPT.Benchmarks.OAS_dataset import OAS_data_loader

train_data = data[['CDR3_aa', 'label']]
seq_encodings = OAS_data_loader.encode_index(data=train_data['CDR3_aa'])
btypes = train_data['label'].values
train_loader = torch.utils.data.DataLoader(list(zip(seq_encodings, btypes)))

In [12]:
for row in train_loader:
    x,y = row
    print(x,y)
    break

tensor([[ 0, 14, 17,  3,  5, 18, 17,  9,  4,  2,  8]]) tensor([1])


## Load Model

In [13]:
aipt_reload(AIPT.Models.Beshnova2020.CNN)
from AIPT.Models.Beshnova2020.CNN import CNN


File: /home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/AIPT-0.0.1-py3.6.egg/AIPT/Models/Beshnova2020/CNN.py 
 Last modified: 2020-10-06 21:44:51 UTC


In [14]:
para_dict = {
    'seq_len': 11,
    'embedding_dim': 15 # paper uses dim 15 PCA features
}

embedding_fn = nn.Embedding(21, para_dict['embedding_dim'])

model = CNN(para_dict, embedding_fn)

## Train

In [15]:
outputs = model.fit(train_loader)
print('train outputs', outputs)

fit called
Found saved model from Epoch 50
train outputs None


In [16]:
output = model.predict(train_loader)
model.evaluate(output, train_data['label'].values)

Test: 
[[ 302    0]
 [1992    0]]
Accuracy = 0.132 ,MCC = 0.000


(array([[ 302,    0],
        [1992,    0]]),
 0.13164777680906714,
 0.0)

## Test

In [17]:
output = model.predict(test_loader)
model.evaluate(output, test_data['label'])

NameError: name 'test_loader' is not defined