### Load and explore the data

In [2]:
# Tab-separated training table; loaded with pandas in the next cell.
data_path = './data/Training_data.txt'
# Text file holding the vocabulary as a Python-literal assignment; parsed further down.
vocab_path = './data/vocab.txt'

In [3]:
import pandas as pd
# The file is tab-separated, so the default comma delimiter is overridden.
df = pd.read_csv(data_path,sep='\t')

In [6]:
# Number of rows in the raw table.
print(len(df))

36391


In [7]:
# Preview the frame (pandas elides the middle rows when displaying it).
df

Unnamed: 0,Seq,Label
0,WSHPSFYPFR,1
1,WLMACFFVFR,0
2,WTVDGLYEYD,1
3,WRATSFYLNT,0
4,WRSIAFFMFA,0
...,...,...
36386,YHSVAFFPYT,0
36387,FNDRGFFTFR,0
36388,FGLAHSYTFS,1
36389,WLALGMYQFA,1


In [30]:
import ast
import re

# Read the whole vocabulary file. The parsing below presumes it has the form
# `name = [ 'A', 'C', ... ]`, possibly wrapped over several lines.
with open(vocab_path,'r') as f:
    vocab = f.read()

# Join wrapped lines, then collapse every whitespace run into one space.
vocab = vocab.replace('\n','')

# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
p = re.compile(r'\s+')
vocab = re.sub(p,' ',vocab)

# Keep everything after the FIRST '=' (so a '=' inside the literal survives)
# and evaluate it safely as a Python literal.
vocab = ast.literal_eval(vocab.split('=', 1)[1].strip())

# Map every vocabulary entry to its position in the list.
len_vocab = len(vocab)
vocab_map = dict(zip(vocab,range(len_vocab)))

In [None]:
# Plan for the rest of the notebook:
#   1. dataset
#   2. split
#   3. dataloader
#   4. train / val
#   5. test (metrics)

### Stratified Split (train,val,test: 0.6, 0.2, 0.2) 

In [40]:
# split
from matplotlib import pyplot as plt
# Rename the columns to the lowercase names ('seq', 'label') used by the cells below.
df.columns = ['seq','label']

In [49]:
# Class counts over the whole dataset.
df['label'].value_counts()

0    25114
1    11277
Name: label, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split as tts
# First split: hold out 20% of the rows, stratified on the label so both parts
# keep the same class ratio; random_state makes the shuffle repeatable.
# a[0] is later split into train/validation, a[1] becomes the test set.
a = tts(df,test_size=0.2,shuffle=True,stratify=df['label'],random_state=1004)

In [76]:
# Class counts in the larger (train + validation) part of the first split.
a[0]['label'].value_counts()

0    20091
1     9021
Name: label, dtype: int64

In [74]:
# Class counts in the held-out part of the first split.
a[1]['label'].value_counts()

0    5023
1    2256
Name: label, dtype: int64

In [79]:
# Row counts of the two parts produced by the first split.
len(a[0]),len(a[1])

(29112, 7279)

In [82]:
# Second split: 25% of the remaining 80% (i.e. 20% of all rows) becomes the
# validation set, which leaves 60% for training. Stratified on the label again.
tr,val = tts(a[0],test_size=0.25,shuffle=True,stratify=a[0]['label'],random_state=1004)
# The part held out by the first split is used as the test set.
te = a[1]

In [83]:
# Final row counts of the train, validation and test sets.
len(tr),len(val),len(te)

(21834, 7278, 7279)

In [89]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
os.makedirs('./data/split', exist_ok=True)
# The DataFrame index is written as a column named 'index'.
tr.to_csv('./data/split/train.csv',index_label='index')

In [90]:
# Persist the validation split; the DataFrame index is written as column 'index'.
val.to_csv('./data/split/val.csv',index_label='index')

In [92]:
# Persist the test split; the DataFrame index is written as column 'index'.
te.to_csv('./data/split/test.csv',index_label='index')

### Dataset & DataLoader for PyTorch

In [2]:
def get_vocab_map(vocab_path='./data/vocab.txt'):
    """Build a token-to-index mapping from the vocabulary file.

    The file content is treated as ``name = python_literal``: newlines are
    removed, whitespace runs are collapsed to a single space, and everything
    after the first ``=`` is evaluated with ``ast.literal_eval``.

    Args:
        vocab_path: Path of the vocabulary file.

    Returns:
        dict mapping each entry of the parsed literal to its position in it.

    Raises:
        IndexError: If the file contains no ``=``.
        ValueError, SyntaxError: If the right-hand side is not a valid literal.
    """
    import ast
    import re

    with open(vocab_path,'r') as f:
        raw = f.read()

    # Raw-string pattern: '\s' in a normal string literal is an invalid escape.
    flat = re.sub(r'\s+', ' ', raw.replace('\n',''))

    # Split on the first '=' only, so a '=' inside the literal is not cut off.
    literal = flat.split('=', 1)[1].strip()
    vocab = ast.literal_eval(literal)

    return {token: position for position, token in enumerate(vocab)}
# Module-level mapping read by ProteinDataset.seq2oneHot below.
vocab_map =get_vocab_map()

import torch
from torch.nn import functional as F
import pandas as pd
class ProteinDataset(torch.utils.data.Dataset):
    """Dataset of (one-hot encoded sequence, integer label) pairs read from a CSV.

    The CSV must provide a ``seq`` column (a string whose characters are
    vocabulary tokens) and a ``label`` column.

    Args:
        path: CSV file loaded with ``pandas.read_csv``.
        vocab: Optional token-to-index mapping. When omitted, the module-level
            ``vocab_map`` is looked up at encoding time, which is what the
            class did before this argument existed.
    """
    def __init__(self,path,vocab=None):
        self.df = pd.read_csv(path)
        self.vocab = vocab

    def _mapping(self):
        """Return the injected vocabulary, or the notebook-global one as fallback."""
        return vocab_map if self.vocab is None else self.vocab

    def __getitem__(self,idx):
        """Return ``(x, y)``: the one-hot sequence tensor and the label tensor."""
        # Column-wise lookup avoids building a mixed-dtype row Series per sample.
        seq = self.df['seq'].iloc[idx]
        label = self.df['label'].iloc[idx]

        x = self.seq2oneHot(seq)
        # The label stays a class index; label2oneHot is available when a
        # one-hot target is wanted instead.
        y = torch.tensor(label)
        return x,y

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.df)

    def seq2oneHot(self,seq):
        """Encode ``seq`` as a ``(len(seq), vocabulary size)`` one-hot tensor.

        Raises:
            KeyError: If ``seq`` contains a character missing from the vocabulary.
        """
        mapping = self._mapping()
        # Explicit long dtype keeps one_hot working for an empty sequence too.
        seq2int = torch.tensor([mapping[ch] for ch in seq], dtype=torch.long)
        return F.one_hot(seq2int,num_classes=len(mapping))

    def label2oneHot(self,label):
        """Encode a binary label (0 or 1) as a length-2 one-hot tensor."""
        return F.one_hot(torch.tensor(label),num_classes=2)
    

# One dataset per split file.
trdt  = ProteinDataset('./data/split/train.csv')
valdt = ProteinDataset('./data/split/val.csv')
tedt  = ProteinDataset('./data/split/test.csv')

# Reshuffle the training samples every epoch: without shuffle=True the loader
# replays the CSV order identically on each pass. The evaluation loaders keep
# the file order.
trdl  = torch.utils.data.DataLoader(trdt, batch_size=64, shuffle=True, num_workers=4)
valdl  = torch.utils.data.DataLoader(valdt, batch_size=64, num_workers=4)
tedl  = torch.utils.data.DataLoader(tedt, batch_size=32, num_workers=4)

### Model 

| i | model          | used |
|---|----------------|------|
| 0 | ResNet         | ✔    |
| 1 | ResNeXt        | x    |
| 2 | MaxFilterCNN   | ✔    |
| 3 | LSTM           | x    |
| 4 | Self-attention | ✔    |
<!-- |5  ||✔ | -->


In [3]:
# b,10,20
from torch import nn
class _SqueezeLast(nn.Module):
    """Drop the trailing axis of the input when it has size 1 (``x.squeeze(-1)``)."""

    def forward(self,x):
        return x.squeeze(-1)


class MaxFilterCNN(nn.Module):
    """Small CNN over one-hot sequences of shape ``(batch, seq_len, vocab_size)``.

    A 2-D convolution whose kernel spans the full vocabulary axis is followed
    by two 1-D convolutions along the sequence axis and a linear classifier.

    Args:
        seq_len: Length of the input sequences (default 10).
        vocab_size: Size of the one-hot axis (default 20).
        channels: Number of channels used by every convolution (default 8).
        num_classes: Number of output logits (default 2).

    Raises:
        ValueError: If ``seq_len`` is too short for the three valid convolutions.
    """
    def __init__(self,seq_len=10,vocab_size=20,channels=8,num_classes=2):
        super(MaxFilterCNN,self).__init__()
        # Kernel width equals the vocabulary size, so the last axis collapses to 1.
        self.maxFconv = nn.Sequential(
            nn.Conv2d(1,channels,kernel_size=(3,vocab_size) ),
            nn.BatchNorm2d(channels),
            nn.ReLU()
        )
        self.sq = _SqueezeLast()
        self.conv1d0 = nn.Sequential(
                    nn.Conv1d(channels,channels,kernel_size=3),
                    nn.BatchNorm1d(channels),
                    nn.ReLU()
                )
        self.conv1d1 = nn.Sequential(
                    nn.Conv1d(channels,channels,kernel_size=2),
                    nn.BatchNorm1d(channels),
                    nn.ReLU()
                )

        # Each un-padded, stride-1 convolution shortens the sequence by
        # (kernel - 1). Computing the length directly avoids pushing a random
        # sample through the train-mode BatchNorm layers, which would alter
        # their running statistics before training starts.
        out_len = seq_len - (3 - 1) - (3 - 1) - (2 - 1)
        if out_len < 1:
            raise ValueError(f'seq_len={seq_len} is too short; need at least 6')
        num_node = channels*out_len

        self.last = nn.Sequential(
                    nn.Flatten(),
                    nn.Linear(num_node,num_classes)
                )

    def forward(self,x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = x.to(torch.float)
        x = x.unsqueeze(1)      # add the single input channel expected by Conv2d
        x = self.maxFconv(x)
        x = self.sq(x)
        x = self.conv1d0(x)
        x = self.conv1d1(x)
        x = self.last(x)
        return x
        

In [4]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = MaxFilterCNN()
model = model.to(device)
loss = nn.CrossEntropyLoss()

# Only tensors that require gradients are handed to Adam (default settings).
params = list(filter(lambda p: p.requires_grad, model.parameters()))
opt  = torch.optim.Adam(params)

### train/val/test 

In [5]:
def train(dl,model,lossf,opt):
    """Run one optimisation pass over every batch yielded by ``dl``.

    Args:
        dl: Iterable of ``(inputs, targets)`` batches.
        model: Module to optimise; it is switched to training mode.
        lossf: Callable mapping ``(predictions, targets)`` to a scalar loss.
        opt: Optimiser that updates the model parameters.

    Batches are moved to the notebook-level ``device`` before the forward pass.
    """
    model.train()
    for inputs,targets in dl:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step, then do
        # forward, backward and parameter update for this batch.
        opt.zero_grad()
        batch_loss = lossf(model(inputs),targets)
        batch_loss.backward()
        opt.step()

def test(dl,model,lossf):
    model.eval()
    size, acc , losses = len(dl.dataset) ,0,0
    with torch.no_grad():
        for x,y in dl:
            x,y = x.to(device),y.to(device)
            pre = model(x)
            loss = lossf(pre,y)
    
            acc += (pre.argmax(1)==y).type(torch.float).sum().item()
            losses += loss.item()
    accuracy = round(acc/size,4)
    val_loss = round(losses/size,4)
    print(f'acc/loss: {accuracy}/{val_loss}')
    return accuracy,val_loss

import copy

# Stop once this many epochs have passed without a new best validation accuracy.
patience = 5
# NOTE: despite its name, this dict maps epoch index to the best validation
# ACCURACY reached at that epoch; the {0: 0} entry is the starting baseline.
val_losses = {0:0}
for i in range(100):
    train(trdl,model,loss,opt)
    acc,val_loss = test(valdl,model,loss)

    if max(val_losses.values() ) < acc:
        val_losses[i] = acc
        # deepcopy takes a real snapshot. copy.copy would share the parameter
        # tensors with `model`, so the "best" model would keep changing as
        # training continues.
        best_model = copy.deepcopy(model)

    best_epoch = max(val_losses,key=val_losses.get)
    if i - best_epoch >= patience:
        break

acc/loss: 0.8185/0.0063
acc/loss: 0.8331/0.006
acc/loss: 0.8376/0.0059
acc/loss: 0.8399/0.0058
acc/loss: 0.8395/0.0058
acc/loss: 0.841/0.0058
acc/loss: 0.8423/0.0058
acc/loss: 0.8428/0.0058
acc/loss: 0.8435/0.0057
acc/loss: 0.8438/0.0057
acc/loss: 0.8434/0.0057
acc/loss: 0.8431/0.0057
acc/loss: 0.8427/0.0057
acc/loss: 0.8424/0.0057
acc/loss: 0.8439/0.0058
acc/loss: 0.8427/0.0058
acc/loss: 0.842/0.0058
acc/loss: 0.8424/0.0058
acc/loss: 0.8428/0.0058
acc/loss: 0.8427/0.0058
