In [2]:
import pandas as pd
import numpy as np

## Data Preparation

In [3]:
#load data
df_seq=pd.read_csv('pdb_data_seq.csv')
df_properties=pd.read_csv('pdb_data_no_dups.csv')
df_total=df_seq.merge(df_properties,left_on='structureId',right_on = 'structureId')


In [4]:
df_total.columns

Index(['structureId', 'chainId', 'sequence', 'residueCount_x',
       'macromoleculeType_x', 'classification', 'experimentalTechnique',
       'macromoleculeType_y', 'residueCount_y', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear'],
      dtype='object')

#### Select only protein, and filtered by top N

In [5]:
#select the data in top n by count, top 10?
count = df_total['classification'].value_counts(dropna=False)[:10]
df_selected=df_total[df_total['classification'].isin(set(count.index))]
#we want only protein
df_selected=df_selected[df_selected['macromoleculeType_x'].isin(set(['Protein']))]

#### Select proteins with only one chain

In [6]:
#select proteins with only one chain in the data set
#how to justify this operation?
#df_onechain = df_selected[df_selected.groupby('structureId').structureId.transform(len) == 1]

In [7]:
test_df = df_selected[['structureId','classification','sequence']]
#test_df = df_onechain[['structureId','classification','sequence']]

Things to be done

Further select data and simplify problem, select proteins with only one chain?


Figure out how to convert sequence data into array and training model afterwards.


More models and discussion (*LSTM)

Models build on features other than sequence.



## Model Training

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier

In [9]:
#optional, take part of the data for faster verification
data = test_df.sample(50000)

#need to remove nulls
data = data.dropna()
X_train, X_test,y_train,y_test = \
train_test_split(data['sequence'], data['classification'], test_size = 0.1, random_state = 1)

#### Feature Extraction From Sequence Data

In [10]:
#vectorize data, prepare for building models
#Convert a collection of text documents to a matrix of token counts
#seems has nothing to do with sequence but only with the frequency

#ngram is a parameter we need to focus on, 

#vect = CountVectorizer(analyzer = 'char_wb', ngram_range = (3,3))
vect = CountVectorizer(analyzer = 'char_wb')
#vect =TfidfVectorizer(analyzer = "char_wb",sublinear_tf= True )
# Fit and Transform CountVectorizer
#occasionally may meet np.nan error
vect.fit(X_train)
X_train_df = vect.transform(X_train)
X_test_df = vect.transform(X_test)

#to store the results for different mothods
prediction = dict()

#### Naive Bayes

In [11]:
model = MultinomialNB()
model.fit(X_train_df, y_train)
#test on test set
NB_pred = model.predict(X_test_df)
prediction["MultinomialNB"] = accuracy_score(NB_pred, y_test)
print( prediction['MultinomialNB'])

0.3082


#### adaboost

In [12]:
model = AdaBoostClassifier()
model.fit(X_train_df,y_train)
ADA_pred = model.predict(X_test_df)
prediction["Adaboost"] = accuracy_score(ADA_pred , y_test)
print(prediction["Adaboost"])

0.3604


#### RandomForestClassifier

In [13]:
model = RandomForestClassifier()
model.fit(X_train_df,y_train)
ADA_pred = model.predict(X_test_df)
prediction["Random_Forest"] = accuracy_score(ADA_pred , y_test)
print(prediction["Random_Forest"])



0.8694


#### Neural Network

In [14]:
X_train_df

<45000x23 sparse matrix of type '<class 'numpy.int64'>'
	with 906670 stored elements in Compressed Sparse Row format>

In [15]:
X_test_df

<5000x23 sparse matrix of type '<class 'numpy.int64'>'
	with 100749 stored elements in Compressed Sparse Row format>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim

from torch.utils.data import TensorDataset
from torch.autograd import Variable
from torchvision import transforms

In [None]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        self.fc1 = nn.Linear(9736, 2000)
        self.bc1 = nn.BatchNorm1d(2000)
        
        self.fc2 = nn.Linear(2000, 252)
        self.bc2 = nn.BatchNorm1d(252)
        
        self.fc3 = nn.Linear(252, 10)
        
        
    def forward(self, x):
        x = x.view((-1, 9736))
        h = self.fc1(x)
        h = self.bc1(h)
        h = F.relu(h)
        #h = F.dropout(h, p=0.5, training=self.training)
        
        h = self.fc2(h)
        h = self.bc2(h)
        h = F.relu(h)
        #h = F.dropout(h, p=0.2, training=self.training)
        
        h = self.fc3(h)
        out = F.log_softmax(h,dim = 1)
        return out

In [None]:
model = Model()
model.cuda() # CUDA!
optimizer = optim.Adam(model.parameters(), lr=0.005)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Prepare data for pytorch

In [None]:
#convert y labels to index encoding
y_train_array = np.zeros(162111)
for i in range(len(y_train)):
    cur_key = y_train.iloc[i]
    idx = cls.index(cur_key)
    y_train_array[i]=idx

In [None]:
y_test_array = np.zeros(18013)
for i in range(len(y_test)):
    cur_key = y_test.iloc[i]
    idx = cls.index(cur_key)
    y_test_array[i]=idx

In [None]:
"""
criterion = nn.CrossEntropyLoss()
losses = []
for epoch in range(5):
    
    for i in range(1600):
        data = torch.from_numpy(X_train_df[i:i+100].toarray()).float().cuda()
        target = torch.from_numpy(y_train_array[i:i+100]).to(device = device, dtype=torch.int64)
        #print(data)
        # Get Samples
        data, target = Variable(data.cuda()), Variable(target.cuda())
        #target = target.squeeze(1)
        # Init
        optimizer.zero_grad()

        # Predict
        #print(type(data))
        y_pred = model(data) 
        #print(y_pred)
        # Calculate loss
        loss = criterion(y_pred, target)
        losses.append(loss.item())
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        
        # Display
        if i % 100 == 1:
            print(loss.item())
            
    print()
    
    
    #test on test set
    test_acc = 0
    for i in range(180):
        data = torch.from_numpy(X_test_df[i:i+100].toarray()).float().cuda()
        target = torch.from_numpy(y_test_array[i:i+100]).to(device = device, dtype=torch.int64)
        
        y_pred = model(data)
        _,predicted = torch.max(y_pred,1)
        test_acc += (predicted == target).sum().item()
        
        #print('temp accuracy ',(predicted == target).sum().item()/100)
        
    print('current test_accuracy',test_acc/18000)
"""

Another vectoriation method

### RNN

In [16]:
#https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch
all_letters = string.ascii_letters[26:]
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
def findFiles(path): return glob.glob(path)

In [17]:
all_letters

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [18]:
category_lines = {}
all_categories = []

In [19]:

all_categories= list(count.index)

### Turning seq to tensors

In [20]:
for i in range(len(X_train)):
    if y_train.iloc[i] not in category_lines:
        category_lines[y_train.iloc[i]] = [X_train.iloc[i]]
    else:
        category_lines[y_train.iloc[i]].append(X_train.iloc[i])
n_categories = len(category_lines)

In [21]:
def letterToIndex(letter):
    return all_letters.find(letter)
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [22]:
print(letterToTensor('J'))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])


In [23]:
import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        #self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=num_layers)
        
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
    
class LSTM(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM,self).__init__()
        
        
        self.input_dim = input_size
        self.hidden_dim = hidden_size
        self.num_layers = 1
        
        self.input2hidden = nn.Linear(input_size,hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.hidden2tag = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        h = Variable(torch.zeros(self.num_layers, 1, self.hidden_dim))
        c = Variable(torch.zeros(self.num_layers, 1, self.hidden_dim))
        return h,c


    def forward(self, seq):
        output = self.input2hidden(seq)
        #print(self.hidden)
        #print(output.view(len(seq),1,-1))
        output,self.hidden = self.lstm(output.view(len(seq),1,-1),self.hidden)
        #print(output)
        #print(self.hidden)
        output = self.hidden2tag(output)[0]
        #print(output)
        #print(output)
        
        return self.softmax(output)
n_hidden = 128


In [24]:
rnn = RNN(n_letters, n_hidden, n_categories)#.to('cuda:0')
#rnn = LSTM(n_letters, n_hidden, n_categories).cuda()

In [26]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i


In [29]:
import random
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)#.to('cuda:0')
    line_tensor = lineToTensor(line)#.to('cuda:0')
    return category, line, category_tensor, line_tensor

category, line, category_tensor, line_tensor = randomTrainingExample()
print('category =', category, '/ sequence =', line)


category = VIRUS / sequence = TDAPVSKASTVTGFGRGTNDVHLSGMSRISQAVLPAGTGTDGYVVVDATIVPDLLPRLGHAARIFQRYAVETLEFEIQPMCPANTGGGYVAGFLPDPTDNDHTFDALQATRGAVVAKWWESRTVRPQYTRTLLWTSSGKEQRLTSPGRLILLCVGNNTDVVNVSVLCRWSVRLSVPSLENPEE


In [31]:
from tqdm import tqdm
import time
import math
criterion = nn.CrossEntropyLoss()
n_iters = 20000
print_every = 500
plot_every = 1000


# Keep track of losses for plotting
current_loss = 0
all_losses = []

learning_rate = 0.01 # If you set this too high, it might explode. If too low, it might not learn
import progressbar
progress = progressbar.ProgressBar()
def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):

        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in tqdm(range(1, n_iters + 1)):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    
    
    
    
    category_tensor = category_tensor
    line_tensor = line_tensor
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

  2%|█▉                                                                            | 498/20000 [00:40<22:01, 14.76it/s]

500 2% (0m 40s) 2.2825 MENGKRDRQDMEVNTTPRKPRVLLAASGSVAAIKFGNLCHCFTEWAEVRAVVTKSSLHFLDKLSLPQEVTLYTDEDEWSSWNKIGDPVLHIELRRWADVLVIAPLSANTLGKIAGGLCDNLLTCIIRAWDYTKPLFVAPAMNTLMWNNPFTERHLLSLDELGITLIPPIKKRLASGDYGNGAMAEPSLIYSTVRLFWESQAHQQTGGTS / TRANSCRIPTION ✗ (LYASE)


  5%|███▉                                                                          | 997/20000 [01:18<20:22, 15.55it/s]

1000 5% (1m 18s) 2.4773 EVQLVESGGGLVKPGGSLRLSCVGSEFTFSDAWMTWVRQAPGKGLEWVGHMRPTPEGGAKDYAAPVKGRFTVSRDDSKRTLYLQMNSLKIEDTAVYYCMTGVEKGDFWSDDYSQHYNTYLIDVWGKGTTVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCDHHHHHH / OXIDOREDUCTASE ✗ (IMMUNE SYSTEM)


  7%|█████▊                                                                       | 1499/20000 [02:00<26:42, 11.55it/s]

1500 7% (2m 0s) 2.3854 GHMPDLADLFPGFGSEWINTSSGRIFARVGGDGPPLLLLHGFPQTHVMWHRVAPKLAERFKVIVADLPGYGWSDMPESDEQHTPYTKRAMAKQLIEAMEQLGHVHFALAGHDRGARVSYRLALDSPGRLSKLAVLDILPTYEYWQRMNRAYALKIYHWSFLAQPAPLPENLLGGDPDFYVKAKLASWTRAGDLSAFDPRAVEHYRIAFADPMRRHVMCEDYRAGAYADFEHDKIDVEAGNKIPVPMLALWGASGIAQSAATPLDVWRKWASDVQGAPIESGHFLPEEAPDQTAEALVRFFSAAPGS / LYASE ✗ (HYDROLASE)


 10%|███████▋                                                                     | 1998/20000 [02:39<22:35, 13.28it/s]

2000 10% (2m 39s) 2.2282 KREAEARWRQTWSGPGTTKRFPETVLARCVKYTEIHPEMRHVDCQSVWDAFKGAFISKHPCDITEEDYQPLMKLGTQTVPCNKILLWSRIKDLAHQFTQVQRDMFTLEDTLLGYLADDLTWCGEFDTSKINYQSCPDWRKDCSNNPVSVFWKTVSRRFAEAACDVVHVMLDGSRSKIFDKDSTFGSVQVHNLQPEKVQTLEAWVIHGGREDSRDLCQDPTIKELESIISKRNIQFSCKNIYRPDKFLQCVKNPEDSSCTSEI / LYASE ✗ (HYDROLASE)


 12%|█████████▌                                                                   | 2498/20000 [03:18<24:00, 12.15it/s]

2500 12% (3m 18s) 2.4538 DIQMTQTTSSLSASLGDRVTISCRASQDITNYLNWYQQKPDGTVKLLIYYTSRLHSGVPSRFSGSGSGTDYSLTISNLEQEDIATYFCQQGKTLPTFGGGTKLEIKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNR / VIRAL PROTEIN ✗ (IMMUNE SYSTEM)


 15%|███████████▌                                                                 | 2999/20000 [03:57<25:02, 11.32it/s]

3000 15% (3m 57s) 2.2679 MGPLQYKDLKIDIKTSPPPECINDLLQAVDSQEVRDYCEKKGWIVNITSQVQTERNINRA / LYASE ✗ (VIRAL PROTEIN)


 17%|█████████████▍                                                               | 3499/20000 [04:34<21:47, 12.62it/s]

3500 17% (4m 34s) 2.1735 MKLKTTLFGNVYQFKDVKEVLAKANELRSGDVLAGVAAASSQERVAAKQVLSEMTVADIRNNPVIAYEDDCVTRLIQDDVNETAYNQIKNWSISELREYVLSDETSVDDIAFTRKGLTSEVVAAVAKICSNADLIYGAKKMPVIKKANTTIGIPGTFSARLQPNDTRDDVQSIAAQIYEGLSFGVGDAVIGVNPVTDDVENLSRVLDTIYGVIDKFNIPTQGCVLAHVTTQIEAIRRGAPGGLIFQSICGSEKGLKEFGVELAMLDEARAVGAEFNRIAGENCLYFETGQGSALSAGANFGADQVTMEARNYGLARHYDPFIVNTVVGFIGPEYLYNDRQIIRAGLEDHFMGKLSGISMGCDCCYTNHADADQNLNENLMILLATAGCNYIMGMPLGDDIMLNYQTTAFHDTATVRQLLNLRPSPEFERWLESMGIMANGRLTKRAGDPSLFF / RIBOSOME ✗ (LYASE)


 20%|███████████████▍                                                             | 3998/20000 [05:07<14:59, 17.78it/s]

4000 20% (5m 7s) nan EVTLQESGGGLVQPGGSMKLSCAASGFTFSDAWVDWVRQSPGKGLEWVAEIRNKANNHATKYTESVKGRFTISRDDSKSSVYLQMNSLRAEDTGIYYCTSVPQLGRGFAYWGQGTLVTVSAASTTPPSVYPLAPGSGGASTSGSMVTLGCLVKGYFPEPVTVTWNSGALSSGVHTFPAVLNGDLYTLSSSVTVPSSTWPSQTVTCNVAHPASSTQVDKKIVPK / VIRAL PROTEIN ✗ (IMMUNE SYSTEM)


 22%|█████████████████▎                                                           | 4499/20000 [05:38<13:26, 19.22it/s]

4500 22% (5m 38s) nan PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGATLNF / VIRAL PROTEIN ✗ (HYDROLASE/HYDROLASE INHIBITOR)


 25%|███████████████████▏                                                         | 4999/20000 [06:08<14:16, 17.52it/s]

5000 25% (6m 8s) nan VPL / VIRAL PROTEIN ✗ (HYDROLASE)


 27%|█████████████████████▏                                                       | 5496/20000 [06:36<13:03, 18.51it/s]

5500 27% (6m 36s) nan ASAAQSTPITGKVTAVIGAIVDVHFEQSELPAILNALEIKTPQGKLVLEVAQHLGENTVRTIAMDGTEGLVRGEKVLDTGGPISVPVGRETLGRIINVIGEPIDERGPIKSKLRKPIHADPPSFAEQSTSAEILETGIKVVDLLAPYARGGKIGLFGGAGVGKTVFIQELINNIAKAHGGFSVFTGVGERTREGNDLYREMKETGVINLEGESKVALVFGQMNEPPGARARVALTGLTIAEYFRDEEGQDVLLFIDNIFRFTQAGSEVSALLGRIPSAVGYQPTLATDMGLLQERITTTKKGSVTSVQAVYVPADDLTDPAPATTFAHLDATTVLSRGISELGIYPAVDPLDSKSRLLDAAVVGQEHYDVASKVQETLQTYKSLQDIIAILGMDELSEQDKLTVERARKIQRFLSQPFAVAEVFTGIPGKLVRLKDTVASFKAVLEGKYDNIPEHAFYMVGGIEDVVAKAEKLAAEAN / VIRAL PROTEIN ✗ (HYDROLASE)


 30%|███████████████████████                                                      | 5998/20000 [07:06<13:20, 17.50it/s]

6000 30% (7m 6s) nan MSKIEKLSILGVRSFGPHHPETIAFNTPLTLIVGYNGSGKTTVIECLKYATTGELPPNSTRNGAFIHDPDLVGEKEVRAQVKLSFRSTIGESYVVTRNIQLLVQRNNKRTQKTLEGSLLLRNNGERTVISTRVAELDKLVSEKLGVPPAILDAVIFCHQDDSLWPMSEPAALKKRFDEIFEAQKYTKVIENIRLLKKKKGDELKILKEREVQDKANKERAEKVDGGAGGAGGELDLKDAKAKYKETHIKVETTKAAIEDLGRGMAAVDHAIMQYHSKMMEQINRTIAELWQSTYQGTDIDTIQIRSDVESTTSSDSGTRRNYNYRVSMVKGDTEMDMRGRCSAGQKVLASIIIRLALAESFCANCGLIALDEPTTNLDSDNIRSLAESLHGIIKARQAQGNLQLIVITHDEEFLKYMQCSDFCDDFYRVKRDEKQNSVIVRESITRITE / VIRAL PROTEIN ✗ (HYDROLASE)


 32%|████████████████████████▎                                                    | 6300/20000 [07:26<16:23, 13.93it/s]

KeyboardInterrupt: 

In [401]:
guess, guess_i = categoryFromOutput(output)

In [402]:
guess

'RIBOSOME'

In [403]:
guess_i

0

In [399]:
category_i

NameError: name 'category_i' is not defined

In [391]:
import progressbar

LSTM

In [151]:
class Params():
    def __init__(self):
        self.batch_size = 32
        self.hidden_size = 12
        self.embedding_size = 24
        self.epochs = 50
        self.nr_classes =2
        self.gpu = False
        self.learning_rate = 0.001
        self.train_ratio = 0.82
        self.val_ratio = 0.1
        self.test_ratio = 1-self.val_ratio-self.train_ratio
        self.n_layers = 2
        self.dropout = 0.3


Args = Params()

In [188]:
class LSTM_classifier(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers, dropout, embed_size):
        super().__init__()

        self.input_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embed_size = embed_size
        self.embedding = nn.Embedding(self.input_size, self.embed_size)
        self.rnn = nn.LSTM(input_size=self.embed_size,
                           hidden_size=hidden_size,
                           dropout=dropout,
                           num_layers=n_layers, bidirectional=True)
        self.hidden2label = nn.Linear(2*hidden_size, 10)
        #self.hidden = self.init_hidden()
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropoutLayer = nn.Dropout()

    def init_hidden(self, batch_size):
        h0 = Variable(torch.zeros(self.n_layers*2, batch_size, self.hidden_size))
        c0 = Variable(torch.zeros(self.n_layers*2, batch_size, self.hidden_size))
        return h0, c0

    def forward(self, inputs, input_lengths):  # inputs: antal i batch x max ord längd
        self.hidden = self.init_hidden(inputs.size(-1)) # -1 if batch_first=False
        embedded = self.embedding(inputs)  # antal i batch * max ord längd * Embedding_dim
        packed = pack_padded_sequence(embedded, input_lengths, batch_first=False)  # packad
        outputs, self.hidden = self.rnn(packed, self.hidden)  #(packed, self.hidden)  #
        output, output_lengths = pad_packed_sequence(outputs, batch_first=False)
        #print(output.size())
        output = torch.transpose(output, 0, 1)
        #print(output.size())
        output = torch.transpose(output, 1, 2)
        #print(output.size())
        output = torch.tanh(output)
        #print(output.size())
        output, indices = F.max_pool1d(output,output.size(2), return_indices=True)
        #print(output.size())
        output = torch.tanh(output)
        output = output.squeeze(2)
        output = self.dropoutLayer(output)
        #print(output.size())
        output = self.hidden2label(output)
        #print(output.size())
        output = self.softmax(output)
        return output, self.hidden

In [189]:
def train(input_tensor, input_sizes, target_tensor):
    model.hidden = model.init_hidden(Args.batch_size) 
    model.zero_grad()
    output, hidden = model.forward(input_tensor, input_sizes)
    loss = criterion(output, target_tensor)
    loss.backward()
    optimizer.step()
    return output, loss.item()

In [192]:
model = LSTM_classifier(vocab_size = n_letters, 
                        hidden_size = Args.hidden_size, 
                        n_layers = Args.n_layers, 
                        dropout = Args.dropout,
                        embed_size = Args.embedding_size).cuda()
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [191]:
n_letters

26

In [179]:
test = y_train.iloc[list(range(0,10))]

In [181]:
test.values

array(['HYDROLASE', 'TRANSFERASE', 'HYDROLASE', 'OXIDOREDUCTASE',
       'HYDROLASE', 'HYDROLASE', 'LYASE', 'LYASE', 'HYDROLASE',
       'TRANSFERASE'], dtype=object)

In [201]:
label_map['TRANSFERASE']

2

In [218]:
import time
from torch.autograd import Variable

current_loss = 0

all_losses = []
val_acc = []
train_acc = []
predictions = []
labels = []
#print_every = int(len(train_data)/Args.batch_size) #every epoch
#n_iters  = int(len(train_data)/Args.batch_size*Args.epochs)
#plot_every = print_every
start = time.time()
epochs = 10
batch_size = 32
label_map = dict(zip(list(count.index),list(range(10))))
for i in range(epochs):
    for idx in range(len(X_train)%batch_size):
        cur_x = X_train.iloc[list(range(idx,idx+batch_size))].values
        temp_y = y_train.iloc[list(range(idx,idx+batch_size))].values
        #print(type(cur_y))
        #print(cur_y[0])
        cur_x = lineToTensor(cur_x)
        cur_y = np.zeros(batch_size)
        for i in range(len(cur_y)):
            cur_y[i] = label_map[temp_y[i]]
        #print(type(cur_y))
        cur_y = torch.from_numpy(cur_y.astype(np.int))
        input_lengths = [len(x) for x in cur_x]
        print(cur_y)
        #print(cur_x)
        print(input_lengths)
        output, loss = train(input_tensor=cur_x, 
                     input_sizes=input_lengths, 
                     target_tensor=cur_y)
        current_loss += loss
        predictions.extend(list(torch.argmax(output, dim=1).numpy()))
        #labels.extend(list(target_tensor.numpy()))

                
                
        
        
        
        
        

tensor([1, 2, 1, 3, 1, 1, 5, 5, 1, 2, 1, 1, 1, 1, 4, 1, 2, 2, 5, 2, 1, 1, 1, 2,
        2, 4, 4, 9, 5, 1, 1, 1], dtype=torch.int32)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got CPUType instead (while checking arguments for embedding)