In [28]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
source_folder = "solo_classifier_dataset"

In [29]:
def converter(instr):
    return np.fromstring(instr[1:-1],sep=' ')
df_train = pd.read_csv(source_folder + '/train.csv',converters={'notes':converter})
df_test = pd.read_csv(source_folder + '/test.csv',converters={'notes':converter})
df_val = pd.read_csv(source_folder + '/val.csv',converters={'notes':converter})

In [30]:
notes = np.array(df_train['notes'].values)
train_notes = []
for i in range(len(notes)):
    train_notes.append(notes[i].astype(int))
train_notes = np.array(train_notes)
train_notes_len = df_train['notes_len'].values
trainy = df_train['labels'].values
#trainy = nn.functional.one_hot( torch.from_numpy(trainy).to(torch.int64), num_classes=2)

notes = np.array(df_test['notes'].values)
test_notes = []
for i in range(len(notes)):
    test_notes.append(notes[i].astype(int))
test_notes = np.array(test_notes)
test_notes_len = df_test['notes_len'].values
testy = df_test['labels'].values
#testy = nn.functional.one_hot( torch.from_numpy(testy).to(torch.int64), num_classes=2)

notes = np.array(df_val['notes'].values)
val_notes = []
for i in range(len(notes)):
    val_notes.append(notes[i].astype(int))
val_notes = np.array(val_notes)
val_notes_len = df_val['notes_len'].values
valy = df_val['labels'].values
#valy = nn.functional.one_hot( torch.from_numpy(valy).to(torch.int64), num_classes=2)

  train_notes = np.array(train_notes)
  test_notes = np.array(test_notes)
  val_notes = np.array(val_notes)


In [31]:
vocab = 0
for i in range(len(train_notes)):
    for j in range(len(train_notes[i])):
        if (train_notes[i][j]) > vocab:
            vocab = (train_notes[i][j])
for i in range(len(test_notes)):
    for j in range(len(test_notes[i])):
        if (test_notes[i][j]) > vocab:
            vocab = (test_notes[i][j])
for i in range(len(val_notes)):
    for j in range(len(val_notes[i])):
        if (val_notes[i][j]) > vocab:
            vocab = (val_notes[i][j])
print(vocab)

74


In [32]:
batch_size=64

trainset = TensorDataset( torch.from_numpy(trainy), torch.from_numpy(train_notes).float(), torch.from_numpy(train_notes_len).int() )
testset = TensorDataset( torch.from_numpy(testy), torch.from_numpy(test_notes).float(), torch.from_numpy(test_notes_len).int())
valset = TensorDataset( torch.from_numpy(valy), torch.from_numpy(val_notes).float(),  torch.from_numpy(val_notes_len).int() )

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                         shuffle=True, num_workers=2)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [6]:
import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [7]:
class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(20000, 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*dimension, 1)

    def forward(self, notes, notes_len):
        notes_emb = self.embedding(notes)

        packed_input = pack_padded_sequence(notes_emb, notes_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[range(len(output)), notes_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        notes_fea = self.drop(out_reduced)

        notes_fea = self.fc(notes_fea)
        notes_fea = torch.squeeze(notes_fea, 1)
        notes_out = torch.sigmoid(notes_fea)

        return notes_out

In [8]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu" 
print(dev)
device = torch.device(dev)
print(device)

cuda:0
cuda:0


In [9]:
# Save and Load Functions https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0

def save_checkpoint(save_path, model, optimizer, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [10]:
# Training Function

def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = trainloader,
          valid_loader = valloader,
          num_epochs = 5,
          eval_every = len(trainloader) // 2,
          file_path = 'solo_classifier_weights',
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        total = 0
        total_correct = 0
        for i, (labels, notes, notes_len) in enumerate(train_loader):           
            labels = labels.to(device)
            notes = notes.to(device)
            notes_len = notes_len
            output = model(notes.long(), notes_len.long())

            loss = criterion(output, labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            labels_max = labels.detach().cpu()
            output_max = torch.round(output.detach().cpu())

            for i in range(len(labels_max)):
                total+=1
                if labels_max[i] ==  output_max[i]:
                    total_correct += 1
            accuracy = accuracy_score(labels_max, output_max)
            
            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    
                  # validation loop
                    for i, (labels, notes, notes_len) in enumerate(valid_loader):
                        labels = labels.to(device)
                        notes = notes.to(device)
                        notes_len = notes_len
                        output = model(notes.long(), notes_len.long())
                        loss = criterion(output, labels.float())
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/model.pt', model, optimizer, best_valid_loss)
                    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
        print("Epoch Accuracy: {}".format(total_correct/total))
    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')


model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model=model, optimizer=optimizer, num_epochs=10)

Epoch [1/10], Step [8/170], Train Loss: 0.7201, Valid Loss: 0.6950
Model saved to ==> solo_classifier_weights/model.pt
Model saved to ==> solo_classifier_weights/metrics.pt
Epoch [1/10], Step [16/170], Train Loss: 0.7025, Valid Loss: 0.6971
Epoch Accuracy: 0.505671077504726
Epoch [2/10], Step [24/170], Train Loss: 0.6986, Valid Loss: 0.6930
Model saved to ==> solo_classifier_weights/model.pt
Model saved to ==> solo_classifier_weights/metrics.pt
Epoch [2/10], Step [32/170], Train Loss: 0.6916, Valid Loss: 0.6951
Epoch Accuracy: 0.5122873345935728
Epoch [3/10], Step [40/170], Train Loss: 0.6913, Valid Loss: 0.6926
Model saved to ==> solo_classifier_weights/model.pt
Model saved to ==> solo_classifier_weights/metrics.pt
Epoch [3/10], Step [48/170], Train Loss: 0.6896, Valid Loss: 0.6943
Epoch Accuracy: 0.5311909262759924
Epoch [4/10], Step [56/170], Train Loss: 0.6955, Valid Loss: 0.6921
Model saved to ==> solo_classifier_weights/model.pt
Model saved to ==> solo_classifier_weights/metrics.

In [None]:
torch.backends.cudnn.enabled = False