In [45]:
import operator
import os, math
import string
import numpy as np
import random
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist ##
import torch.multiprocessing as mp ##
from torch.autograd import Variable
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP ##
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from ekphrasis.classes.tokenizer import SocialTokenizer

seed = 2019
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    
data_dir = '/home/dfsnow/rbans/data'  
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.__version__

flatten = lambda l: [item for sublist in l for item in sublist]

In [4]:
TEXT = data.Field(
    sequential=True,
    tokenize=SocialTokenizer(lowercase=True).tokenize,
    include_lengths=True, 
    batch_first=True,
    lower=True)

LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    pad_token=None, 
    unk_token=None)

In [5]:
rnn_fields = [("id", None),
              ("score", None),
              ("body", TEXT),
              ("label", LABEL)]

In [6]:
train, validate, test = data.TabularDataset.splits(
    path=data_dir,
    train='test_train.csv',
    validation="test_validate.csv",
    test='test_test.csv',
    format='csv',
    skip_header=False, 
    fields=rnn_fields)

In [7]:
train_iter, validate_iter, test_iter = data.BucketIterator.splits(
    (train, validate, test), batch_sizes=(32, 32, 32),
    sort_key=lambda x: len(x.body), device=device,
    repeat=False, shuffle=True)

vec = vocab.Vectors('glove.twitter.27B.100d.txt', os.path.join(data_dir, 'embeddings'))

TEXT.build_vocab(train, validate, vectors=vec)

vocab_size = len(TEXT.vocab)
word_embeddings = TEXT.vocab.vectors


In [8]:
batch = next(iter(train_iter)) 
print(batch.label)
print(batch.body) 

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')
(tensor([[ 1.9500e+02,  3.0000e+00,  1.3793e+05,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00],
        [ 8.0000e+00,  1.1290e+03,  3.5930e+03,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00],
        [ 5.0000e+00,  3.0146e+04,  3.2400e+02,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00],
        ...,
        [ 7.0000e+00,  2.4670e+03,  2.4200e+02,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00],
        [ 3.8160e+03,  1.4250e+05,  3.6400e+02,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00],
        [ 4.5500e+02,  4.7000e+01,  2.6000e+01,  ...,  1.0000e+00,
          1.0000e+00,  1.0000e+00]], device='cuda:0'), tensor([   13,    39,    18,    22,   101,    25,    34,    15,     3,
           39,     9,    12,    10,     8,    11,    12,     6,     6,
          157,    20,     5,   165,  

In [113]:
class LSTMClassifier(nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        super(LSTMClassifier, self).__init__()
        
        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.
        self.lstm = nn.LSTM(embedding_length, hidden_size)
        self.label = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_sentence, batch_size=None):
        input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences,  embedding_length)
        input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)
        if batch_size is None:
            h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM
            c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM
        else:
            h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
            c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
            
        output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))
        final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)

        return final_output


In [156]:
class RNN(nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        super(RNN, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True)
        self.label = nn.Linear(4*hidden_size, output_size)
    
    def forward(self, input_sentences, batch_size=None):

        input = self.word_embeddings(input_sentences)
        input = input.permute(1, 0, 2)
        if batch_size is None:
            h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda()) # 4 = num_layers*num_directions
        else:
            h_0 =  Variable(torch.zeros(4, batch_size, self.hidden_size).cuda())
        output, h_n = self.rnn(input, h_0)
        # h_n.size() = (4, batch_size, hidden_size)
        h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size)
        h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])
        # h_n.size() = (batch_size, 4*hidden_size)
        logits = self.label(h_n) # logits.size() = (batch_size, output_size)
        
        return logits


In [158]:
# def clip_gradient(model, clip_value):
#     params = list(filter(lambda p: p.grad is not None, model.parameters()))
#     for p in params:
#         p.grad.data.clamp_(-clip_value, clip_value)
    
def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.body[0]
        target = batch.label
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not 32):
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
#         clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 1000 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.body[0]
            if (text.size()[0] is not 32):
                continue
            target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)
    

learning_rate = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 100

model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy
        
for epoch in range(4):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, validate_iter)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')
    
test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')


Epoch: 1, Idx: 1000, Training Loss: 0.0165, Training Accuracy:  100.00%
Epoch: 1, Idx: 2000, Training Loss: 0.0279, Training Accuracy:  100.00%
Epoch: 1, Idx: 3000, Training Loss: 0.2521, Training Accuracy:  93.75%
Epoch: 1, Idx: 4000, Training Loss: 0.3287, Training Accuracy:  96.88%


KeyboardInterrupt: 

In [122]:
torch.save(model.state_dict(), '/home/dfsnow/rbans/model.pyt')

model.load_state_dict(torch.load('/home/dfsnow/rbans/model.pyt'))
model.to(device)


LSTMClassifier(
  (word_embeddings): Embedding(145880, 100)
  (lstm): LSTM(100, 256)
  (label): Linear(in_features=256, out_features=2, bias=True)
)

In [123]:
test_sen1 = "Yes, very mundane. I really like it. good art. wow"
test_sen2 = "Blackpill incel death hilary clinton pizzagate"

test_sen1 = TEXT.preprocess(test_sen1)
test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

test_sen2 = TEXT.preprocess(test_sen2)
test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]

test_sen = np.asarray(test_sen2)
test_sen = torch.LongTensor(test_sen).cuda()
with torch.no_grad():
    test_tensor = Variable(test_sen)
model.eval()
output = model(test_tensor, 1)
print(output)
out = F.softmax(output, 1)
print(out)
if (torch.argmax(out[0]) == 1):
    print ("Sentiment: Positive")
else:
    print ("Sentiment: Negative")

tensor(1.00000e-02 *
       [[ 7.4716, -6.7843]], device='cuda:0')
tensor([[ 0.5356,  0.4644]], device='cuda:0')
Sentiment: Negative


In [133]:
applied_fields = [("id", None),
              ("score", None),
              ("body", TEXT),
              ("label", LABEL)]

applied = data.TabularDataset(
    path=os.path.join(data_dir, 'test_validate.csv'),
    format='csv',
    skip_header=False, 
    fields=applied_fields)

In [134]:
applied_iter = data.BucketIterator(applied, batch_size=32,
    sort_key=lambda x: len(x.body), device=device,
    repeat=False, shuffle=False, sort_within_batch=False)


In [135]:
def run_model(model, iter):

    temp = []
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(iter):
            text = batch.body[0]
            if (text.size()[0] is not 32):
                continue
            if torch.cuda.is_available():
                text = text.cuda()
            prediction = model(text)
            out = F.softmax(prediction, 1)
            temp.append([torch.argmax(x) == 1 for x in out])
                
    return temp
            
temp_list = run_model(model, applied_iter)

In [139]:
sum(flatten(temp_list))

tensor(3, dtype=torch.uint8, device='cuda:0')

In [140]:
print([test[i].body for i, x in enumerate(flatten(temp_list)) if x == 1])

[['adhiero', 'con', 'toda', 'violencia', ',', 'encima', 'pecho', '!'], ['quite', 'a', 'limited', 'amount', '.'], ['<url>']]


In [155]:
print([x.body for i, x in enumerate(train) if int(x.label) == 1])

