## CNN sentence classification

In [1]:
import pandas as pd
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import vocab
from torchtext import data
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator
from torchtext.vocab import GloVe
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [2]:
def tokenizer(x):
    return x.split()

In [3]:
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
max_vocab = 8000
fix_length=30

In [4]:
TEXT = Field(sequential=True, tokenize=tokenizer, pad_token=BLANK_WORD, lower=True, batch_first=True, fix_length=fix_length)
LABEL = Field(sequential=False, unk_token=None)

In [5]:
train_data = TabularDataset(path='./text_emotion.csv', 
                            format='csv', 
                            skip_header=True,
                            fields=[("tweet_id", None),("sentiment", LABEL),("author", None),("content",TEXT)])

In [6]:
glove = vocab.Vectors('data/glove.6B.300d.txt')
tqdm_notebook().pandas() 

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
#vec = vocab.Vectors('glove.twitter.27B.100d.txt', './data/glove_embedding/')
#https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8

In [8]:
#TEXT.build_vocab(train_data, max_size=max_vocab)
TEXT.build_vocab(train_data, min_freq=3)
LABEL.build_vocab(train_data)

In [9]:
TEXT.vocab.set_vectors(glove.stoi, glove.vectors, dim=300)
TEXT.fix_length = fix_length

In [10]:
one_example = train_data.examples[0]
one_example.content[:3]

['@tiffanylue', 'i', 'know']

In [11]:
train_loader = Iterator(train_data, 
                        batch_size=64, 
                        device=-1, 
                        repeat=False, )

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [12]:
for batch in train_loader:
    break;
print(batch.content.shape)
print(batch.sentiment.shape)

torch.Size([64, 30])
torch.Size([64])


In [13]:
TEXT.vocab.itos

['<unk>',
 '<blank>',
 'i',
 'to',
 'the',
 'a',
 'my',
 'and',
 'you',
 'is',
 'in',
 'for',
 'it',
 'of',
 'on',
 'have',
 'so',
 'that',
 'me',
 'but',
 'just',
 "i'm",
 'with',
 'be',
 'at',
 'was',
 'not',
 'day',
 'this',
 'all',
 'get',
 'good',
 'like',
 'are',
 'out',
 'up',
 '-',
 "it's",
 'your',
 'go',
 'no',
 'got',
 'now',
 'going',
 'love',
 'do',
 'from',
 'happy',
 'will',
 'work',
 'im',
 'what',
 'we',
 "don't",
 'about',
 'u',
 'one',
 'really',
 'back',
 'its',
 'too',
 'am',
 'had',
 'see',
 'can',
 'know',
 'some',
 "can't",
 'if',
 'time',
 'new',
 'when',
 'as',
 'lol',
 'want',
 'think',
 'how',
 '&amp;',
 'still',
 'an',
 'today',
 'they',
 'miss',
 'last',
 '2',
 'more',
 'off',
 'need',
 'oh',
 'hope',
 'has',
 'there',
 'been',
 'home',
 'much',
 'feel',
 'thanks',
 'night',
 'great',
 'only',
 'or',
 'would',
 'he',
 'her',
 'wish',
 'then',
 'well',
 'why',
 'very',
 'here',
 "i'll",
 'by',
 'make',
 'gonna',
 'she',
 'did',
 "that's",
 'getting',
 'twit

In [14]:
len(train_data)

40000

In [15]:
TEXT.vocab.itos[4]

'the'

In [16]:
def init_network(model, method='xavier', exclude='embedding', seed=123):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    for name, w in model.named_parameters():
        if not exclude in name:
            if 'weight' in name:
                if method is 'xavier':
                    nn.init.xavier_normal_(w)
                elif method is 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0.0)
            else: 
                pass

In [17]:
def print_model(model, ignore='embedding'):
    total = 0
    for name, w in model.named_parameters():
        if not ignore or ignore not in name:
            total += w.nelement()
            print('{} : {}  {} parameters'.format(name, w.shape, w.nelement()))
    print('-------'*4)
    print('Total {} parameters'.format(total))

In [18]:
batch_size=1024
epochs=200
embidding_dim = 300
seq_length = 50
vocab_size = len(TEXT.vocab.itos)
num_filters = 128
kernel_sizes = [3,4,5]
hidden_dim = 128 # hidden size of fully conntected layer
label_size = len(LABEL.vocab)
print_every = 1000

In [19]:
class TextCNN(nn.Module):
    #num_filters = out-channels
    def __init__(self, lm, padding_idx, vocab_size, embedding_dim, num_filters, kernel_sizes, num_classes, dropout_prob):
        super(TextCNN, self).__init__()
        
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding.from_pretrained(lm)
        self.embedding.padding_idx = padding_idx
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(kernel_sizes[0], embedding_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(kernel_sizes[1], embedding_dim))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(kernel_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, num_classes)
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, inputs):
        #x = [batch size, sent len]
        x = inputs

        #embedded = [batch size, sent len, emb dim]
        embedded = self.embedding(x)
        #print(embedded.shape)
        
        #embedded = [batch size, 1, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        #print(embedded.shape)
        
        #conv_n = [batch size, n_filters, sent_len - filter_sizes[n]]
        conved1 = F.relu(self.conv1(embedded).squeeze(3))
        conved2 = F.relu(self.conv2(embedded).squeeze(3))
        conved3 = F.relu(self.conv3(embedded).squeeze(3))
        #print(conved11.shape)
        
        #pooled_n = [batch size, n_filters]
        pooled1 = F.max_pool1d(conved1, conved1.shape[2]).squeeze(2)
        pooled2 = F.max_pool1d(conved2, conved2.shape[2]).squeeze(2)
        pooled3 = F.max_pool1d(conved3, conved3.shape[2]).squeeze(2)
        #print(pooled11.shape)
        
        
        #cat = [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat((pooled1, pooled2, pooled3), dim=1))
        #print(cat.shape)
        
        fc = self.fc(cat)
        return fc
        #return F.log_softmax(fc)

In [20]:
# GPU 사용할 수 있는지 확인
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')
#train_on_gpu = False

Training on GPU!


In [31]:
model = TextCNN(TEXT.vocab.vectors, TEXT.vocab.stoi[TEXT.pad_token], vocab_size, embidding_dim, num_filters, kernel_sizes, label_size, 0.1)
init_network(model)
if(train_on_gpu):
    model.cuda()
model.train()

TextCNN(
  (embedding): Embedding(11631, 300, padding_idx=1)
  (conv1): Conv2d(1, 128, kernel_size=(3, 300), stride=(1, 1))
  (conv2): Conv2d(1, 128, kernel_size=(4, 300), stride=(1, 1))
  (conv3): Conv2d(1, 128, kernel_size=(5, 300), stride=(1, 1))
  (fc): Linear(in_features=384, out_features=13, bias=True)
  (dropout): Dropout(p=0.1)
)

In [34]:
criterion = F.cross_entropy
#criterion = nn.BCEWithLogitsLoss()
#optimizer = optim.Adam(model.parameters(), lr=1e-3)
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-3,momentum=0.8)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
print_model(model, ignore=None)

embedding.weight : torch.Size([11631, 300])  3489300 parameters
conv1.weight : torch.Size([128, 1, 3, 300])  115200 parameters
conv1.bias : torch.Size([128])  128 parameters
conv2.weight : torch.Size([128, 1, 4, 300])  153600 parameters
conv2.bias : torch.Size([128])  128 parameters
conv3.weight : torch.Size([128, 1, 5, 300])  192000 parameters
conv3.bias : torch.Size([128])  128 parameters
fc.weight : torch.Size([13, 384])  4992 parameters
fc.bias : torch.Size([13])  13 parameters
----------------------------
Total 3955489 parameters


In [35]:
best_acc = 0.0
counter = 0
index = 0

for e in range(epochs):
    for i,batch in enumerate(train_loader):
        counter += 1
        
        #if len(batch) != batch_size: continue
        if(train_on_gpu):
            inputs, targets = Variable(batch.content).cuda(), Variable(batch.sentiment).cuda()
        else:
            inputs, targets = batch.content, batch.sentiment
        counter += 1
        model.zero_grad()
        
        output = model(inputs)
        #print("output: ", output.shape)
        #print("targets: ", targets.shape)
        
        #loss = criterion(output, targets)
        loss = F.cross_entropy(output, targets.view(-1))
        loss.backward()
        optimizer.step()
        
        if counter % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()))

Epoch: 1/200... Step: 1000... Loss: 2.0054...
Epoch: 2/200... Step: 2000... Loss: 1.9270...
Epoch: 3/200... Step: 3000... Loss: 1.7252...
Epoch: 4/200... Step: 4000... Loss: 1.4458...
Epoch: 4/200... Step: 5000... Loss: 1.5459...
Epoch: 5/200... Step: 6000... Loss: 1.6439...
Epoch: 6/200... Step: 7000... Loss: 1.2206...
Epoch: 7/200... Step: 8000... Loss: 0.9150...
Epoch: 8/200... Step: 9000... Loss: 0.7754...
Epoch: 8/200... Step: 10000... Loss: 1.0881...
Epoch: 9/200... Step: 11000... Loss: 0.9381...
Epoch: 10/200... Step: 12000... Loss: 0.9848...
Epoch: 11/200... Step: 13000... Loss: 0.6031...
Epoch: 12/200... Step: 14000... Loss: 0.7047...
Epoch: 12/200... Step: 15000... Loss: 0.6862...
Epoch: 13/200... Step: 16000... Loss: 0.7743...
Epoch: 14/200... Step: 17000... Loss: 0.5336...
Epoch: 15/200... Step: 18000... Loss: 0.6734...
Epoch: 16/200... Step: 19000... Loss: 0.5369...
Epoch: 16/200... Step: 20000... Loss: 0.5121...
Epoch: 17/200... Step: 21000... Loss: 0.6138...
Epoch: 18/20

Epoch: 136/200... Step: 170000... Loss: 0.6293...
Epoch: 137/200... Step: 171000... Loss: 0.2843...
Epoch: 138/200... Step: 172000... Loss: 0.0619...
Epoch: 139/200... Step: 173000... Loss: 0.0563...
Epoch: 140/200... Step: 174000... Loss: 0.3814...
Epoch: 140/200... Step: 175000... Loss: 0.3445...
Epoch: 141/200... Step: 176000... Loss: 0.1387...
Epoch: 142/200... Step: 177000... Loss: 0.2497...
Epoch: 143/200... Step: 178000... Loss: 0.1744...
Epoch: 144/200... Step: 179000... Loss: 0.1871...
Epoch: 144/200... Step: 180000... Loss: 0.3278...
Epoch: 145/200... Step: 181000... Loss: 0.2158...
Epoch: 146/200... Step: 182000... Loss: 0.2493...
Epoch: 147/200... Step: 183000... Loss: 0.1050...
Epoch: 148/200... Step: 184000... Loss: 0.2431...
Epoch: 148/200... Step: 185000... Loss: 0.1136...
Epoch: 149/200... Step: 186000... Loss: 0.1674...
Epoch: 150/200... Step: 187000... Loss: 0.2166...
Epoch: 151/200... Step: 188000... Loss: 0.2013...
Epoch: 152/200... Step: 189000... Loss: 0.0428...


In [24]:
import os
filename = "document_cls_text_cnn10.pth"
PATH = os.path.join("model", filename)
#torch.save(model.state_dict(), PATH)

In [25]:
#model = TextCNN(TEXT.vocab.vectors, TEXT.vocab.stoi[TEXT.pad_token], num_filters, kernel_sizes, label_size, 0.5)
model = TextCNN(TEXT.vocab.vectors, TEXT.vocab.stoi[TEXT.pad_token], vocab_size, embidding_dim, num_filters, kernel_sizes, label_size, 0.1)

In [26]:
model.load_state_dict(torch.load(PATH))

In [144]:
#sentence = "How are YOU convinced that I have always wanted you? What signals did I give off...damn I think I just lost another friend"
#sentence = "The storm is here and the electricity is gone"
#sentence = "Damm servers still down  i need to hit 80 before all the koxpers pass me"
#sentence = "Need to pack for CALI CALI! Cannot waittt! Thinking a glass of wine is in order to celebrate my weekend vaca. Still work 2morrow, tho."
#sentence = "I'm worried I can do anything"
##sentence = "I felt ecstatic when I passed my exam"
#sentence = "I was overjoyed at the birth of my son."
##sentence = "During the Christmas holidays I felt wonderfully merry."
#sentence = "I’m feeling a little low at the moment."
sentence = "I was so annoyed when I failed my English test."
sentence = "Afraid of your own shadow"

In [145]:
s = [TEXT.vocab.stoi[word.lower()] for word in tokenizer(sentence)]
s

[1280, 13, 38, 493, 8253]

In [146]:
nse = np.asarray(s)
feature_tensor = torch.from_numpy(nse)
feature_tensor = feature_tensor.unsqueeze(0)
batch_size = feature_tensor.size(0)

In [147]:
if(train_on_gpu):
    feature_tensor = feature_tensor.cuda()
    model.cuda()

model.eval()
print(feature_tensor.shape)

torch.Size([1, 5])


In [148]:
output = model(feature_tensor).squeeze()
output

tensor([  2.3611,   8.2147,  -4.5427,  -1.5517,  -0.9713,   3.3712,  -8.3086,
         -3.6140,  -3.4412,   2.0452,  -6.9978, -13.7870,  -5.4736],
       device='cuda:0', grad_fn=<SqueezeBackward0>)

In [149]:
#pred = torch.sigmoid(output.view(-1)).cpu().data.numpy().tolist()
#loss = F.cross_entropy(logit, target, reduction='sum')
pred = F.softmax(output)

  This is separate from the ipykernel package so we can avoid doing imports until


In [150]:
_, predicted = torch.max(output, 0)
value = predicted.data.tolist()
value

1

In [151]:
LABEL.vocab.itos[value]

'worry'

In [41]:
LABEL.vocab.itos

['neutral',
 'worry',
 'happiness',
 'sadness',
 'love',
 'surprise',
 'fun',
 'relief',
 'hate',
 'empty',
 'enthusiasm',
 'boredom',
 'anger']