In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from tqdm import tqdm
import pandas as pd
from pprint import pprint
from copy import deepcopy
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [2]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(0)

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

False


In [3]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [4]:
def pad_to_batch(batch):
    x,y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

In [5]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

Twitter dataset from https://www.kaggle.com/kazanova/sentiment140

In [7]:
data = pd.read_csv('training.1600000.processed.noemoticon.csv', 
                   encoding = "ISO-8859-1",
                  error_bad_lines=False);

In [8]:
data = data.iloc[:,[0,-1]]
data.columns = ["Sentiment", "Tweet"]
data.tail()

Unnamed: 0,Sentiment,Tweet
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [9]:
dictionary = {0: "Negative", 2: "Neutral", 4: "Positive"}
data["Sentiment"] = data["Sentiment"].replace(dictionary, regex = True)

In [10]:
df = list(data[["Sentiment", "Tweet"]].itertuples(index=False, name=None))

In [11]:
df[0]

('Negative',
 "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!")

In [12]:
y, X = list(zip(*df))
X = list(X);X[35:40]

['@cocomix04 ill tell ya the story later  not a good day and ill be workin for like three more hours...',
 '@MissXu sorry! bed time came here (GMT+1)   http://is.gd/fNge',
 "@fleurylis I don't either. Its depressing. I don't think I even want to know about the kids in suitcases. ",
 "Bed. Class 8-12. Work 12-3. Gym 3-5 or 6. Then class 6-10. Another day that's gonna fly by. I miss my girlfriend ",
 "really don't feel like getting up today... but got to study to for tomorrows practical exam... "]

In [13]:
for i in tqdm(range(len(X))):
    X[i] = " ".join(filter(lambda x:x[0]!='@', X[i].split()))
    X[i] = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', X[i])
    X[i] = re.sub('[$.!?&%]', '', X[i])
    X[i] = re.sub('\d', '#', X[i]).split()

100%|██████████| 1599999/1599999 [00:25<00:00, 61795.76it/s]


In [14]:
new = [pair for pair in zip(X,y) if len(pair[0]) != 0] 

In [15]:
X, y = list(zip(*new))
X = list(X);

In [16]:
X[10]

['I', 'just', 're-pierced', 'my', 'ears']

In [17]:
vocab = list(set(flatten(X)));len(vocab)

672801

In [18]:
word2index = {"<PAD>": 0, "<UNK>":1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

target2index = {}

for target in set(y):
    if target2index.get(target) is None:
        target2index[target] = len(target2index)
        
index2target = {v:k for k, v in target2index.items()}

In [19]:
X_p, y_p = [], []

for pair in tqdm(zip(X, y), total = len(X)):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))

100%|██████████| 1597009/1597009 [00:50<00:00, 31321.36it/s]


In [20]:
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

train_data = data_p[: int(len(data_p) * 0.8)]
test_data = data_p[int(len(data_p) * 0.8):]

In [21]:
train_data[:2]

[(tensor([[484880, 230938, 631122,  29546,  52343, 467811, 153934]]),
  tensor([[0]])),
 (tensor([[566754, 255547, 376131, 266059, 341162]]), tensor([[1]]))]

In [22]:
import gensim

Download google pretrained word2vec word embeddings from https://github.com/mmihaltz/word2vec-GoogleNews-vectors 

In [23]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [24]:
pretrained = []

for key in word2index.keys():
    try:
        pretrained.append(model[word2index[key]])
    except:
        pretrained.append(np.random.rand(300))
pretrained_vectors = np.vstack(pretrained)

In [25]:
pretrained_vectors.shape

(672803, 300)

In [26]:
class  CNNClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernal_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

In [27]:
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [2,3,4]
KERNEL_DIM = 100
LR = 0.001

In [28]:
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors)

if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = LR)

In [None]:
for epoch in tqdm(range(EPOCH)):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = pad_to_batch(batch)
        
        model.zero_grad()
        preds = model(inputs, True)
        
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist())
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []