In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, vocab
import torch.optim as optim
import pandas as pd
from tqdm import tqdm
import random
import re
import spacy

SEED = 1994

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Generating bigrams as suggested by FastText. It appends bigrams to the end of the sentences.

In [2]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [3]:
TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
LABEL = data.LabelField(sequential=False, use_vocab=False, dtype = torch.float)

Training only on 10% of the original dataset ~150,000 tweets

In [4]:
df = data.TabularDataset(
        path='subset.csv', format='csv',
        fields=[('text', TEXT), ('label', LABEL)],
        skip_header = True)

In [5]:
train_data, valid_data = df.split(random_state=random.seed(SEED))

In [6]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 112000
Number of validation examples: 48000


In [7]:
train_data[5].text

['i',
 'feel',
 'so',
 'unbelievably',
 'hideously',
 'crap',
 'today',
 ' ',
 'but',
 'i',
 'have',
 'so',
 'much',
 'work',
 'to',
 'do',
 'i',
 'should',
 'just',
 'get',
 'on',
 'with',
 'it',
 '.',
 'should just',
 '  but',
 'get on',
 'just get',
 'so unbelievably',
 'feel so',
 'to do',
 'do i',
 'work to',
 'i have',
 'today  ',
 'crap today',
 'much work',
 'but i',
 'hideously crap',
 'it .',
 'unbelievably hideously',
 'on with',
 'i feel',
 'with it',
 'so much',
 'have so',
 'i should']

In [8]:
train_data[5].label

'0'

We are going to use pretrained GloVe vectors. The pretrained word embeddings for Twitter data comes in 4 sizes. I have chosen the largest word embedding (200 dimensions)

In [9]:
vec = vocab.Vectors('glove.twitter.27B.200d.txt')

In [13]:
TEXT.build_vocab(train_data, valid_data, max_size = 25000, vectors = vec)

In [14]:
LABEL.build_vocab(train_data)

In [15]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [16]:
italy = TEXT.vocab.vectors[TEXT.vocab.stoi['italy']];italy

tensor([ 0.1135,  0.2170,  0.4643,  0.7430, -0.7222, -0.2391, -0.2278, -0.2432,
         0.9551, -0.8257,  0.1303,  0.0845, -0.9726, -0.2043, -0.0751, -0.3190,
        -0.1875, -0.0026, -0.5369,  0.1796,  0.2475, -0.2581, -0.4072,  0.3753,
         0.2138,  0.3937, -0.0552, -0.3846, -0.4128,  0.5225,  0.1712, -0.0910,
         0.5309,  0.8437,  0.6932, -0.6771,  0.5844, -0.1318,  0.0964,  0.3654,
         0.4073, -0.3495, -0.5212, -0.0173, -0.0656, -0.7051, -0.0181, -0.1174,
        -0.6284,  0.1590, -0.9738,  0.0019,  0.6969, -0.0835,  0.1399, -0.2120,
         0.6052,  0.2179, -0.0760, -0.3116, -0.4843,  0.8462, -0.0819,  0.3877,
         0.2572,  0.8986, -0.0341,  0.3691,  0.0783, -0.3254, -1.0125, -0.1220,
         1.0881,  0.6419, -0.2615,  0.0723, -0.4124,  0.1466,  0.2530,  0.0599,
         0.7032, -0.5501,  0.5050, -0.3880, -0.4303, -0.3041, -0.3142,  0.2204,
        -0.6796, -0.0640,  0.0872, -0.2838, -0.0974, -0.1704, -0.7401,  0.4895,
        -0.3014,  0.0279, -0.5424, -0.58

In [17]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    datasets = (train_data, valid_data), 
    batch_sizes = (128,64),
    sort_key = lambda x:len(x.text),
    sort_within_batch = True,
    repeat = False,
    device=device)

In [18]:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):       
        embedded = self.embedding(x)       
        embedded = embedded.permute(1, 0, 2)        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)                 
        return self.fc(pooled)

In [19]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
OUTPUT_DIM = 1
model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

Loading pretrained word embeddings as the weight initializer for our embedding layer...

In [20]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.5545, -0.1426, -0.0038,  ..., -0.2828, -0.0693,  1.2271],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [21]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [23]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):  
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print("Epoch:", epoch+1, "| Train Loss:", "%.3f" % train_loss, "| Train Acc:", "%.3f" % train_acc,"| Valid Loss:", "%.3f" % valid_loss, "| Valid Acc:", "%.3f" % valid_acc )

Epoch: 1 | Train loss: 0.527 | Train accuracy: 0.745 | Valid loss: 0.452 | Valid accuracy: 0.791
Epoch: 2 | Train loss: 0.414 | Train accuracy: 0.814 | Valid loss: 0.439 | Valid accuracy: 0.796
Epoch: 3 | Train loss: 0.378 | Train accuracy: 0.833 | Valid loss: 0.445 | Valid accuracy: 0.798
Epoch: 4 | Train loss: 0.357 | Train accuracy: 0.844 | Valid loss: 0.458 | Valid accuracy: 0.795
Epoch: 5 | Train loss: 0.343 | Train accuracy: 0.851 | Valid loss: 0.472 | Valid accuracy: 0.792


In [26]:
import spacy
nlp = spacy.load('en')
def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    if prediction.item()>0.5:
        return ("Positive", prediction.item())
    else:
        return ("Negative", prediction.item())

In [27]:
predict_sentiment("this tutorial is very useful")

('Positive', 0.9998857975006104)

In [28]:
predict_sentiment("i don't know why any would use tensorflow instead of pytorch")

('Negative', 0.06993444263935089)