In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, vocab
import torch.optim as optim
import pandas as pd
from tqdm import tqdm
import random
import re
import spacy

# Single seed reused by the seeding calls below and by the train/validation split.
SEED = 1994

# Seed PyTorch's CPU and CUDA random number generators.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

Generating bigrams as suggested by FastText: the bigrams of each sentence are appended to the end of its token list.

In [None]:
def generate_bigrams(x):
    """Append every distinct bigram of the token list ``x`` to its end.

    Each bigram is added once, as the two tokens joined by a single space,
    in the order in which it first occurs in ``x``.

    Args:
        x: list of string tokens. It is extended in place.

    Returns:
        The same list object ``x``, now followed by its bigrams.
    """
    # dict.fromkeys de-duplicates like a set would, but keeps first-occurrence
    # order. Iterating a set of string tuples yields an order that changes from
    # one interpreter run to the next (string hashing is randomised), which
    # makes the resulting token sequence non-reproducible.
    distinct_bigrams = dict.fromkeys(zip(x, x[1:]))
    x.extend(' '.join(pair) for pair in distinct_bigrams)
    return x

Adding a custom tokenizer that pads any sentence shorter than the largest filter size.

Everything works fine unless the longest sentence in a batch is shorter than the largest filter size. We use filters of sizes 3, 4, and 5, so if the longest sentence in a batch doesn't have at least 5 tokens, there is going to be an error.

In [None]:
## Code borrowed from https://stackoverflow.com/questions/51252221/torchtext-bucketiterator-minimum-padding
spacy_en = spacy.load('en')
# Heights of the convolution filters; the tokenizer pads up to the largest one.
FILTER_SIZES = [3,4,5]

def tokenizer(text):
    """Tokenize ``text`` with spaCy and pad it to the largest filter size.

    Args:
        text: the raw sentence as a string.

    Returns:
        A list of token strings containing at least ``max(FILTER_SIZES)``
        entries; short sentences are padded at the end with ``"<PAD>"``.
    """
    tokens = [t.text for t in spacy_en.tokenizer(text)]
    # max() rather than FILTER_SIZES[-1], so the minimum length stays correct
    # even if FILTER_SIZES is not listed in ascending order.
    shortfall = max(FILTER_SIZES) - len(tokens)
    if shortfall > 0:
        # NOTE(review): "<PAD>" is an ordinary token string here; torchtext's
        # Field pads batches with "<pad>" by default — confirm whether the two
        # are meant to be the same token.
        tokens.extend(["<PAD>"] * shortfall)
    return tokens
    

In [None]:
# Text field: tokenised by `tokenizer`, then `generate_bigrams` runs as the preprocessing step.
TEXT = data.Field(tokenize=tokenizer, preprocessing=generate_bigrams)
# Label field: no vocabulary lookup (use_vocab=False); labels are produced with dtype torch.float.
LABEL = data.LabelField(sequential=False, use_vocab=False, dtype = torch.float)

Training on only 10% of the original dataset (~150,000 tweets).

In [None]:
def random_sample(file, frac, random_state=None, out_path='subset.csv'):
    """Write a reproducible random subset of a labelled CSV file.

    Reads ``file``, draws a fraction ``frac`` of its rows, replaces the
    "Negative"/"Positive" values of the ``Sentiment`` column with 0/1 and
    writes the result (without the index) to ``out_path``.

    Args:
        file: path of the CSV file to sample from; it must contain a
            ``Sentiment`` column.
        frac: fraction of rows to keep, passed to ``DataFrame.sample``.
        random_state: seed for the row sampling. When ``None`` the
            notebook-wide ``SEED`` is used, so re-running the notebook
            produces the same subset.
        out_path: where the sampled CSV is written.

    Returns:
        None. The sampled data is only written to ``out_path``.
    """
    if random_state is None:
        # Without a seed, DataFrame.sample draws different rows on every run,
        # which would make every downstream result irreproducible.
        random_state = SEED
    df = pd.read_csv(file)
    df_percent = df.sample(frac=frac, random_state=random_state)
    dictionary = {"Negative": 0, "Positive": 1}
    # regex=True is kept from the original code: the dictionary keys are
    # treated as patterns rather than as exact cell values.
    df_percent["Sentiment"] = df_percent["Sentiment"].replace(dictionary, regex = True)
    df_percent.to_csv(out_path, index = False)
# Sample 10% of preprocessed.csv; the result is written to subset.csv.
random_sample('preprocessed.csv', 0.1)

In [None]:
# Load the sampled CSV as a torchtext dataset, skipping the header row and
# binding the fields to TEXT and LABEL in that order.
# NOTE(review): assumes subset.csv holds the text column followed by the label column — confirm.
# NOTE(review): despite its name, `df` is a TabularDataset, not a pandas DataFrame.
df = data.TabularDataset(
        path='subset.csv', format='csv',
        fields=[('text', TEXT), ('label', LABEL)],
        skip_header = True)

In [None]:
# random.seed() returns None, so passing its result as random_state only worked
# through its side effect on the global RNG. Seed first, then hand split() an
# explicit RNG state so the reproducibility is visible in the code.
random.seed(SEED)
train_data, valid_data = df.split(random_state=random.getstate())

In [None]:
# Report how many examples ended up in each split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

In [None]:
# Sanity check: show the `text` attribute of training example 9.
train_data[9].text

In [None]:
# Sanity check: show the `label` attribute of the same training example.
train_data[9].label

We are going to use pretrained GloVe vectors. The pretrained word embeddings for Twitter data come in 4 sizes. I have chosen the largest word embedding (200 dimensions).

In [None]:
# Load the 200-dimensional GloVe Twitter vectors from a local text file.
vec = vocab.Vectors('glove.twitter.27B.200d.txt')

In [None]:
# Build the token vocabulary (limited by max_size = 50000) and attach the GloVe vectors to it.
# NOTE(review): valid_data is included when building the vocabulary, so validation
# tokens influence it — confirm this is intended rather than using train_data only.
TEXT.build_vocab(train_data, valid_data, max_size = 50000, vectors = vec)

In [None]:
# NOTE(review): LABEL was created with use_vocab=False, so this vocabulary is
# presumably not used to encode labels — confirm whether this call is needed.
LABEL.build_vocab(train_data)

In [None]:
# Report the size of each vocabulary built above.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

In [None]:
# Batch sizes for the training and validation iterators. BATCH_SIZE keeps its
# original name and value and is now actually used (for validation) instead of
# being shadowed by a hard-coded tuple.
TRAIN_BATCH_SIZE = 128
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Iterators that batch examples of similar length together, using the token
# count as the sort key, and place the batches on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    datasets = (train_data, valid_data),
    batch_sizes = (TRAIN_BATCH_SIZE, BATCH_SIZE),
    sort_key = lambda x:len(x.text),
    sort_within_batch = True,
    repeat = False,
    device=device)

In [None]:
class CNN(nn.Module):
    """Convolutional text classifier with several parallel filter heights.

    Tokens are embedded, every convolution slides a filter spanning the full
    embedding width over the token axis, each feature map is max-pooled over
    that axis, and the pooled features are concatenated, passed through
    dropout and projected by a linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: width of each embedding vector.
            n_filters: output channels of every convolution.
            filter_sizes: iterable of filter heights (tokens covered per filter).
            output_dim: number of outputs of the final linear layer.
            dropout: dropout probability applied to the pooled features.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One convolution per filter height, each as wide as an embedding.
        branches = []
        for height in filter_sizes:
            branches.append(
                nn.Conv2d(in_channels=1,
                          out_channels=n_filters,
                          kernel_size=(height, embedding_dim))
            )
        self.convs = nn.ModuleList(branches)

        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Compute one output row per sequence.

        Args:
            inputs: 2-D tensor of token indices whose first axis is the token
                position and whose second axis is the batch.

        Returns:
            Tensor with one row per sequence and ``output_dim`` columns.
        """
        # Put the batch axis first, embed, then add a single input channel.
        batch_first = inputs.permute(1, 0)
        embedded = self.embedding(batch_first).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The filter spans the whole embedding width, so the last axis
            # has size 1 and is dropped.
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Max over every position along the token axis.
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        features = self.dropout(torch.cat(pooled, dim = 1))
        return self.fc(features)

In [None]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200  # must match the width of the GloVe vectors copied into the embedding layer
N_FILTERS = 100
# Derived from FILTER_SIZES rather than repeated: the tokenizer pads sentences
# to the largest filter size, so both must always describe the same filters.
F_SIZES = tuple(FILTER_SIZES)
OUTPUT_DIM = 1
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, F_SIZES, OUTPUT_DIM, DROPOUT)

Loading pretrained word embeddings as the weight initializer for our embedding layer...

In [None]:
# Overwrite the embedding layer's weights in place with the vectors attached to the TEXT vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is created over the
# parameters that will actually be trained.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters())

In [None]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match the targets.

    Args:
        preds: tensor of raw model outputs (logits).
        y: tensor of targets, compared element-wise against the rounded
            sigmoid of ``preds``.

    Returns:
        A tensor holding the number of matches divided by ``len`` of the
        comparison result.
    """
    hard_labels = preds.sigmoid().round()
    hits = hard_labels.eq(y).float()  # float so the division below is not integer division
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called on ``batch.text``; its output is squeezed on axis 1.
        iterator: sized iterable of batches exposing ``text`` and ``label``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(loss, accuracy)``: the per-batch values summed and divided
        by the number of batches.
    """
    running_loss, running_acc = 0, 0
    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        targets = batch.label
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: module called on ``batch.text``; its output is squeezed on axis 1.
        iterator: sized iterable of batches exposing ``text`` and ``label``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(loss, accuracy)``: the per-batch values summed and divided
        by the number of batches.
    """
    total_loss, total_acc = 0, 0
    model.eval()

    # No gradients are tracked while evaluating.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label
            batch_loss = criterion(logits, targets)
            batch_acc = binary_accuracy(logits, targets)
            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    # One optimisation pass, then a validation pass.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Epochs are reported 1-based; every metric is shown with three decimals.
    print(
        "Epoch:", epoch+1,
        "| Train Loss:", "%.3f" % train_loss,
        "| Train Acc:", "%.3f" % train_acc,
        "| Valid Loss:", "%.3f" % valid_loss,
        "| Valid Acc:", "%.3f" % valid_acc,
    )

In [None]:
# Alias kept for any later cell that still refers to `nlp`; it reuses the spaCy
# pipeline already loaded for `tokenizer` instead of loading the model twice.
nlp = spacy_en

def predict_sentiment(sentence):
    """Classify a raw sentence with the trained model.

    The sentence goes through ``TEXT.preprocess``, i.e. the same tokenizer
    (including the padding to the largest filter size) and the same bigram
    step used for the training data, so short sentences no longer break the
    convolutions and the model sees inputs shaped like its training inputs.

    Args:
        sentence: the raw text to classify.

    Returns:
        Tuple ``(label, probability)`` where ``probability`` is the sigmoid
        of the model output and ``label`` is ``"Positive"`` when it is
        greater than 0.5, otherwise ``"Negative"``.
    """
    tokens = TEXT.preprocess(sentence)
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    # Column vector: one sequence, batch size 1 on the second axis.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)

    # Evaluation mode disables dropout; no gradients are needed for inference.
    model.eval()
    with torch.no_grad():
        probability = torch.sigmoid(model(tensor)).item()

    label = "Positive" if probability > 0.5 else "Negative"
    return (label, probability)

In [None]:
# Try the classifier on a hand-written sentence.
predict_sentiment("this tutorial is very useful")

In [None]:
# Try the classifier on a second hand-written sentence.
predict_sentiment("i don't know why any would use tensorflow instead of pytorch")