With a little help from
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
and my solutions from the class 
https://edu.epfl.ch/coursebook/en/data-and-artificial-intelligence-for-transportation-CIVIL-459

In [1]:
import numpy as np
import pickle

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
import torch.utils.data as utils

from helpers import *

%load_ext autoreload
%autoreload 2

In [4]:
# define file paths

path_embeddings = 'pretrained_glove/embeddings200_pretrained_reduced.npy'
path_vocab = 'pretrained_glove/vocab_pretrained_reduced.pkl'
path_train_pos = 'pos_train.txt'
path_train_neg = 'neg_train.txt'
path_test = 'test_data.txt'

In [44]:
# load word embeddings
embeddings = np.load(path_embeddings)
# add line of zeroes to the embeddings for empty words
embeddings = np.append(np.zeros((1, embeddings.shape[1])), embeddings, axis=0)
# load vocabulary
with open(path_vocab, 'rb') as f:
    vocab = pickle.load(f)

In [45]:
# find maximal tweet length (number of words)
longest = 0
for file in [path_train_pos, path_train_neg, path_test]:
    with open(file) as f:
        for line in f:
            length = len(line.strip().split())
            if length > longest:
                longest = length
            
print("Longest tweet has {:d} words".format(longest))


Longest tweet has 64 words


In [121]:
x = []
y = []

with open(path_train_pos) as f:
    for line in f:
        tweet = np.zeros((longest)).astype(int)
        wordcount = 0
        y.append(1)
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                tweet[wordcount] = index + 1
                wordcount += 1
        x.append(tweet)
        
with open(path_train_neg) as f:
    for line in f:
        tweet = np.zeros((longest)).astype(int)
        wordcount = 0
        y.append(0)
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                tweet[wordcount] = index + 1
                wordcount += 1
        x.append(tweet)

x_train = np.asarray(x)
y_train = np.asarray(y)
 
# Shuffle tweets
x_train, y_train = shuffle(x_train, y_train)

In [122]:
x = []

with open(path_test) as f:
    for line in f:
        tweet = np.zeros((longest)).astype(int)
        wordcount = 0
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                tweet[wordcount] = index + 1
                wordcount += 1
        x.append(tweet)

x_submission = np.asarray(x)

In [130]:
class ConvNet(nn.Module):
    def __init__(self, embeddings):
        super().__init__()
        
        n_channels = 16
        filter_size = 3
        
        embedding_dim = embeddings.shape[1]        
        self.embeddings = torch.nn.Embedding.from_pretrained(embeddings)
        self.conv = torch.nn.Conv2d(1, n_channels, kernel_size=(filter_size, embedding_dim))
        self.fc = nn.Linear(n_channels, 1)

    def forward(self, x):   
        x = self.embeddings(x)
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = x.squeeze(3)
        x = F.relu(x)
        x = F.max_pool1d(x, x.shape[2])
        x = x.squeeze(2)
        x = F.dropout(x, 0)
        x = self.fc(x)
        x = F.sigmoid(x)
        return x.squeeze(1)
    
    def predict(self, x):
        pred = torch.round(self.forward(x))
        return pred

In [131]:
net = ConvNet(torch.from_numpy(embeddings).float())
valset = 10000
batch_size = 64
epochs = 20
print_every = 20
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters())


x_val_torch = torch.from_numpy(x_train[0:valset, :])
y_val_torch = torch.from_numpy(y_train[0:valset]).float()
x_train_torch = torch.from_numpy(x_train[valset + 1:, :])
y_train_torch = torch.from_numpy(y_train[valset + 1:]).float()

train_set = utils.TensorDataset(x_train_torch, y_train_torch)
train_loader = utils.DataLoader(train_set, batch_size, shuffle=False)

val_set = utils.TensorDataset(x_val_torch, y_val_torch)
val_loader = utils.DataLoader(val_set, batch_size, shuffle=False)

steps = 0
running_loss = 0

for e in range(epochs):
    for tweets, labels in iter(train_loader):
        steps += 1
        inputs = Variable(tweets)
        targets = Variable(labels)
        optimizer.zero_grad()
        outputs = net.forward(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.data[0]
        
        if steps % print_every == 0:
            accuracy = 0
            for n, (tweets, labels) in enumerate(val_loader):
                predictions = net.predict(tweets)
                accuracy += sum(predictions.data.numpy() == labels.data.numpy()) / labels.data.numpy().size
            
            print("Epoch {} / {}\t".format(e+1, epochs),
                  "Loss {:.4f}\t".format(running_loss / print_every),
                  "Test accuracy {:.4f}".format(accuracy / n))
            running_loss = 0



Epoch 1 / 20	 Loss 0.7001	 Test accuracy 0.5191
Epoch 1 / 20	 Loss 0.6862	 Test accuracy 0.5280
Epoch 1 / 20	 Loss 0.6777	 Test accuracy 0.5600
Epoch 1 / 20	 Loss 0.6685	 Test accuracy 0.6101
Epoch 1 / 20	 Loss 0.6613	 Test accuracy 0.6469
Epoch 1 / 20	 Loss 0.6519	 Test accuracy 0.6764
Epoch 1 / 20	 Loss 0.6476	 Test accuracy 0.6703
Epoch 1 / 20	 Loss 0.6390	 Test accuracy 0.7190
Epoch 1 / 20	 Loss 0.6473	 Test accuracy 0.7025
Epoch 1 / 20	 Loss 0.6384	 Test accuracy 0.7303
Epoch 1 / 20	 Loss 0.6284	 Test accuracy 0.7343
Epoch 1 / 20	 Loss 0.6323	 Test accuracy 0.7188
Epoch 1 / 20	 Loss 0.6340	 Test accuracy 0.7371
Epoch 1 / 20	 Loss 0.6327	 Test accuracy 0.7527
Epoch 1 / 20	 Loss 0.6309	 Test accuracy 0.7490
Epoch 1 / 20	 Loss 0.6068	 Test accuracy 0.7666
Epoch 1 / 20	 Loss 0.6166	 Test accuracy 0.7433
Epoch 1 / 20	 Loss 0.6288	 Test accuracy 0.7447
Epoch 1 / 20	 Loss 0.6212	 Test accuracy 0.7608
Epoch 1 / 20	 Loss 0.6162	 Test accuracy 0.7572
Epoch 1 / 20	 Loss 0.6141	 Test accuracy

KeyboardInterrupt: 