In [1]:
import pandas as pd, numpy as np
import torch

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data

In [2]:
# Read in the data
test = pd.read_csv("../maindatasets/cleaned_test.csv")
train = pd.read_csv("../maindatasets/cleaned_train.csv")
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(test['Tweet']) == True))

# Remove NaNs
test.dropna(subset=['Tweet'], inplace = True)
train.dropna(subset=['Tweet'], inplace = True)
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column:', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column:', sum(pd.isnull(test['Tweet']) == True))

Number of rows in train: 39780
Number of rows in test: 2000
Number of NaNs in Tweet column:  0
Number of NaNs in Tweet column:  1
Number of rows in train: 39780
Number of rows in test: 1999
Number of NaNs in Tweet column: 0
Number of NaNs in Tweet column: 0


In [3]:
train_X = train.Tweet
train_y = train.iloc[:, 1:2]['Sarcastic']
test_X = test.Tweet
test_y = np.array(test.Sarcastic)
train_data = np.array([train_X, train_y])
train_data

array([['0430yes i hope youre lurking rn i want listen hallucination wanna again live someday pretty please',
        '05 really taught valuable lesson Im never gonna late again Not',
        '098BERRY Never had voice protest fed shit digest wish had reason flaw are open season',
        ..., 'zoso4986 Nero He fag we need but fag we deserve right now',
        'Zuma sounding like Kanye West right now trying explain difference between socialism capitalism Cosatu',
        'ZZUCRU UWDawgPack So true Students stick around have fun at game Then go home screaming family burnt turkey'],
       [0, 0, 0, ..., 1, 1, 1]], dtype=object)

In [4]:
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [5]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [6]:
train_X_split = [x.split() for x in train_X]

In [7]:
word_to_ix = {}
for sent in train_X_split:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [8]:
tag_to_ix = {0: 0, 1: 1}

In [30]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [31]:
EMBEDDING_DIM = 10
HIDDEN_DIM = 10
LR = 0.1

In [32]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), LR)

In [35]:
with torch.no_grad():
    inputs = prepare_sequence(train_X_split[0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-0.7055, -0.6809],
        [-0.7524, -0.6372],
        [-0.7073, -0.6792],
        [-0.7102, -0.6763],
        [-0.7186, -0.6683],
        [-0.7507, -0.6388],
        [-0.7449, -0.6439],
        [-0.8445, -0.5617],
        [-0.7058, -0.6807],
        [-0.6406, -0.7486],
        [-0.7365, -0.6516],
        [-0.7121, -0.6745],
        [-0.7599, -0.6306],
        [-0.7774, -0.6155],
        [-0.7447, -0.6442],
        [-0.7440, -0.6448]])


In [15]:
for sentence, tag in zip(train_X_split, train_y):
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
    model.zero_grad()

    # Also, we need to clear out the hidden state of the LSTM,
    # detaching it from its history on the last instance.
    model.hidden = model.init_hidden()

    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    sentence_in = prepare_sequence(sentence, word_to_ix)
    target = torch.tensor([tag])

    # Step 3. Run our forward pass.
    tag_scores = model(sentence_in)
    print(tag_scores)
    # Step 4. Compute the loss, gradients, and update the parameters by
    #  calling optimizer.step()
    loss = loss_function(tag_scores, torch.tensor([tag]))
    loss.backward()
    optimizer.step()

tensor([[-0.7469, -0.6421],
        [-0.6910, -0.6953],
        [-0.7335, -0.6544],
        [-0.6850, -0.7013],
        [-0.7160, -0.6708],
        [-0.6192, -0.7730],
        [-0.6207, -0.7712],
        [-0.5502, -0.8600],
        [-0.6044, -0.7906],
        [-0.6480, -0.7404],
        [-0.6482, -0.7402],
        [-0.6457, -0.7429],
        [-0.6518, -0.7363],
        [-0.6911, -0.6952],
        [-0.7071, -0.6794],
        [-0.6869, -0.6994]], grad_fn=<LogSoftmaxBackward>)


ValueError: Expected input batch_size (16) to match target batch_size (1).