In [1]:
import pandas as pd, numpy as np
import torch

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

In [2]:
# Read in the data
test = pd.read_csv("../maindatasets/cleaned_test.csv")
train = pd.read_csv("../maindatasets/cleaned_train.csv")
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column: ', sum(pd.isnull(test['Tweet']) == True))

# Remove NaNs
test.dropna(subset=['Tweet'], inplace = True)
train.dropna(subset=['Tweet'], inplace = True)
print('Number of rows in train:', len(train))
print('Number of rows in test:', len(test))
print('Number of NaNs in Tweet column:', sum(pd.isnull(train['Tweet']) == True))
print('Number of NaNs in Tweet column:', sum(pd.isnull(test['Tweet']) == True))

Number of rows in train: 39780
Number of rows in test: 2000
Number of NaNs in Tweet column:  7
Number of NaNs in Tweet column:  6
Number of rows in train: 39773
Number of rows in test: 1994
Number of NaNs in Tweet column: 0
Number of NaNs in Tweet column: 0


In [25]:
train_X = train.Tweet
train_y = train.iloc[:, 1:2]['Sarcastic']
test_X = test.Tweet
test_y = np.array(test.Sarcastic)

print(train_y)

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
39750    1
39751    1
39752    1
39753    1
39754    1
39755    1
39756    1
39757    1
39758    1
39759    1
39760    1
39761    1
39762    1
39763    1
39764    1
39765    1
39766    1
39767    1
39768    1
39769    1
39770    1
39771    1
39772    1
39773    1
39774    1
39775    1
39776    1
39777    1
39778    1
39779    1
Name: Sarcastic, Length: 39773, dtype: int64


In [4]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [5]:
train_X_split = [x.split() for x in train_X]

In [6]:
word_to_ix = {}
for sent in train_X_split:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [7]:
tag_to_ix = {0: 0, 1: 1}

In [8]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [29]:
EMBEDDING_DIM = 10
HIDDEN_DIM = 10
LR = 0.1

tensor([0])

In [10]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), LR)

In [39]:
with torch.no_grad():
    inputs = prepare_sequence(train_X_split[0], word_to_ix)
    tag_scores = model(inputs)
    funced = sum(tag_scores)
    print(funced)

tensor([ -7.9283, -10.2566])


In [36]:
for sentence, tag in zip(train_X_split, train_y):
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
    model.zero_grad()

    # Also, we need to clear out the hidden state of the LSTM,
    # detaching it from its history on the last instance.
    model.hidden = model.init_hidden()

    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    sentence_in = prepare_sequence(sentence, word_to_ix)
    target = torch.tensor([tag])

    # Step 3. Run our forward pass.
    tag_scores = model(sentence_in)
    print(tag_scores)
    # Step 4. Compute the loss, gradients, and update the parameters by
    #  calling optimizer.step()
    loss = loss_function(tag_scores, torch.tensor([tag]))
    loss.backward()
    optimizer.step()

tensor([[-0.5658, -0.8391],
        [-0.5779, -0.8234],
        [-0.5722, -0.8308],
        [-0.6238, -0.7677],
        [-0.5226, -0.8989],
        [-0.5297, -0.8887],
        [-0.5308, -0.8871],
        [-0.5441, -0.8683]], grad_fn=<LogSoftmaxBackward>)


ValueError: Expected input batch_size (8) to match target batch_size (1).