In [274]:
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd as autograd
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split

In [275]:
df = pd.read_csv('Sentiment.csv', usecols=['sentiment', 'text'])

df.head()

Unnamed: 0,sentiment,text
0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [276]:
bow = Counter([token for setence in df.text for token in setence.lower().split()])
ordered_bow = bow.most_common()

print(ordered_bow[:20])

[('the', 8736), ('#gopdebate', 8129), ('rt', 7835), ('#gopdebates', 5028), ('to', 4758), ('of', 3365), ('is', 3296), ('a', 3263), ('and', 2865), ('i', 2397), ('@rwsurfergirl:', 1959), ('in', 1941), ('for', 1816), ('🇺🇸', 1798), ('you', 1784), ('on', 1563), ('it', 1479), ('that', 1372), ('trump', 1319), ('fox', 1280)]


In [277]:
# Mapping the words

idx_to_word = {i+1:w for i, (w, c) in enumerate(ordered_bow)}
word_to_idx = {w:i for i, w in idx_to_word.items()}

In [278]:
list_embedded = [[word_to_idx[token] for token in setence.lower().split()] for setence in df.text]

In [279]:
rows = len(list_embedded)
cols = max([len(text) for text in list_embedded])

text_vector = np.zeros((rows, cols))

text_vector.shape

(13871, 29)

In [280]:
for i, text_embedded in enumerate(list_embedded):
    for j,idx in enumerate(text_embedded):
        text_vector[i][j] = int(idx)
        
print(text_vector[0])

[3.000e+00 2.775e+03 4.600e+01 7.700e+01 3.450e+02 4.350e+02 2.400e+01
 1.000e+00 4.360e+02 5.580e+02 2.090e+02 2.800e+01 4.500e+02 4.638e+03
 2.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00]


In [281]:
sentiments = np.array([0 if sentiment == 'Neutral' else 1 if sentiment == 'Positive' else -1 for sentiment in df.sentiment])

In [282]:
X_train, X_test, Y_train, Y_test = train_test_split(text_vector, sentiments)

In [283]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train))

In [252]:
train_loader = DataLoader(dataset=train_data, batch_size=50, shuffle=True)

In [253]:
vocab_size = len(idx_to_word)
embedding_dim = 256
hidden_dim = 100
n_labels = 3

In [270]:
class SentimentAnalysis(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, n_labels)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        
        self.hidden =  self.init_hidden(x.size(-1))
        
        embeds = self.word_embeddings(x)
        
        packed_input = pack_padded_sequence(embeds, lengths)
        
        lstm_out, (ht, ct) = self.lstm(embeds, self.hidden)

        print(lstm_out.shape)
        
        linear_out = self.linear(lstm_out)
        sigmoid_out = self.sigmoid(linear_out)
    
        print(sigmoid_out.shape)
        
        
        return sigmoid_out
    
    def init_hidden(self, batch_size):
        return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
                autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
        

In [271]:
model = SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, n_labels)

print(model)

SentimentAnalysis(
  (word_embeddings): Embedding(28156, 256)
  (lstm): LSTM(256, 100)
  (linear): Linear(in_features=100, out_features=3, bias=True)
  (sigmoid): Sigmoid()
)


In [272]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [273]:
n_epoch = 5

for epoch in range(n_epoch):

    for train_data, train_label in train_loader:    

        model.zero_grad()
        prediction = model(train_data.type('torch.LongTensor'))
      
#         loss = loss_function(prediction, train_label)
#         loss.backward()
        
#         print(train_data.shape)
    #     print(train_data.shape)

    

torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.

torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.Size([50, 29, 100])
torch.Size([50, 29, 3])
torch.Size([50, 29])
torch.

RuntimeError: index out of range: Tried to access index 28156 out of table with 28155 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:237