In [1]:
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd as autograd
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Sentiment.csv', usecols=['sentiment', 'text'])

df.head()

Unnamed: 0,sentiment,text
0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [3]:
bow = Counter([token for setence in df.text for token in setence.lower().split()])
ordered_bow = bow.most_common()

print(ordered_bow[:20])

[('the', 8736), ('#gopdebate', 8129), ('rt', 7835), ('#gopdebates', 5028), ('to', 4758), ('of', 3365), ('is', 3296), ('a', 3263), ('and', 2865), ('i', 2397), ('@rwsurfergirl:', 1959), ('in', 1941), ('for', 1816), ('🇺🇸', 1798), ('you', 1784), ('on', 1563), ('it', 1479), ('that', 1372), ('trump', 1319), ('fox', 1280)]


In [4]:
# Mapping the words

idx_to_word = {i+1:w for i, (w, c) in enumerate(ordered_bow)}
word_to_idx = {w:i for i, w in idx_to_word.items()}

In [5]:
list_embedded = [[word_to_idx[token] for token in setence.lower().split()] for setence in df.text]

In [6]:
rows = len(list_embedded)
cols = max([len(text) for text in list_embedded])

text_vector = np.zeros((rows, cols), dtype=int)

text_vector.shape

(13871, 29)

In [7]:
for i, text_embedded in enumerate(list_embedded):
    for j,idx in enumerate(text_embedded):
        text_vector[i][j] = int(idx)
        
print(text_vector[0])

[   3 2775   46   77  345  435   24    1  436  558  209   28  450 4638
    2    0    0    0    0    0    0    0    0    0    0    0    0    0
    0]


In [34]:
sentiments = np.array([0 if sentiment == 'Neutral' else 1 if sentiment == 'Positive' else 2 for sentiment in df.sentiment])

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(text_vector, sentiments)

In [24]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(Y_train))

In [25]:
train_loader = DataLoader(dataset=train_data, batch_size=50, shuffle=True)

In [26]:
vocab_size = len(idx_to_word)
embedding_dim = 256
hidden_dim = 100
n_labels = 3

In [27]:
class SentimentAnalysis(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, n_labels)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        
        hidden =  self.init_hidden(x.size(0))
        
        embeds = self.word_embeddings(x)
        embeds = embeds.permute(1, 0, 2)

        lstm_out, (ht, ct) = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        linear_out = self.linear(lstm_out)
        
        sig_out = self.sigmoid(linear_out)
        sig_out = sig_out.view(x.size(0), -1)
        sig_out = sig_out[:, -1]
            
        print(linear_out)
        
        return sig_out
    
    def init_hidden(self, batch_size):
        return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
                autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
        

In [28]:
model = SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, n_labels)

print(model)

SentimentAnalysis(
  (word_embeddings): Embedding(28156, 256)
  (lstm): LSTM(256, 100)
  (linear): Linear(in_features=100, out_features=3, bias=True)
  (sigmoid): Sigmoid()
)


In [29]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [33]:
n_epoch = 5

for epoch in range(n_epoch):

    for train_data, train_label in train_loader:
    
        model.zero_grad()
        prediction = model(train_data)
        
        print(train_label)
        
        loss = loss_function(prediction.unsqueeze(1), train_label)
        loss.backward()
        
#         print(train_data.shape)
    #     print(train_data.shape)

    

tensor([[ 0.1729, -0.0225, -0.4223],
        [ 0.1175, -0.2352, -0.2774],
        [ 0.0197, -0.1321, -0.0412],
        ...,
        [ 0.1902, -0.2380, -0.0835],
        [ 0.1880, -0.2341, -0.0786],
        [ 0.1827, -0.2323, -0.0735]], grad_fn=<AddmmBackward>)
tensor([1., 0., 2., 0., 0., 1., 2., 0., 0., 1., 2., 2., 2., 1., 2., 2., 2., 1.,
        2., 1., 2., 2., 2., 2., 2., 1., 2., 0., 2., 2., 1., 2., 0., 2., 2., 2.,
        2., 0., 2., 0., 1., 2., 2., 0., 0., 2., 0., 0., 1., 2.],
       dtype=torch.float64)


RuntimeError: Expected object of scalar type Long but got scalar type Double for argument #2 'target'