In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
class Sequences(Dataset):
    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len
        df = path

        # BOW
        vectorizer = CountVectorizer(stop_words='english', min_df=0.015)
        vectorizer.fit(df.review.tolist())

        # Creating Vocabulary
        self.token2idx = vectorizer.vocabulary_

        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x)
                                 if token in self.token2idx]
        self.pad = lambda x: x + (max_seq_len - len(x)) * \
            [self.token2idx['<PAD>']]

        sequences = [self.encode(sequence)[:max_seq_len]
                     for sequence in df.review.tolist()]
        sequences, self.labels = zip(*[(sequence, label) for sequence, label
                                       in zip(sequences, df.label.tolist()) if sequence])
        self.sequences = [self.pad(sequence) for sequence in sequences]

    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

    def __len__(self):
        return len(self.sequences)

In [4]:
data = pd.read_csv(
    'data.csv')
data['label'] = data['sentiment']
del data['sentiment']
data.head()

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


__Encoding positive as 1 and negative as 0__

In [5]:
labeling = {
    'positive': 1,
    'negative': 0
}

In [6]:
data['label'] = data['label'].apply(lambda x: labeling[x])

In [7]:
data.shape

(50000, 2)

In [8]:
dataset = Sequences(data, max_seq_len=128)

In [9]:
len(dataset.token2idx)

1046

In [10]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs, target


batch_size = 2048
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

In [11]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        batch_size,
        embedding_dimension=100,
        hidden_size=128,
        n_layers=1,
        device='cpu'
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        self.batch_size = batch_size
        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.LSTM(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, 1)

    def init_hidden(self):

        return (torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device),
                torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device))

    def forward(self, inputs):
        # Avoid breaking if the last batch has a different size
        batch_size = inputs.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size

        encoded = self.encoder(inputs)
        output, hidden = self.rnn(encoded, self.init_hidden())
        # o
        output = self.decoder(output[:, :, -1]).squeeze()
        return output

In [12]:
model = RNN(
    hidden_size=128,
    vocab_size=len(dataset.token2idx),
    device=device,
    batch_size=batch_size
)
model = model.to(device)
model

RNN(
  (encoder): Embedding(1046, 100)
  (rnn): LSTM(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=1, bias=True)
)

In [13]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters()
                       if p.requires_grad], lr=0.001)

In [14]:
model.train()
train_losses = []
for epoch in range(5):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        inputs, target = inputs.to(device), target.to(device)
        model.zero_grad()

        output = model(inputs)

        loss = criterion(output, target)

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()

        progress_bar.set_description(f'Loss: {loss.item():.3f}')

        losses.append(loss.item())
        total += 1

    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(train_loader, leave=False)


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch #1	Train Loss: 0.690


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch #2	Train Loss: 0.605


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch #3	Train Loss: 0.460


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch #4	Train Loss: 0.382


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch #5	Train Loss: 0.342


In [15]:
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        test_vector = torch.LongTensor(
            [dataset.pad(dataset.encode(text))]).to(device)

        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{prediction:0.3}: Positive sentiment')
        else:
            print(f'{prediction:0.3}: Negative sentiment')

In [16]:
text = "that's nice"
predict_sentiment(text)

0.661: Positive sentiment


In [17]:
text = "that's worst"
predict_sentiment(text)

0.354: Negative sentiment


In [33]:
text="but why is it so bad"

predict_sentiment(text=text)

0.444: Negative sentiment
