## Getting data

    - Sentiment140 dataset: <http://help.sentiment140.com/for-students>

In [1]:
import pandas as pd 

In [2]:
# Load the raw Sentiment140 training file. header=None means the CSV has no
# header row, so pandas labels the columns with integers (0..5 in the preview).
tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", engine="python", header=None)

In [3]:
# Preview the first five rows of the raw frame.
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Column 0 holds the raw sentiment label; store a categorical copy of it so the
# labels can be turned into integer codes next.
tweetsDF['sentiment_cat'] = tweetsDF[0].astype('category')

In [6]:
# Integer category codes of the label; the label counts printed later in the
# notebook show exactly two codes, 0 and 1.
tweetsDF["sentiment"] = tweetsDF['sentiment_cat'].cat.codes

In [7]:
# Persist the augmented frame with no header row and no index column; the two
# added columns end up as columns 6 and 7 when the file is read back.
tweetsDF.to_csv('train_processed.csv', header=None, index=None)

In [8]:
# Write a 10,000-row random subset for quick experiments. The fixed
# random_state makes the subset identical on every run of the notebook.
tweetsDF.sample(10000, random_state=123).to_csv('train_processed_sample.csv', header=None, index=None)

In [2]:
import pandas as pd 
# Reload the processed file written earlier; it was saved without a header, so
# columns are again addressed by integer position.
tweets = pd.read_csv('train_processed.csv', engine='python', header=None)
# Preview the first five rows, now including the two added label columns.
tweets.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0,0


In [3]:
# Keep only what the model needs: column 5 (the tweet text) and column 7 (the
# integer sentiment code added during preprocessing).
label_tweets = tweets.iloc[:, [5,7]]
label_tweets.head()

Unnamed: 0,5,7
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [4]:
# Save the (tweet, label) pairs without header or index; this is the file the
# torchtext TabularDataset reads below.
label_tweets.to_csv('train_labeled.csv', header=None, index=None)

## Defining fields

In [1]:
import torch
from torchtext import data

In [2]:
# Label field: targets are produced as float tensors, which is what the binary
# cross-entropy loss in the training loop is fed.
LABEL = data.LabelField(dtype=torch.float)
# Text field: tokenized with spaCy. include_lengths=True makes each batch's
# `tweet` attribute a (token tensor, lengths) pair, as unpacked in the loops below.
TWEET = data.Field(sequential=True, tokenize='spacy', include_lengths=True)

In [3]:
# Field order must follow the CSV column order: tweet text first, label second.
fields = [('tweet', TWEET), ('label', LABEL)]

# train_labeled.csv was written without a header row, hence skip_header=False.
twitterDataset = data.TabularDataset(path='train_labeled.csv', format='CSV', skip_header=False, fields=fields)

In [4]:
import random
# Seed Python's RNG first, then hand its *state* to split(). random.seed()
# returns None, so passing its return value as random_state would leave the
# split unseeded and different on every run.
random.seed(123)
# 80/10/10 split. The last two ratios are equal, so the order in which
# torchtext assigns them to the held-out splits does not affect the sizes.
(train, valid, test) = twitterDataset.split(split_ratio=[0.8, 0.1, 0.1], random_state=random.getstate())

In [5]:
# Sanity check: report how many examples landed in each split.
print(f"Num train: {len(train)}, valid: {len(valid)}, test: {len(test)}")

Num train: 1280000, valid: 160000, test: 160000


## Building a vocabulary

In [6]:
# Cap on the number of tokens kept in the tweet vocabulary.
# NOTE(review): the model below uses 20002 embedding rows, presumably these
# 20000 tokens plus two special tokens added by torchtext; confirm with len(TWEET.vocab).
vocab_size = 20000
# Both vocabularies are built from the training split only.
TWEET.build_vocab(train, max_size=vocab_size)
LABEL.build_vocab(train)

In [7]:
# Ten most frequent tokens counted while building the tweet vocabulary.
TWEET.vocab.freqs.most_common(10)

[('!', 724072),
 ('.', 647596),
 ('I', 527196),
 (' ', 469727),
 ('to', 447002),
 ('the', 392317),
 (',', 386838),
 ('a', 295794),
 ('i', 271292),
 ('my', 226121)]

In [8]:
# Label counts in the training split; a quick check of class balance.
LABEL.vocab.freqs

Counter({'0': 639980, '1': 640020})

In [9]:
# Everything in this notebook runs on the CPU.
device = torch.device('cpu')

# One bucketed iterator per split, 32 examples per batch. Examples are sorted
# by tweet length inside each batch, which the model's sequence packing relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=32,
    sort_within_batch=True,
    sort_key=lambda x: len(x.tweet),
    device=device,
)

In [10]:
# Print the tensor shapes of the first batch from each split. The text tensor
# is the first element of the (tokens, lengths) pair stored in batch.tweet.
for heading, split_iterator in (
    ('Train', train_iterator),
    ("\nValid", valid_iterator),
    ("\nTest", test_iterator),
):
    print(heading)
    for batch in split_iterator:
        print(f"Text matrix size: {batch.tweet[0].size()}")
        print(f"Target vector size: {batch.label.size()}")
        break  # only the first batch is needed

Train
Text matrix size: torch.Size([19, 32])
Target vector size: torch.Size([32])

Valid
Text matrix size: torch.Size([1, 32])
Target vector size: torch.Size([32])

Test
Text matrix size: torch.Size([1, 32])
Target vector size: torch.Size([32])


## Create a model

In [11]:
import torch.nn as nn

class TwitterLSTM(nn.Module):
    """Single-layer LSTM classifier over token-index sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of logits produced per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(TwitterLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        # Honour output_dim instead of hard-coding a single output unit.
        self.predictor = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_length):
        """Return the logits for a padded batch.

        Args:
            text: LongTensor of token indices shaped (seq_len, batch); the LSTM
                is created with the default batch_first=False.
            text_length: true length of each sequence in the batch, in
                descending order (pack_padded_sequence's default requirement).

        Returns:
            Tensor of shape (batch,) when output_dim is 1, otherwise
            (batch, output_dim).
        """
        embedded = self.embedding(text)

        # Pack so the LSTM skips padding and the final hidden state is taken
        # at each sequence's real last token.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length)

        _, (hidden, _) = self.encoder(packed)
        # hidden is (1, batch, hidden_dim) for a single-layer, one-direction LSTM.
        preds = self.predictor(hidden.squeeze(0))

        # Flatten the trailing singleton so binary targets of shape (batch,) line up.
        if preds.size(-1) == 1:
            return preds.view(-1)
        return preds

In [12]:
# Size the embedding table from the vocabulary that was actually built instead
# of hard-coding 20002, so it stays correct if vocab_size or the corpus changes.
model = TwitterLSTM(vocab_size=len(TWEET.vocab), embedding_dim=128, hidden_dim=256, output_dim=1)
model.to(device)

TwitterLSTM(
  (embedding): Embedding(20002, 128)
  (encoder): LSTM(128, 256)
  (predictor): Linear(in_features=256, out_features=1, bias=True)
)

In [13]:
# NOTE(review): lr=2e-2 is well above Adam's usual 1e-3 default; worth revisiting
# if the loss does not trend downwards.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-2)
# The model emits one logit per tweet and the targets are float 0/1 values, so
# the matching loss is binary cross-entropy on logits (the same quantity the
# training loop computes with F.binary_cross_entropy_with_logits).
criterion = nn.BCEWithLogitsLoss()

## Update the training loop

In [22]:
def compute_binary_accuracy(model, data_loader, device):
    """Return the model's accuracy over ``data_loader`` as a percentage.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: callable as ``model(text, text_lengths)`` returning one logit
            per example.
        data_loader: iterable of batches exposing ``tweet`` (a
            ``(text, text_lengths)`` pair) and ``label``.
        device: only used to place the result when ``data_loader`` yields no
            examples; batches are consumed as provided.

    Returns:
        0-dim float tensor in [0, 100]; 0.0 when there are no examples.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.tweet
            logits = model(text, text_lengths)
            # A sigmoid output above 0.5 is counted as class 1.
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += batch_data.label.size(0)
            correct_pred += (predicted_labels == batch_data.label.long()).sum()

    # With no batches correct_pred is still the int 0, which has no .float();
    # return a defined value instead of raising.
    if num_examples == 0:
        return torch.tensor(0.0, device=device)
    return correct_pred.float() / num_examples * 100

In [23]:
import time
import torch.nn.functional as F

# Number of full passes over the training iterator made by the training loop.
NUM_EPOCHS = 7

In [24]:
# Wall-clock reference for the elapsed-time reports below.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # The accuracy evaluation at the end of each epoch puts the model in eval
    # mode, so training mode is re-enabled at the start of every epoch.
    model.train()
    for batch_idx, batch_data in enumerate(train_iterator):

        # Each batch carries the padded token tensor together with true lengths.
        text, text_lengths = batch_data.tweet

        # Forward pass and binary cross-entropy computed on the raw logits.
        logits = model(text, text_lengths)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.label)

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Report the loss of every 1000th batch.
        if batch_idx % 1000 == 0:
            progress = (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                        f'Batch {batch_idx:03d}/{len(train_iterator):03d} | '
                        f'Cost: {cost:.4f}')
            print(progress)

    # End-of-epoch evaluation on the full training and validation iterators.
    with torch.no_grad():
        train_acc = compute_binary_accuracy(model, train_iterator, device)
        valid_acc = compute_binary_accuracy(model, valid_iterator, device)
    print(f'training accuracy: '
          f'{train_acc:.2f}%'
          f'\nvalid accuracy: '
          f'{valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time)/60
print(f'Total Training Time: {total_min:.2f} min')
test_acc = compute_binary_accuracy(model, test_iterator, device)
print(f'Test accuracy: {test_acc:.2f}%')

Epoch: 001/007 | Batch 000/40000 | Cost: 0.6494
Epoch: 001/007 | Batch 1000/40000 | Cost: 0.6475
Epoch: 001/007 | Batch 2000/40000 | Cost: 0.8558
Epoch: 001/007 | Batch 3000/40000 | Cost: 0.4507
Epoch: 001/007 | Batch 4000/40000 | Cost: 0.7720
Epoch: 001/007 | Batch 5000/40000 | Cost: 0.7869
Epoch: 001/007 | Batch 6000/40000 | Cost: 0.8636
Epoch: 001/007 | Batch 7000/40000 | Cost: 0.6355
Epoch: 001/007 | Batch 8000/40000 | Cost: 0.7200
Epoch: 001/007 | Batch 9000/40000 | Cost: 0.7128
Epoch: 001/007 | Batch 10000/40000 | Cost: 0.6936
Epoch: 001/007 | Batch 11000/40000 | Cost: 0.8777
Epoch: 001/007 | Batch 12000/40000 | Cost: 0.7586
Epoch: 001/007 | Batch 13000/40000 | Cost: 0.6140
Epoch: 001/007 | Batch 14000/40000 | Cost: 0.7572
Epoch: 001/007 | Batch 15000/40000 | Cost: 0.5550
Epoch: 001/007 | Batch 16000/40000 | Cost: 0.6876
Epoch: 001/007 | Batch 17000/40000 | Cost: 0.6643
Epoch: 001/007 | Batch 18000/40000 | Cost: 0.7760
Epoch: 001/007 | Batch 19000/40000 | Cost: 0.6778
Epoch: 001/

Epoch: 005/007 | Batch 000/40000 | Cost: 0.5910
Epoch: 005/007 | Batch 1000/40000 | Cost: 0.6108
Epoch: 005/007 | Batch 2000/40000 | Cost: 0.6353
Epoch: 005/007 | Batch 3000/40000 | Cost: 0.6381
Epoch: 005/007 | Batch 4000/40000 | Cost: 0.6868
Epoch: 005/007 | Batch 5000/40000 | Cost: 0.6500
Epoch: 005/007 | Batch 6000/40000 | Cost: 0.5197
Epoch: 005/007 | Batch 7000/40000 | Cost: 0.7053
Epoch: 005/007 | Batch 8000/40000 | Cost: 0.6093
Epoch: 005/007 | Batch 9000/40000 | Cost: 0.7537
Epoch: 005/007 | Batch 10000/40000 | Cost: 0.6125
Epoch: 005/007 | Batch 11000/40000 | Cost: 0.7172
Epoch: 005/007 | Batch 12000/40000 | Cost: 0.7553
Epoch: 005/007 | Batch 13000/40000 | Cost: 0.6241
Epoch: 005/007 | Batch 14000/40000 | Cost: 0.7913
Epoch: 005/007 | Batch 15000/40000 | Cost: 0.7388
Epoch: 005/007 | Batch 16000/40000 | Cost: 0.8298
Epoch: 005/007 | Batch 17000/40000 | Cost: 0.6015
Epoch: 005/007 | Batch 18000/40000 | Cost: 0.6838
Epoch: 005/007 | Batch 19000/40000 | Cost: 0.6474
Epoch: 005/

In [25]:
import spacy
# Load spaCy's English pipeline; only its tokenizer is used below.
# NOTE(review): presumably this matches the tokenizer behind the TWEET field's
# tokenize='spacy' setting; confirm so inference tokens match training.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Return the sigmoid of the model's logit for a single sentence.

    The sentence is tokenized with the module-level spaCy ``nlp`` tokenizer,
    mapped to indices through ``TWEET.vocab.stoi`` and fed to the model as a
    batch of one.

    Args:
        model: trained model called as ``model(tokens, lengths)``.
        sentence: raw text to score.

    Returns:
        Python float in [0, 1]. The demo cells in this notebook display this
        value as the negative-class probability and its complement as the
        positive-class probability.
        NOTE(review): confirm the class order against LABEL.vocab.stoi.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TWEET.vocab.stoi[t] for t in tokenized]
    # Shape (seq_len, 1): a batch containing just this sentence.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # The length tensor is left on the CPU.
    length_tensor = torch.LongTensor([len(indexed)])
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [26]:
# predict_sentiment's output is displayed elsewhere in this notebook as the
# negative-class probability, so its complement is shown as the positive one.
print('Probability positive:')
1-predict_sentiment(model, "Feel pretty upset today.")

Probability positive:


0.3635202646255493

In [27]:
# Same sentence as the previous cell, showing the raw sigmoid output.
print('Probability negative:')
predict_sentiment(model, "Feel pretty upset today.")

Probability negative:


0.6364797353744507

In [28]:
# Complement of the sigmoid output for a second example sentence.
print('Probability positive:')
1 - predict_sentiment(model, "Had a really great day.")

Probability positive:


0.8992006033658981