In [1]:
import sys
sys.path.append("..")

In [3]:
import ast
import time
import pandas as pd
import numpy as np

import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import gensim
import torchtext
from torchtext import data
from torchtext import vocab

from utils.db_client import DBClient

In [4]:
# Make DB Connection
db = DBClient(secrets_path='../configs/db_secrets.json')

I'm trying
Connected to political tweets DB


### Model definition and tuning

In [5]:
# Load data
examples = db.read("select tweet_text_clean, label from staging.train_twitter140 order by Random() limit 100000")

In [6]:
# Define Fields
txt_field = data.Field(sequential=True, 
                       include_lengths=True, 
                       use_vocab=True)
label_field = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)
train_val_fields = [
    ('SentimentText', txt_field), # process it as text
    ('Sentiment', label_field) # process it as label
]

# Convert text ecamples to Example datatype
examples = [data.Example.fromlist(((ast.literal_eval(example[0])), example[1]), train_val_fields) for example in examples]

# Create dataset
dataset = data.Dataset(examples, train_val_fields)

In [7]:
MAX_VOCAB_SIZE = 25000

# Make vocab for text and label fields
txt_field.build_vocab(dataset,
                      max_size = MAX_VOCAB_SIZE, 
                      vectors = vocab.Vectors('glove.twitter.27B.50d.txt', '../model/glove.twitter.27B/'))
label_field.build_vocab(dataset)

pretrained_embeddings = txt_field.vocab.vectors

# Make train/val/test splits
train_data, test_data, valid_data = dataset.split([0.7, 0.2, 0.1])

In [8]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 70000
Number of validation examples: 20000
Number of testing examples: 10000


In [9]:
# Make data iterators
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data), 
                                                          batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
                                                                        sort_key = lambda x: len(x.SentimentText),
                                                                        sort_within_batch=True,
                                                                        repeat = False)

In [23]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):#, optimizer, criterion):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.embedding_dim = embedding_dim
        self.pad_idx = pad_idx
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           #bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.optimizer = None #optimizer #optim.Adam(model.parameters())
        self.criterion = None #criterion #nn.BCEWithLogitsLoss()
        
    
    def set_pretrained_weights(self, pretrained_embeddings, txt_field):
        self.embedding.weight.data.copy_(pretrained_embeddings)

        UNK_IDX = txt_field.vocab.stoi[txt_field.unk_token]

        self.embedding.weight.data[UNK_IDX] = torch.zeros(self.embedding_dim)
        self.embedding.weight.data[self.pad_idx] = torch.zeros(self.embedding_dim)
    
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))
    
    def binary_accuracy(self, preds, y):
        """
        Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
        """

        #round predictions to the closest integer
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float() #convert into float for division 
        acc = correct.sum() / len(correct)
        return acc
    
    def train_epoch(self, iterator):
        epoch_loss = 0
        epoch_acc = 0

        self.train()
        optimizer = self.optimizer
        for batch in iterator:
            if len(batch) == BATCH_SIZE:
            
                optimizer.zero_grad()

                text, text_lengths = batch.SentimentText
                predictions = self.forward(text, text_lengths).squeeze(1)
                loss = self.criterion(predictions, batch.Sentiment.float())

                acc = self.binary_accuracy(predictions, batch.Sentiment.float())

                loss.backward()

                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def evaluate(self, iterator):

        epoch_loss = 0
        epoch_acc = 0

        model.eval()

        with torch.no_grad():

            for batch in iterator:
                if len(batch) == BATCH_SIZE:

                    text, text_lengths = batch.SentimentText

                    predictions = self.forward(text, text_lengths).squeeze(1)

                    loss = self.criterion(predictions, batch.Sentiment.float())

                    acc = self.binary_accuracy(predictions, batch.Sentiment.float())

                    epoch_loss += loss.item()
                    epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def sentiment_political(self, iterator):

        # keep?
        model.eval()
        
        sentiment_df = None

        with torch.no_grad():

            for batch in iterator:
                if len(batch) == BATCH_SIZE:

                    text, text_lengths = batch.SentimentText

                    predictions = self.forward(text, text_lengths).squeeze(1)
                    
                    preds = torch.sigmoid(predictions)
                    
                    pred_arr = preds.numpy()
                    id_arr = batch.Id.numpy()
                    pred_class_arr = torch.round(preds).numpy()
                    
                    if not sentiment_df:
                        sentiment_df = pd.DataFrame({"tweet_id": id_arr, 
                                                     "prediction_raw": pred_arr, 
                                                     "prediction_class": pred_clas_arr})
                    else:
                        sentiment_df = pd.concat([sentiment_df, pd.DataFrame({"tweet_id": id_arr, 
                                                     "prediction_raw": pred_arr, 
                                                     "prediction_class": pred_clas_arr})])

                    #loss = self.criterion(predictions, batch.Sentiment.float())

                    #acc = self.binary_accuracy(predictions, batch.Sentiment.float())

                    #epoch_loss += loss.item()
                    #epoch_acc += acc.item()
        return sentiment_df
    
    def epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

In [11]:
def epoch_training(N_EPOCHS, model, train_iterator, valid_iterator):
    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = model.train_epoch(train_iterator)
        valid_loss, valid_acc = model.evaluate(valid_iterator)

        end_time = time.time()

        epoch_mins, epoch_secs = model.epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'tut2-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        
    return model

In [16]:
optimizers = ["Adam", "Adagrad"]
criterions = ["BCEWithLogitsLoss"] 
learning_rates = [0.001, 0.01]
epochs = [5, 25] 
weight_decay = [0, 0.01]

In [13]:
INPUT_DIM = len(txt_field.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.5
PAD_IDX = txt_field.vocab.stoi[txt_field.pad_token]


best_model_tuple = None
best_accuracy = -1
for optimizer in optimizers:
    # Make a new model at the start
    model = LSTM(INPUT_DIM, 
                EMBEDDING_DIM, 
                HIDDEN_DIM, 
                OUTPUT_DIM, 
                N_LAYERS,
                DROPOUT, 
                PAD_IDX)
    model.set_pretrained_weights(pretrained_embeddings, txt_field)
    for learning_rate in learning_rates:
        for decay in weight_decay:
            # Set OPTIMIZER
            if optimizer == "Adam":
                model.optimizer = optim.Adam(params = model.parameters(), lr = learning_rate, weight_decay = decay) 
            else: 
                model.optimizer = optim.Adagrad(params = model.parameters(), lr = learning_rate, weight_decay = decay)

            for criterion in criterions:
                # SET CRITERION
                if criterion == "BCEWithLogitsLoss":
                    model.criterion = nn.BCEWithLogitsLoss() 
                else: 
                    model.criterion = nn.NLLLoss()
                
                for epoch in epochs:
                    model_name = "-".join([optimizer, str(learning_rate), str(decay), str(criterion), str(epoch)])

                    print(f"working on model {model_name}")
                    trained_model = epoch_training(epoch, model, train_iterator, valid_iterator)
                    _, acc = trained_model.evaluate(valid_iterator)
                    
                    if acc > best_accuracy:
                        best_accuracy = acc
                        best_model_tuple = (model_name, trained_model)


working on model Adam-0.001-0-BCEWithLogitsLoss-5
Epoch: 01 | Epoch Time: 1m 52s
	Train Loss: 0.563 | Train Acc: 70.48%
	 Val. Loss: 0.493 |  Val. Acc: 75.84%
Epoch: 02 | Epoch Time: 2m 9s
	Train Loss: 0.508 | Train Acc: 74.94%
	 Val. Loss: 0.480 |  Val. Acc: 76.72%
Epoch: 03 | Epoch Time: 2m 14s
	Train Loss: 0.485 | Train Acc: 76.46%
	 Val. Loss: 0.467 |  Val. Acc: 77.03%
Epoch: 04 | Epoch Time: 2m 16s
	Train Loss: 0.467 | Train Acc: 77.82%
	 Val. Loss: 0.466 |  Val. Acc: 77.24%
Epoch: 05 | Epoch Time: 2m 20s
	Train Loss: 0.449 | Train Acc: 78.89%
	 Val. Loss: 0.473 |  Val. Acc: 77.31%
working on model Adam-0.001-0-BCEWithLogitsLoss-50
Epoch: 01 | Epoch Time: 2m 21s
	Train Loss: 0.436 | Train Acc: 79.75%
	 Val. Loss: 0.468 |  Val. Acc: 77.57%
Epoch: 02 | Epoch Time: 2m 20s
	Train Loss: 0.422 | Train Acc: 80.43%
	 Val. Loss: 0.475 |  Val. Acc: 77.49%
Epoch: 03 | Epoch Time: 2m 21s
	Train Loss: 0.410 | Train Acc: 81.16%
	 Val. Loss: 0.475 |  Val. Acc: 77.52%
Epoch: 04 | Epoch Time: 2m 1

KeyboardInterrupt: 

In [14]:
best_model_name = best_model_tuple[0]
best_model = best_model_tuple[1]

In [15]:
print(best_model_name)
best_model.evaluate(test_iterator)

Adam-0.001-0-BCEWithLogitsLoss-5


(1.0419494485019878, 0.7395501592356688)

In [None]:
run_no = 1
torch.save(best_model.state_dict(), 'models_store/best_model_dict_{}_{}.pt'.format(best_model_name, run_no))
torch.save(best_model, 'models_store/best_model_{}_{}.pt'.format(best_model_name, run_no))


### Subject Analysis

In [17]:
# Load data
num_tweets = 100000
topics = ['health', 'econ_jobs', 'guns']
topics_str = ', '.join(topics)
examples = db.read("select tweet_id, tweet_text_clean, {} from staging.master order by Random() limit {}".format(topics_str, num_tweets))


In [19]:
# Define Fields
txt_field = data.Field(sequential=True, 
                       include_lengths=True, 
                       use_vocab=True)
id_field = data.Field(sequential=False, 
                      use_vocab=False, 
                      pad_token=None, 
                      unk_token=None)

eval_val_fields = [
    ('SentimentText', txt_field), # process it as text
    ('Id', id_field) # process it as id
]


# Convert text examples to Example datatype
examples = [data.Example.fromlist(((ast.literal_eval(example[0])), example[1]), train_val_fields) for example in examples]

# Create dataset
dataset = data.Dataset(examples, eval_val_fields)


In [20]:
# Create Iterator
eval_iterator = data.BucketIterator(dataset, batch_size = BATCH_SIZE, sort_key = lambda x: len(x.SentimentText),
                                                                        sort_within_batch=True,
                                                                        repeat = False)

In [24]:
# Calculate sentiment of each tweet
sentiment_df = best_model.sentiment_political(eval_iterator)
sentiment_df.to_csv('models_store/political_sentiment_{}_{}.csv'.format(best_model_name, run_no))

AttributeError: 'LSTM' object has no attribute 'sentiment_political'

### Plotting Results

In [None]:
choices = ['dem-lead', 'dem-base', 'rep-lead', 'rep-base']
palette = {"dem-base": "#a8b2ff","dem-lead": "#0015bc", "rep-base": "#ff9d9d", "rep-lead": "#ff0000", "": "gray"}


In [None]:
# def binary_accuracy(preds, y):
#     """
#     Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
#     """

#     #round predictions to the closest integer
#     rounded_preds = torch.round(torch.sigmoid(preds))
#     correct = (rounded_preds == y).float() #convert into float for division 
#     acc = correct.sum() / len(correct)
#     return acc

In [None]:
# def train(model, iterator):
    
#     epoch_loss = 0
#     epoch_acc = 0
    
#     model.train()
    
#     for batch in iterator:
        
#         optimizer.zero_grad()
        
#         text, text_lengths = batch.SentimentText
        
#         predictions = model(text, text_lengths).squeeze(1)

#         loss = model.criterion(predictions, batch.Sentiment.float())
        
#         acc = binary_accuracy(predictions, batch.Sentiment.float())
        
#         loss.backward()
        
#         model.optimizer.step()
        
#         epoch_loss += loss.item()
#         epoch_acc += acc.item()
        
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# def evaluate(model, iterator):
    
#     epoch_loss = 0
#     epoch_acc = 0
    
#     model.eval()
    
#     with torch.no_grad():
    
#         for batch in iterator:
            
#             text, text_lengths = batch.SentimentText

#             predictions = model(text, text_lengths).squeeze(1)
            
#             loss = model.criterion(predictions, batch.Sentiment.float())
            
#             acc = binary_accuracy(predictions, batch.Sentiment.float())

#             epoch_loss += loss.item()
#             epoch_acc += acc.item()
        
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
# def epoch_time(start_time, end_time):
#     elapsed_time = end_time - start_time
#     elapsed_mins = int(elapsed_time / 60)
#     elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
#     return elapsed_mins, elapsed_secs

In [None]:
# INPUT_DIM = len(txt_field.vocab)
# EMBEDDING_DIM = 50
# HIDDEN_DIM = 256
# OUTPUT_DIM = 1
# N_LAYERS = 2
# DROPOUT = 0.5
# PAD_IDX = txt_field.vocab.stoi[txt_field.pad_token]
# OPTIMIZER = optim.Adam(model.parameters())
# CRITERION = nn.BCEWithLogitsLoss()

# model = LSTM(INPUT_DIM, 
#             EMBEDDING_DIM, 
#             HIDDEN_DIM, 
#             OUTPUT_DIM, 
#             N_LAYERS,
#             DROPOUT, 
#             PAD_IDX,
#             OPTIMIZER,
#             CRITERION)

In [None]:
trained_model = epoch_training(10, model, train_iterator, valid_iterator)

In [None]:
#OPTIMIZER = optim.Adam(model.parameters())
#CRITERION = nn.BCEWithLogitsLoss()

# model = LSTM(INPUT_DIM, 
#             EMBEDDING_DIM, 
#             HIDDEN_DIM, 
#             OUTPUT_DIM, 
#             N_LAYERS,
#             DROPOUT, 
#             PAD_IDX,
#             OPTIMIZER,
#             CRITERION)

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

NameError: name 'model' is not defined