In [1]:
import torch
import transformers
import pandas as pd
from transformers import BertTokenizer
import random
import numpy as np
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
df_train = pd.read_csv('/data/sentimentanalysis/train_df.csv')
df_test = pd.read_csv('/data/sentimentanalysis/test_df.csv')
df_val = pd.read_csv('/data/sentimentanalysis/valid_df.csv', )

df = pd.concat([df_train, df_test, df_val], axis=0, ignore_index=True)

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [4]:
%%time
input_sentences = []
attention_masks = []

for sentence in df.Review:
    input_sen = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=tokenizer.model_max_length,
                                      pad_to_max_length=True, return_tensors='pt', return_attention_mask=True)
    
    input_sentences.append(input_sen['input_ids'])
    attention_masks.append(input_sen['attention_mask'])
    
    
labels = torch.tensor(df.Label)

CPU times: user 3min 58s, sys: 467 ms, total: 3min 58s
Wall time: 4min


In [5]:
input_sentences = torch.cat(input_sentences, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_sentences, attention_masks, labels)

train_size = int(0.7*len(dataset))
test_size = int(0.15*len(dataset))
valid_size = len(dataset) - train_size - test_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])

print('{} training samples'.format(train_size))
print('{} test samples'.format(test_size))
print('{} validation samples'.format(valid_size))


17,500 training samples
3,750 test samples
3,750 validation samples


In [7]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 5

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset),
                          batch_size=batch_size)

valid_loader = DataLoader(val_dataset, sampler=RandomSampler(val_dataset),
                          batch_size=batch_size)

test_loader = DataLoader(test_dataset, sampler=RandomSampler(test_dataset),
                         batch_size=batch_size)

In [8]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,   
    output_attentions = False, 
    output_hidden_states = False,
)

model.cuda()


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
total_steps = len(train_loader)*epochs

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


In [13]:
import numpy as np

def accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [14]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))



In [15]:
training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):

    print("")
    print('Epoch {:} / {:} '.format(epoch_i + 1, epochs))

    # Measure how long the training epoch takes.
    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_loader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {}  of  {}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

 
        # batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        total_train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_loader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training took: {:}".format(training_time))
        

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in valid_loader:
    
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        (loss, logits) = model(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask,
                               labels=b_labels)
            
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        total_eval_accuracy += accuracy(logits, label_ids)
        

    avg_val_accuracy = total_eval_accuracy / len(valid_loader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(valid_loader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


  Batch    40  of  3,500.    Elapsed: 0:00:14.
  Batch    80  of  3,500.    Elapsed: 0:00:27.
  Batch   120  of  3,500.    Elapsed: 0:00:40.
  Batch   160  of  3,500.    Elapsed: 0:00:53.
  Batch   200  of  3,500.    Elapsed: 0:01:06.
  Batch   240  of  3,500.    Elapsed: 0:01:19.
  Batch   280  of  3,500.    Elapsed: 0:01:32.
  Batch   320  of  3,500.    Elapsed: 0:01:46.
  Batch   360  of  3,500.    Elapsed: 0:01:59.
  Batch   400  of  3,500.    Elapsed: 0:02:12.
  Batch   440  of  3,500.    Elapsed: 0:02:25.
  Batch   480  of  3,500.    Elapsed: 0:02:38.
  Batch   520  of  3,500.    Elapsed: 0:02:51.
  Batch   560  of  3,500.    Elapsed: 0:03:04.
  Batch   600  of  3,500.    Elapsed: 0:03:17.
  Batch   640  of  3,500.    Elapsed: 0:03:30.
  Batch   680  of  3,500.    Elapsed: 0:03:43.
  Batch   720  of  3,500.    Elapsed: 0:03:56.
  Batch   760  of  3,500.    Elapsed: 0:04:09.
  Batch   800  of  3,500.    Elapsed: 0:04:22.
  Batch   840  of  3,500.    Elapsed: 0:04:35.
  Batch   880

In [16]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.35042030596124407,
  'Valid. Loss': 0.3146270592676786,
  'Valid. Accur.': 0.9210666666666657,
  'Training Time': '0:19:02',
  'Validation Time': '0:01:16'},
 {'epoch': 2,
  'Training Loss': 0.17831795548703355,
  'Valid. Loss': 0.3476218721763774,
  'Valid. Accur.': 0.9322666666666658,
  'Training Time': '0:18:58',
  'Validation Time': '0:01:16'},
 {'epoch': 3,
  'Training Loss': 0.06698844165046258,
  'Valid. Loss': 0.4026231278688356,
  'Valid. Accur.': 0.9322666666666658,
  'Training Time': '0:18:57',
  'Validation Time': '0:01:16'}]

In [17]:
# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch


    outputs = model(b_input_ids, token_type_ids=None, 
                    attention_mask=b_input_mask)

    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

In [18]:
total_test_accuracy = 0
for pred, label in zip(predictions, true_labels):
    total_test_accuracy += accuracy(pred, label)

avg_test_accuracy = total_test_accuracy / len(test_loader)
print("Test accuracy: {0:.2f}".format(avg_test_accuracy))

Test accuracy: 0.92


In [66]:
def predict_sentiment(sentence):
    tokenized = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=tokenizer.model_max_length,
                                      pad_to_max_length=True, return_tensors='pt', return_attention_mask=True)
    tokenized = tokenized.to(device)
    
    model.eval()
    with torch.no_grad():
        logits = model(tokenized['input_ids'], 
                          token_type_ids=None, 
                          attention_mask=tokenized['attention_mask'])
    
    logits = logits[0]
    print(torch.sigmoid(logits))
    pred = np.argmax(logits.cpu(), axis = 1)
    if pred == 1:
        print('Positive review!')
    else:
        print('Negative review!')
    

In [67]:
pos_rew = 'This film is great'
predict_sentiment(pos_rew)

tensor([[0.0181, 0.9910]], device='cuda:0')
Positive review!


In [68]:
neg_rew = "This film is terrible"
predict_sentiment(neg_rew)

tensor([[0.9816, 0.0116]], device='cuda:0')
Negative review!
