In [1]:
import numpy as np 
import pandas as pd 

import os
import re
import time
import datetime
import string

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
import transformers
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, RobertaModel, RobertaTokenizer
from transformers import AutoConfig, AutoModel, AdamW, get_linear_schedule_with_warmup


In [3]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler


In [4]:
# load data

# The CSV is read with header=None, i.e. its first row is treated as data,
# so the two columns are named explicitly right after loading.
statement_df = pd.read_csv('all-data.csv',encoding='UTF-8',header=None)
statement_df.columns = ['sentiment','statement']

In [5]:
# Preview the raw frame as loaded (bare last expression -> rich display).
statement_df

Unnamed: 0,sentiment,statement
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [6]:
# set device for model training

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# set random seeds (there will still be some random variability though)

# Single seed shared by NumPy, PyTorch and (further down) the sklearn splitters.
RANDOM_SEED = 73
np.random.seed(RANDOM_SEED)      # seeds NumPy's global random state
# Seeds PyTorch's default generator; the call returns that generator, which is
# why the cell output shows a torch Generator repr.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f8dd0ebcc50>

In [8]:
# clean statements

def clean_statements(statement):
    """Undo tokenizer-style spacing around punctuation in one statement.

    The raw statements separate punctuation from the neighbouring word
    ("Gran , the company 's plans"). This re-attaches apostrophes,
    parentheses and closing punctuation and converts LaTeX-style double
    quotes (two backticks / two apostrophes) into a plain double quote.

    Args:
        statement: the raw statement text (str).

    Returns:
        The cleaned statement (str).
    """
    # Raw strings throughout: the previous non-raw '\( ' / ' \)' literals relied
    # on invalid escape sequences, which newer Python versions warn about.
    statement = re.sub(r" '", "'", statement)
    statement = re.sub(r" 's", "'s", statement)
    statement = re.sub(r"\( ", "(", statement)
    statement = re.sub(r" \)", ")", statement)
    statement = re.sub(r"``", '"', statement)
    statement = re.sub(r"''", '"', statement)
    # Drop the space in front of closing punctuation. The trailing whitespace
    # is checked with a lookahead instead of being consumed, so back-to-back
    # tokens such as "5 % ." are both fixed ("5%.") rather than only the first.
    statement = re.sub(r'\s([?.,%:!"])(?=\s|$)', r'\1', statement)
    return statement

# Run the cleaner over every statement and store the result back in the frame.
statement_df['statement'] = statement_df['statement'].map(clean_statements)

In [9]:
# Preview the frame again to check the effect of clean_statements.
statement_df

Unnamed: 0,sentiment,statement
0,neutral,"According to Gran, the company has no plans to..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company's updated strategy fo...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai's beer sales fell by 6.5 per cent t...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [10]:
# label encode classes

# Fit the encoder on the sentiment strings first, then swap the column for the
# integer codes. `le` is kept around because le.classes_ is used later to name
# the classes in the evaluation reports.
le = LabelEncoder()
le.fit(statement_df['sentiment'])
statement_df['sentiment'] = le.transform(statement_df['sentiment'])

In [11]:
# Preview the frame with the sentiment column now holding integer codes.
statement_df

Unnamed: 0,sentiment,statement
0,1,"According to Gran, the company has no plans to..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company's updated strategy fo...
...,...,...
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,1,Rinkuskiai's beer sales fell by 6.5 per cent t...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...


In [12]:
# set hyperparameters

MAX_LENGTH = 64        # max_length handed to the tokenizer (pad / truncate target)
BATCH_SIZE = 16        # batch size of every DataLoader built by create_dataloader
NUM_CLASSES = 3        # size of the classifier output layer
EPOCHS = 5             # epochs per model; also used to size the LR schedule
DROPOUT_PROB = 0.1     # dropout probability applied before the classification layer
WEIGHT_DECAY = 0.01    # weight_decay passed to AdamW
NFOLDS = 10            # n_splits of the StratifiedKFold used for cross validation
LEARNING_RATE = 2e-5   # lr passed to AdamW

In [13]:
# create helper function for time formatting

def format_time(elapsed):
    """Render a duration given in seconds as a timedelta string.

    Args:
        elapsed: duration in seconds (int or float).

    Returns:
        str: the duration rounded to whole seconds, formatted by
        ``datetime.timedelta`` (for example ``'1:01:01'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [14]:
# split data into train/val/test sets (80/10/10) for single model training

# Step 1: keep 80% for training and set aside a stratified 20% holdout.
df_train, df_holdout = train_test_split(
    statement_df,
    test_size=0.2,
    stratify=statement_df['sentiment'].values,
    random_state=RANDOM_SEED,
)

# Step 2: cut the holdout in half (again stratified) -> validation and test.
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.5,
    stratify=df_holdout['sentiment'].values,
    random_state=RANDOM_SEED,
)

# Train + validation rows together form the pool used for k-fold CV.
df_train_full = pd.concat([df_train, df_val])

In [15]:
# define Dataset class and functions for creating datasets

class StatementDataset(Dataset):
    """Map-style dataset that tokenizes one statement per requested item.

    Args:
        statements: indexable sequence of statement texts.
        labels: indexable sequence of integer class labels, aligned with
            ``statements``.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_length: length every encoded statement is padded / truncated to.
    """

    def __init__(self,statements,labels,tokenizer,max_length):
        self.statements = statements
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of statements."""
        return len(self.statements)

    def __getitem__(self,idx):
        """Tokenize the statement at ``idx``.

        Returns:
            dict with keys ``statement_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened 1-D tensors from the tokenizer) and
            ``labels`` (0-d ``torch.long`` tensor).
        """
        statement = str(self.statements[idx])
        label = self.labels[idx]

        # Fixed: the result used to be bound to the misspelled name
        # `enconding`, so the `encoding[...]` lookups below raised NameError.
        encoding = self.tokenizer.encode_plus(statement,
                                              max_length=self.max_length,
                                              padding='max_length',
                                              add_special_tokens=True,
                                              return_token_type_ids=False,
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors='pt'
                                             )

        return {
            'statement_text': statement,
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            # so the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
    

def create_dataset(df,tokenizer,max_length):
    """Wrap a statement/sentiment DataFrame in a StatementDataset.

    Args:
        df: DataFrame with a ``statement`` and a ``sentiment`` column.
        tokenizer: tokenizer handed through to the dataset.
        max_length: padded / truncated sequence length handed through to the
            dataset.

    Returns:
        StatementDataset over the frame's rows (positional order, because the
        columns are converted to NumPy arrays).
    """
    texts = df['statement'].to_numpy()
    targets = df['sentiment'].to_numpy()
    return StatementDataset(statements=texts,
                            labels=targets,
                            tokenizer=tokenizer,
                            max_length=max_length)


def create_dataloader(ds,batch_size,shuffle=False,num_workers=4):
    """Build a DataLoader over ``ds``.

    Args:
        ds: dataset to iterate over.
        batch_size: number of items per batch.
        shuffle: reshuffle the data every epoch. Defaults to False, which
            keeps the original sequential order (needed wherever predictions
            are later matched against labels by position).
        num_workers: number of worker processes used for loading. Defaults to
            4, the value that used to be hard-coded.

    Returns:
        torch.utils.data.DataLoader
    """
    return DataLoader(ds,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      num_workers=num_workers)

In [16]:
# helper functions for evaluating model accuracy

def cv_ensemble_performance(preds,labels):
    """Print confusion matrix and classification report for a summed ensemble.

    Args:
        preds: per-model score arrays stacked along the first axis; they are
            summed over that axis and the arg-max over axis 1 of the sum is
            taken as the ensemble prediction.
        labels: true class labels.
    """
    stacked_scores = np.array(preds)
    ensemble_scores = stacked_scores.sum(axis=0)
    ensemble_preds = ensemble_scores.argmax(axis=1)

    print(confusion_matrix(y_true=labels, y_pred=ensemble_preds))
    print('')
    report = classification_report(y_true=labels,
                                   y_pred=ensemble_preds,
                                   digits=3,
                                   target_names=le.classes_)
    print(report)

def single_model_performance(preds,labels):
    """Print confusion matrix and classification report for one model.

    Args:
        preds: predicted class labels.
        labels: true class labels.
    """
    matrix = confusion_matrix(y_true=labels, y_pred=preds)
    report = classification_report(y_true=labels,
                                   y_pred=preds,
                                   digits=3,
                                   target_names=le.classes_)
    print(matrix)
    print('')
    print(report)
    

# function for fine-tuning a single model for one epoch

def train_model(model,device,data_loader,loss_function,optimizer,scheduler,n_examples):
    """Run one training epoch and report its accuracy and mean loss.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class scores for each item of the batch.
        device: device the batch tensors are moved to.
        data_loader: iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``labels``.
        loss_function: callable ``(outputs, labels) -> scalar loss tensor``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: unused; kept so existing callers keep working. Accuracy
            is computed from the items actually seen.

    Returns:
        tuple ``(accuracy, mean_loss)`` over the epoch. For an empty
        ``data_loader`` this is ``(0.0, nan)``.
    """
    model = model.train()
    losses = []
    # Running counts replace the per-batch Python lists that were previously
    # built only to recompute the accuracy at the end of the epoch.
    n_correct = 0
    n_seen = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids,attention_mask=attention_mask)

        _,preds = torch.max(outputs,dim=1)
        loss = loss_function(outputs,labels)

        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if n_seen == 0:
        return 0.0, float('nan')
    return n_correct / n_seen, np.mean(losses)
    
    

In [17]:
# function for evaluating a single model
def eval_model(model, device, data_loader, loss_function, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class scores for each item of the batch.
        device: device the batch tensors are moved to.
        data_loader: iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``labels``.
        loss_function: callable ``(outputs, labels) -> scalar loss tensor``.
        n_examples: unused; kept so existing callers keep working. It used to
            feed a dead ``correct_preds.double() / n_examples`` expression
            that crashed on an empty loader.

    Returns:
        tuple ``(accuracy, mean_loss, preds, outputs)`` where ``preds`` is a
        flat list of predicted class indices and ``outputs`` a flat list of
        per-item score lists, both in loader order. For an empty loader this
        is ``(0.0, nan, [], [])``.
    """
    model = model.eval()

    losses = []
    n_correct = 0
    n_seen = 0
    complete_preds = []
    complete_outputs = []

    with torch.no_grad():
        for item in data_loader:
            input_ids = item['input_ids'].to(device)
            attention_mask = item['attention_mask'].to(device)
            labels = item['labels'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)

            _, preds = torch.max(outputs, dim=1)
            loss = loss_function(outputs, labels)

            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)
            # extend() keeps the lists flat, so no second flattening pass is needed.
            complete_preds.extend(preds.cpu().tolist())
            complete_outputs.extend(outputs.cpu().tolist())
            losses.append(loss.item())

    acc_score = n_correct / n_seen if n_seen else 0.0
    mean_loss = np.mean(losses) if losses else float('nan')

    return acc_score, mean_loss, complete_preds, complete_outputs

In [18]:
# function for completing a training fold for k-fold validation
def train_fold(epochs, model, device, train_dataloader, 
               val_dataloader, test_dataloader, loss_fn, optimizer, 
               scheduler, model_save_name, n_train, n_val, single_model=True):
    """Train ``model`` for ``epochs`` epochs and keep the best validation epoch.

    After every epoch the model is evaluated on ``val_dataloader``; whenever
    the validation accuracy improves, the weights are saved to
    ``model_save_name``. Once training is finished the best saved weights are
    loaded back into ``model``, so any evaluation that follows uses the same
    weights that produced the returned validation predictions.

    Args:
        epochs: number of training epochs.
        model: the model to train (modified in place).
        device: device used for training and evaluation.
        train_dataloader: batches used for training.
        val_dataloader: batches used for per-epoch validation.
        test_dataloader: batches used for the final report; only read when
            ``single_model`` is true. Its ``dataset`` must expose ``labels``
            in loader order (StatementDataset does).
        loss_fn: loss callable used for training, validation and testing.
        optimizer: optimizer handed to ``train_model``.
        scheduler: LR scheduler handed to ``train_model``.
        model_save_name: path the best state dict is written to.
        n_train: number of training examples (forwarded to ``train_model``).
        n_val: number of validation examples (forwarded to ``eval_model``).
        single_model: when true, evaluate on the test loader and print the
            confusion matrix / classification report.

    Returns:
        tuple ``(history, best_preds, best_outputs)``: per-epoch metrics plus
        the validation predictions and raw outputs of the best epoch
        (``None`` for both when ``epochs`` is 0).
    """
    start_time = time.time()
    history = defaultdict(list)
    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint and best_preds / best_outputs can never be left unbound.
    best_accuracy = float('-inf')
    best_preds = None
    best_outputs = None

    for epoch in range(epochs):
        epoch_start_time = time.time()
        print('Epoch ', epoch+1, '/', epochs)
        print('-'*50)

        train_acc, train_loss = train_model(model, 
                                            device, 
                                            train_dataloader, 
                                            loss_fn, 
                                            optimizer, 
                                            scheduler, 
                                            n_train)

        val_acc, val_loss, val_preds, val_outputs = eval_model(model, 
                                                               device, 
                                                               val_dataloader, 
                                                               loss_fn, 
                                                               n_val)

        history['train_accuracy'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_accuracy'].append(val_acc)
        history['val_loss'].append(val_loss)
        history['val_preds'].append(val_preds)

        if val_acc > best_accuracy:
            torch.save(model.state_dict(), model_save_name)
            best_accuracy = val_acc
            best_preds = val_preds
            best_outputs = val_outputs

        print('Train Loss: ', 
              train_loss, 
              ' | ', 
              'Train Accuracy: ', 
              train_acc)
        print('Val Loss: ', 
              val_loss, 
              ' | ', 
              'Val Accuracy: ', 
              val_acc)
        print('Epoch Train Time: ', 
              format_time(time.time() - epoch_start_time))
        print('\n')

    print('Finished Training.')   
    print('Fold Train Time: ', format_time(time.time() - start_time))
    print('\n')

    # Restore the best checkpoint; otherwise everything evaluated after this
    # point would use the last-epoch weights instead of the saved best ones.
    if best_preds is not None:
        model.load_state_dict(torch.load(model_save_name, map_location=device))

    if single_model:
        # Use the loss passed in and the loader's own dataset rather than the
        # notebook-level `loss_function` / `df_test` globals, so predictions
        # and labels always come from the same data.
        test_dataset = test_dataloader.dataset
        _, _, test_preds, test_outputs = eval_model(model, 
                                                    device, 
                                                    test_dataloader, 
                                                    loss_fn, 
                                                    len(test_dataset))

        single_model_performance(test_preds, test_dataset.labels)
    return history, best_preds, best_outputs

In [19]:
# function for completing k-fold validation
def get_oof_and_test_preds(model_type, tokenizer, 
                           train_df, test_df, single_model=False):
    """Train one model per CV fold and collect each model's test-set outputs.

    Relies on notebook-level names: ``skf`` (fold splitter), ``loss_function``,
    ``device`` and the hyperparameter constants.

    Args:
        model_type: ``'bert'``, ``'distilbert'`` or ``'roberta'``.
        tokenizer: tokenizer matching ``model_type``.
        train_df: DataFrame with ``statement`` and ``sentiment`` columns that
            is split into folds.
        test_df: DataFrame with the same columns, scored by every fold model.
        single_model: forwarded to ``train_fold``; when true each fold also
            prints its own test report.

    Returns:
        tuple ``(history_list, test_outputs_list)``: the per-fold training
        histories and the per-fold raw test outputs (one list per fold).

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    history_list = []
    test_outputs_list = []
    start_time = time.time()

    x_train = train_df['statement']
    y_train = train_df['sentiment']

    # The test set is the same for every fold, so build it once.
    test_ds = create_dataset(test_df, tokenizer, MAX_LENGTH)
    test_loader = create_dataloader(test_ds, BATCH_SIZE)

    for fold, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):
        print('Fold: {}'.format(fold+1))

        # create_dataset reads the columns positionally (to_numpy), so the
        # row slices can be used directly without rebuilding new frames.
        train = train_df.iloc[train_index]
        val = train_df.iloc[val_index]

        train_ds = create_dataset(train, tokenizer, MAX_LENGTH)
        val_ds = create_dataset(val, tokenizer, MAX_LENGTH)

        if model_type == 'bert':
            model = BERTSentimentClassifier(NUM_CLASSES)
        elif model_type == 'distilbert':
            model = DistilBertForSequenceClassification(pretrained_model_name=DISTILBERT_MODEL_NAME, 
                                                        num_classes=NUM_CLASSES)
        elif model_type == 'roberta':
            model = RobertaSentimentClassifier(n_classes=NUM_CLASSES)
        else:
            # Previously an unknown name fell through and failed later with an
            # unbound `model`.
            raise ValueError("Unknown model_type: {!r}".format(model_type))
        model = model.to(device)

        train_loader = create_dataloader(train_ds, BATCH_SIZE)
        val_loader = create_dataloader(val_ds, BATCH_SIZE)

        # The scheduler is stepped once per batch, so the schedule length is
        # batches-per-epoch * epochs. Counting examples (len(loader.dataset))
        # overstated it by a factor of BATCH_SIZE, which kept the whole run
        # inside the warm-up phase.
        training_steps = len(train_loader) * EPOCHS
        warmup_steps = int(0.1 * training_steps)
        optimizer = AdamW(model.parameters(), 
                          lr=LEARNING_RATE, 
                          weight_decay=WEIGHT_DECAY, 
                          correct_bias=True)
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps=warmup_steps, 
                                                    num_training_steps=training_steps)

        model_save_name = '{}_fold_{}.bin'.format(model_type, fold)

        history, _, _ = train_fold(epochs=EPOCHS,
                                   model=model, 
                                   device=device, 
                                   train_dataloader=train_loader, 
                                   val_dataloader=val_loader,
                                   test_dataloader=test_loader,
                                   loss_fn=loss_function,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   model_save_name=model_save_name,
                                   n_train=len(train),
                                   n_val=len(val),
                                   single_model=single_model
                                  )

        history_list.append(history)

        _, _, _, test_outputs = eval_model(model, 
                                           device, 
                                           test_loader, 
                                           loss_function, 
                                           len(test_df))
        test_outputs_list.append(test_outputs)

    print(str(NFOLDS), 'Fold CV Train Time: ', format_time(time.time() - start_time))
    return history_list, test_outputs_list

In [20]:
# loss: cross entropy over the class scores, placed on the training device
loss_function = nn.CrossEntropyLoss().to(device)

# stratified, shuffled, seeded splitter used for the k-fold CV runs
skf = StratifiedKFold(n_splits=NFOLDS, random_state=RANDOM_SEED, shuffle=True)

# tokenizer matching the pretrained BERT checkpoint
BERT_MODEL_NAME = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

# one dataset + dataloader per split for the single-model run
bert_train_ds = create_dataset(df_train, bert_tokenizer, MAX_LENGTH)
bert_train_dataloader = create_dataloader(bert_train_ds, BATCH_SIZE)

bert_val_ds = create_dataset(df_val, bert_tokenizer, MAX_LENGTH)
bert_val_dataloader = create_dataloader(bert_val_ds, BATCH_SIZE)

bert_test_ds = create_dataset(df_test, bert_tokenizer, MAX_LENGTH)
bert_test_dataloader = create_dataloader(bert_test_ds, BATCH_SIZE)

# define BERT sentiment classifier model using BertModel as base
class BERTSentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        n_classes: number of output classes (width of the linear layer).
    """

    def __init__(self, n_classes):
        super(BERTSentimentClassifier, self).__init__()
        self.model = BertModel.from_pretrained(BERT_MODEL_NAME)
        self.drop = nn.Dropout(DROPOUT_PROB)
        self.output = nn.Linear(self.model.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the per-class scores for a batch.

        Args:
            input_ids: token id tensor for the batch.
            attention_mask: attention mask tensor for the batch.

        Returns:
            Tensor produced by the linear output layer (one row per item,
            ``n_classes`` columns).
        """
        encoder_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Take the pooled output by position instead of tuple-unpacking:
        # position 1 is valid both for the plain tuple returned by older
        # transformers releases and for the ModelOutput object returned by
        # newer ones (unpacking a ModelOutput yields its field names).
        pooled_output = encoder_outputs[1]

        output = self.drop(pooled_output)

        return self.output(output)
        
bert_model = BERTSentimentClassifier(NUM_CLASSES)
bert_model = bert_model.to(device)

# set up optimizer and scheduler
# The scheduler is stepped once per batch, so the schedule must span
# batches-per-epoch * epochs steps. len(dataloader) is the number of batches;
# len(dataloader.dataset) (used before) is the number of examples, which made
# the schedule BATCH_SIZE times too long and the warm-up outlast the training.
training_steps = len(bert_train_dataloader) * EPOCHS

bert_optimizer = AdamW(bert_model.parameters(), 
                       lr=LEARNING_RATE, 
                       weight_decay=WEIGHT_DECAY, 
                       correct_bias=True)

# linear warm-up over the first 10% of the steps, linear decay afterwards
warmup_steps = int(0.1 * training_steps)
bert_scheduler = get_linear_schedule_with_warmup(bert_optimizer, 
                                                 num_warmup_steps=warmup_steps, 
                                                 num_training_steps=training_steps)
# train single model                                                 
# Single-model run on the fixed train/val/test split. single_model=True makes
# train_fold also evaluate on the test loader and print the report.
bert_history, bert_preds, bert_outputs = train_fold(epochs=EPOCHS, 
                                                    model=bert_model, 
                                                    device=device, 
                                                    train_dataloader=bert_train_dataloader, 
                                                    val_dataloader=bert_val_dataloader,
                                                    test_dataloader=bert_test_dataloader,
                                                    loss_fn=loss_function,
                                                    optimizer=bert_optimizer,
                                                    scheduler=bert_scheduler,
                                                    model_save_name='bert_best_model.bin',
                                                    n_train=len(df_train),
                                                    n_val=len(df_val),
                                                    single_model=True
                                                   )
                                                   
# train models using 10 fold cross validation
# The folds are drawn from df_train_full (train + val rows); df_test stays held out.
# NOTE(review): this rebinds bert_history -- from here on it holds the list of
# per-fold histories, not the single-model history from the call above.
bert_history, bert_test_outputs = get_oof_and_test_preds(model_type='bert', 
                                                         tokenizer=bert_tokenizer, 
                                                         train_df=df_train_full, 
                                                         test_df=df_test,
                                                         single_model=False)                                                 

# evaluate performance of ensemble model created using 10 fold cross validation
# (the per-fold test outputs are summed and the arg-max is the ensemble prediction)
cv_ensemble_performance(bert_test_outputs, df_test['sentiment'].values)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=435779157.0), HTML(value='')))


Epoch  1 / 5
--------------------------------------------------


KeyboardInterrupt: 