In [None]:
!pip install pytorch_pretrained_bert pytorch-nlp
!apt install unixodbc-dev
!pip install pyodbc

In [None]:
%%sh
# Register Microsoft's apt repository and install the msodbcsql17 package.
# Step 1: add Microsoft's package signing key to apt's trusted keys.
curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
# Step 2: write Microsoft's Ubuntu 16.04 package list into apt's sources directory.
# NOTE(review): the list is pinned to ubuntu/16.04 - confirm it matches the host release.
curl https://packages.microsoft.com/config/ubuntu/16.04/prod.list > /etc/apt/sources.list.d/mssql-release.list
# Step 3: refresh the package index so the newly added source is picked up.
sudo apt-get update
# Step 4: install the package; ACCEPT_EULA=Y together with -q -y keeps the
# installation non-interactive.
sudo ACCEPT_EULA=Y apt-get -q -y install msodbcsql17

In [None]:
import csv
import os
import random as rn
import sys

import numpy as np
import pyodbc
import torch
from IPython.display import clear_output
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertModel
from pytorch_pretrained_bert import BertTokenizer
from sklearn.metrics import classification_report
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchnlp.datasets import imdb_dataset

In [None]:
# Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with one shared
# value so that repeated runs start from the same random state.
SEED = 2020
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

## Data Pre-processing

In [None]:
# Set up the database configuration used by the query functions below.
#
# The password is intentionally NOT stored in the notebook. Export it as the
# POSITIVE_AI_DB_PASSWORD environment variable before starting the kernel.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the server, since it remains visible in version history.

server = 'positive-ai-server.database.windows.net'
database = 'positive-ai-db'
username = 'positive-ai-admin'
password = os.environ.get('POSITIVE_AI_DB_PASSWORD', '')
driver= '{ODBC Driver 17 for SQL Server}'

if not password:
    # Fail loudly but late: the cell still runs, and the connection attempt
    # itself will report the authentication error.
    print('WARNING: POSITIVE_AI_DB_PASSWORD is not set; database queries will fail to authenticate.')

def query_training_set():
    """Fetch every row of the ``training_dataset_new`` table.

    Opens a connection with the module-level settings (``driver``, ``server``,
    ``database``, ``username``, ``password``), reads the ``Text`` and
    ``Sentiment`` columns, and closes the connection again.

    Returns:
        tuple: ``(texts, labels)`` - two parallel lists holding the ``Text``
        and ``Sentiment`` value of each row, in the order the rows were fetched.
    """
    cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
    try:
        cursor = cnxn.cursor()
        cursor.execute("select Text, Sentiment from training_dataset_new")
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query raises; previously every
        # call left its connection open.
        cnxn.close()
    texts = [row.Text for row in rows]
    labels = [row.Sentiment for row in rows]
    return texts,labels

def query_testing_set():
    """Fetch every row of the ``testing_dataset`` table.

    Opens a connection with the module-level settings (``driver``, ``server``,
    ``database``, ``username``, ``password``), reads the ``Text`` and
    ``Sentiment`` columns, and closes the connection again.

    Returns:
        tuple: ``(texts, labels)`` - two parallel lists holding the ``Text``
        and ``Sentiment`` value of each row, in the order the rows were fetched.
    """
    cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
    try:
        cursor = cnxn.cursor()
        cursor.execute("select Text, Sentiment from testing_dataset")
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query raises; previously every
        # call left its connection open.
        cnxn.close()
    texts = [row.Text for row in rows]
    labels = [row.Sentiment for row in rows]
    return texts,labels

def query_dataset(db):
    """Fetch the ``text`` and ``sentiment`` columns of every row in table ``db``.

    Opens a connection with the module-level settings (``driver``, ``server``,
    ``database``, ``username``, ``password``), runs the query, and closes the
    connection again.

    Args:
        db: Name of the table to read. Despite the parameter name, it is used
            as the table in the ``from`` clause.

    Returns:
        tuple: ``(texts, labels)`` - two parallel lists holding the ``text``
        and ``sentiment`` value of each row, in the order the rows were fetched.
    """
    cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
    try:
        cursor = cnxn.cursor()
        # NOTE(review): the table name is concatenated into the SQL text (table
        # names cannot be bound as query parameters). Only pass trusted,
        # hard-coded names here - never user-supplied input.
        cursor.execute("select text, sentiment from "+db)
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query raises; previously every
        # call left its connection open.
        cnxn.close()
    texts = [row.text for row in rows]
    labels = [row.sentiment for row in rows]
    return texts,labels

In [None]:
# Prepare the training set, validation set, and testing set

good_news_texts, good_news_labels = query_dataset("Good_News")
pos_web_texts, pos_web_labels = query_dataset("Pos_Web")
bad_news_texts, bad_news_labels = query_dataset("Bad_News")
neg_news_texts, neg_news_labels = query_dataset("Neg_News")
new_texts, new_labels = query_training_set()

# Training split: all of Good_News, Neg_News and training_dataset_new, plus the
# first 500 Pos_Web rows and the first 9580 Bad_News rows.
train_texts = good_news_texts + pos_web_texts[:500] + bad_news_texts[:9580] + neg_news_texts + new_texts
train_labels = good_news_labels + pos_web_labels[:500] + bad_news_labels[:9580] + neg_news_labels + new_labels

# Validation split: the next 500 rows of Pos_Web and of Bad_News.
val_texts = pos_web_texts[500:1000] + bad_news_texts[9580:10080]
# Fixed: the second term used to slice bad_news_texts, which put article text
# where sentiment labels belong and made every such label compare unequal to 'pos'
# for the wrong reason.
val_labels = pos_web_labels[500:1000] + bad_news_labels[9580:10080]

# Alternative split (disabled):
# train_texts = good_news_texts + new_texts + pos_web_texts[:1000] + bad_news_texts + neg_news_texts
# train_labels = good_news_labels + new_labels + pos_web_labels[:1000] + bad_news_labels + neg_news_labels
# val_texts, val_labels = query_testing_set()
test_texts, test_labels = query_testing_set()

# Texts and labels must stay aligned one-to-one in every split.
assert len(train_texts) == len(train_labels)
assert len(val_texts) == len(val_labels)
assert len(test_texts) == len(test_labels)

# Alias kept for the progress print in the training loop.
train_data = train_texts

In [None]:
# Shuffle the training set
# Each text is paired with its label first, so a single shuffle reorders both
# sequences identically; the pairs are then split back into two tuples.
paired_examples = zip(train_texts, train_labels)
data = [*paired_examples]
rn.shuffle(data)
train_texts, train_labels = zip(*data)

In [None]:
# Load the BERT tokenizer
# Lower-casing is requested explicitly, in line with the "uncased" vocabulary name.
BERT_VOCAB = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB, do_lower_case=True)

In [None]:
# Tokenization
def _wrap_tokens(text):
    """Tokenize one text, keep at most 510 word pieces, and frame them with [CLS] and [SEP].

    510 pieces plus the two marker tokens gives at most 512 tokens per example.
    """
    pieces = tokenizer.tokenize(text)[:510]
    return ['[CLS]'] + pieces + ['[SEP]']

train_tokens = [_wrap_tokens(text) for text in train_texts]
val_tokens = [_wrap_tokens(text) for text in val_texts]
test_tokens = [_wrap_tokens(text) for text in test_texts]

In [None]:
# Convert tokens to ids, pad with zeros if needed
def _encode_and_pad(token_lists):
    """Map each token list to vocabulary ids, then pad/truncate at the end to 512 ids."""
    id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    return pad_sequences(id_lists, maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids = _encode_and_pad(train_tokens)
val_tokens_ids = _encode_and_pad(val_tokens)
test_tokens_ids = _encode_and_pad(test_tokens)

In [None]:
# Prepare the labels
# The element-wise comparison yields a boolean array per split:
# True where the stored label equals 'pos', False otherwise.
POSITIVE_LABEL = 'pos'
train_y, val_y, test_y = (
    np.array(split_labels) == POSITIVE_LABEL
    for split_labels in (train_labels, val_labels, test_labels)
)

In [None]:
# Create attention mask
# 1.0 marks a position holding a token id greater than zero, 0.0 marks the
# zero-padding added when the id sequences were padded.
# Vectorised with NumPy instead of a nested comprehension: the comprehension
# allocated one Python float object per token position (512 per example).
# float32 is used so torch.tensor() still produces a float32 tensor, as it did
# for the former lists of Python floats.
train_masks = (train_tokens_ids > 0).astype(np.float32)
val_masks = (val_tokens_ids > 0).astype(np.float32)
test_masks = (test_tokens_ids > 0).astype(np.float32)

# BERT Model

In [None]:
# Create BertClassifier

class BertBinaryClassifier(nn.Module):
    """Binary classifier: pre-trained BERT, dropout, one linear unit, sigmoid.

    Args:
        dropout: Dropout probability applied to BERT's pooled output
            before the linear layer (default 0.1).
    """

    def __init__(self, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # 768 input features feed a single output unit.
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid output for each example.

        Args:
            tokens: Token ids passed straight to the BERT encoder.
            masks: Optional attention mask forwarded as ``attention_mask``.

        Returns:
            The sigmoid of the linear layer's output, i.e. values in (0, 1)
            with one column per example.
        """
        # Only the pooled output is used; the per-token encodings are discarded.
        _, pooled = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        return self.sigmoid(self.linear(self.dropout(pooled)))
        

In [None]:
# specify the GPU as the device
# Prefer CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Instantiate Bert Classifier
bert_clf = BertBinaryClassifier()
# Move the model to the device chosen above. The previous unconditional
# .cuda() call ignored the CPU fallback and raised on machines without a GPU,
# while every batch is moved with .to(device).
bert_clf = bert_clf.to(device)

# Training and Fine-tuning

In [None]:
# Set batch size and epochs
BATCH_SIZE: int = 8  # examples per batch for every DataLoader
EPOCHS: int = 5      # number of passes over the training set

In [None]:
# Convert data to torch.tensor
def _as_label_column(y):
    """Reshape a label array to a single column and return it as a float tensor."""
    return torch.tensor(y.reshape(-1, 1)).float()

# Training split
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_masks_tensor = torch.tensor(train_masks)
train_y_tensor = _as_label_column(train_y)

# Validation split
val_tokens_tensor = torch.tensor(val_tokens_ids)
val_masks_tensor = torch.tensor(val_masks)
val_y_tensor = _as_label_column(val_y)

# Testing split
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_masks_tensor = torch.tensor(test_masks)
test_y_tensor = _as_label_column(test_y)

In [None]:
# Create the DataLoader for training, validation and testing set
# Each dataset bundles (token ids, attention mask, label) per example.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
val_dataset = TensorDataset(val_tokens_tensor, val_masks_tensor, val_y_tensor)
test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)

# Training batches are drawn in random order; validation and testing keep the
# stored order.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler)


In [None]:
# Create the optimizer
# Adam updates every parameter of the classifier with a learning rate of 3e-5.
# The former `param_optimizer` / `optimizer_grouped_parameters` variables were
# removed: they were built from the sigmoid layer, which has no parameters,
# and were never passed to the optimizer.
optimizer = Adam(bert_clf.parameters(), lr=3e-5)

In [None]:
# Create evaluate and test functions
def evaluate(bert_clf, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance on validation set.
    """
    bert_clf.eval()
    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        token_ids, masks, labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = bert_clf(token_ids, masks)

        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

def test(bert_clf, test_dataloader):
    """
    After the completion of whole training process, measure the model's performance on testing set.
    """
    bert_clf.eval()
    bert_predicted = []
    all_logits = []

    with torch.no_grad():
      for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()  
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

    print(classification_report(test_y, bert_predicted))

In [None]:
# The training process
# For every epoch: train on all batches, then report validation loss/accuracy
# and print the classification report for the testing set.
loss_func = nn.BCELoss()  # holds no state, so one instance serves every step
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    last_step = len(train_dataloader) - 1

    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        # Forward pass and running total of the batch losses.
        logits = bert_clf(token_ids, masks)
        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()

        # Backward pass: reset gradients, back-propagate, clip the gradient
        # norm to 1.0, then apply the optimizer update.
        bert_clf.zero_grad()
        batch_loss.backward()
        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        # Report every 2000 steps (but not at step 0) and on the final step.
        at_interval = step_num > 0 and step_num % 2000 == 0
        if at_interval or step_num == last_step:
            running_loss = train_loss / (step_num + 1)
            print("Epoch:{0} {1}/{2} loss: {3} ".format(epoch_num + 1, step_num, len(train_data) / BATCH_SIZE, running_loss))

    avg_train_loss = train_loss / len(train_dataloader)
    val_loss, val_accuracy = evaluate(bert_clf, val_dataloader)
    print("Train loss: "+str(avg_train_loss)+" Val loss :"+str(val_loss)+" Val acc: "+str(val_accuracy))
    test(bert_clf, test_dataloader)
    print("-"*70)

In [None]:
# Save the model
# Only the state dict (the model's parameters and buffers) is written, not the
# model object itself.
MODEL_PATH = 'bert_model.bin'
model_weights = bert_clf.state_dict()
torch.save(model_weights, MODEL_PATH)