In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast
from transformers import AutoTokenizer, DataCollatorWithPadding
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pytorch_lightning as pl
import os.path
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import BertTokenizer, BertModel

In [8]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [3]:


# Module-level epoch counter shared with BERT_unfreeze: it is incremented once per
# validation_epoch_end call and written into the 'epoch' column by export_metrics.
# NOTE(review): this is hidden kernel state -- it is only reset when this statement is
# re-executed, so training a second model in the same kernel continues the count.
epoch = 0


class BERT_unfreeze(pl.LightningModule):
    """BERT classifier with a partially frozen encoder and a 2-class head.

    The pooled output of ``bert-base-uncased`` is passed through
    ``Linear(768, 512) -> ReLU -> Dropout(0.2) -> Linear(512, 2) -> LogSoftmax``
    and the resulting log-probabilities are scored with a class-weighted
    ``nn.NLLLoss``. Epoch-level train/val metrics and per-batch test metrics
    are appended to CSV files under ``metrics_dir``.

    Args:
        weights: class-weight tensor handed to ``nn.NLLLoss(weight=...)``.
        max_length: stored on the module; used here only in the metrics file name.
        learning_rate: learning rate given to the AdamW optimizer.
        metrics_dir: directory the metrics CSV files are written to. Defaults
            to the location that used to be hard-coded in ``export_metrics``.

    NOTE(review): ``training_epoch_end`` / ``validation_epoch_end`` are hooks of
    the pytorch_lightning 1.x API -- confirm the installed version still calls them.
    """

    def __init__(self, weights, max_length, learning_rate=1e-5,
                 metrics_dir="/Users/vladandreichuk/Desktop/practical-data-science-tutorial/models/BERT_metrics"):
        super().__init__()
        self.save_hyperparameters()
        self.loss = nn.NLLLoss(weight=weights)
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.metrics_dir = metrics_dir

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Freeze encoder layers 0-8. Encoder layers from index 9 onward stay
        # trainable (the last three for a 12-layer bert-base model). This loop
        # does not touch the embeddings or the pooler.
        for layer in self.bert.encoder.layer[:9]:
            for param in layer.parameters():
                param.requires_grad = False

        # dropout layer
        self.dropout = nn.Dropout(0.2)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 512)

        # dense layer 2 (output layer, two classes)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax, so the output pairs with nn.NLLLoss
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities over the two classes.

        Args:
            sent_id: token-id tensor passed to BERT as its first argument.
            mask: attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; the second element
        # (the pooled output) feeds the classification head.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # log-softmax over the class dimension
        return self.softmax(x)

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau scheduler driven by the logged ``val_loss``."""
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = {
            'scheduler': ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=1),
            'monitor': 'val_loss',
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def export_metrics(self, acc, loss, step):
        """Append one ``(epoch, acc, loss)`` row to the metrics CSV for ``step``.

        The file name encodes ``max_length``, ``learning_rate`` and ``step``.
        The ``epoch`` value is read from the module-level ``epoch`` counter.

        Args:
            acc: accuracy value to record.
            loss: loss value; converted with ``float()`` before writing.
            step: label used in the file name (``"train"``, ``"val"`` or ``"test"``).
        """
        metrics_filename = f"{self.metrics_dir}/max_l{self.max_length}_lr{self.learning_rate}_{step}_metrics.csv"

        metrics_df = pd.DataFrame([[epoch, acc, float(loss)]], columns=['epoch', 'acc', 'loss'])

        # Create the target directory on first use instead of failing in to_csv.
        os.makedirs(self.metrics_dir, exist_ok=True)

        # Append in place rather than re-reading and rewriting the whole file;
        # the header is written only when the file is first created.
        write_header = not os.path.isfile(metrics_filename)
        metrics_df.to_csv(metrics_filename, mode='a', header=write_header, index=False)

        return

    @staticmethod
    def _epoch_mean(outputs, key):
        """Unweighted mean of ``key`` over the per-batch output dicts."""
        return sum(x[key] for x in outputs) / len(outputs)

    def training_step(self, train_batch, batch_idx):
        """Compute loss and accuracy for one training batch.

        Returns a dict with ``"loss"`` (used by Lightning for the backward
        pass) and ``"train_acc"``.
        """
        loss, acc = self._shared_eval_step(train_batch, batch_idx)
        return {"train_acc": acc, "loss": loss}

    def training_epoch_end(self, outputs):
        """Average the per-batch training metrics and export them."""
        if not outputs:
            # No batches ran (e.g. everything removed by drop_last): nothing to average.
            return

        acc = self._epoch_mean(outputs, 'train_acc')
        loss = self._epoch_mean(outputs, 'loss')

        self.export_metrics(acc, loss, "train")

    def validation_step(self, val_batch, batch_idx):
        """Evaluate one validation batch and log ``val_acc`` / ``val_loss``."""
        # A single forward pass: _shared_eval_step already runs the model.
        loss, acc = self._shared_eval_step(val_batch, batch_idx)
        metrics = {"val_acc": acc, "val_loss": loss}
        self.log_dict(metrics)

        return metrics

    def validation_epoch_end(self, outputs):
        """Advance the global epoch counter, then export averaged validation metrics.

        NOTE(review): the counter advances on every call of this hook, so any
        extra validation run also shifts the recorded epoch numbers.
        """
        global epoch
        epoch = epoch + 1

        if not outputs:
            return

        acc = self._epoch_mean(outputs, 'val_acc')
        loss = self._epoch_mean(outputs, 'val_loss')

        self.export_metrics(acc, loss, "val")

    def test_step(self, test_batch, batch_idx):
        """Evaluate one test batch, export its metrics row and log them."""
        # A single forward pass: _shared_eval_step already runs the model.
        loss, acc = self._shared_eval_step(test_batch, batch_idx)
        metrics = {"test_acc": acc, "test_loss": loss}
        # One CSV row is written per test batch (not per epoch).
        self.export_metrics(acc, loss, 'test')
        self.log_dict(metrics)

        return metrics

    def predict_step(self, predict_batch, batch_idx):
        """Return the model's log-probabilities for a batch; the labels are ignored."""
        sent_id, mask, labels = predict_batch

        return self(sent_id, mask)

    def _shared_eval_step(self, batch, batch_idx):
        """Run the model on ``batch`` and return ``(loss, accuracy)``.

        ``batch`` is unpacked as ``(sent_id, mask, labels)``. ``loss`` is the
        tensor returned by the weighted NLL loss; ``accuracy`` is a Python
        float: the fraction of labels equal to the arg-max prediction.
        """
        sent_id, mask, labels = batch

        # model predictions
        preds = self(sent_id, mask)

        # weighted NLL loss between predicted log-probabilities and labels
        loss = self.loss(preds, labels)

        # fraction of correct arg-max predictions in this batch
        acc = torch.sum(labels == torch.argmax(preds, dim=1)).item() / len(labels)

        return loss, acc

In [None]:
from transformers import AutoModel, BertTokenizerFast
from transformers import AutoTokenizer, DataCollatorWithPadding
from sklearn.utils.class_weight import compute_class_weight


class POLUSADataModule(pl.LightningDataModule):
    """Lightning data module that tokenizes CSV splits with the BERT tokenizer.

    Each split is read from ``{data_dir}/{prefix}_center_{split}.csv`` and must
    provide a ``body`` (text) column and a ``label`` column.

    Args:
        batch_size: batch size used by every dataloader.
        max_length: truncation length passed to the tokenizer.
        data_dir: directory containing the split CSV files.
        test_nrows: forwarded to ``pd.read_csv(nrows=...)``; ``None`` reads all rows.
        prefix: file-name prefix of the split files. Optional for backward
            compatibility: when omitted, ``self.prefix`` must be assigned by
            the caller before ``setup`` is invoked.

    NOTE(review): the ``data_dir`` default is an f-string evaluated when the
    class is defined, so a module-level ``vers`` must already exist at that point.
    """

    def __init__(self, batch_size=32, max_length=256, data_dir=f'/net/projects/ycleong/neuralsentiment/scripts/vlad/data/300_manual_v2/v{vers}', test_nrows=None, prefix=None):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_length = max_length
        self.test_nrows = test_nrows
        # Only set the attribute when a prefix was supplied, so callers that
        # assign ``dm.prefix`` themselves keep working unchanged.
        if prefix is not None:
            self.prefix = prefix
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _read_split(self, split):
        """Read the CSV for ``split`` (``'train'``, ``'val'`` or ``'test'``).

        Raises:
            AttributeError: if no prefix has been configured.
        """
        prefix = getattr(self, 'prefix', None)
        if prefix is None:
            raise AttributeError(
                "POLUSADataModule.prefix is not set; pass prefix=... to the "
                "constructor or assign it before calling setup()"
            )
        return pd.read_csv(f'{self.data_dir}/{prefix}_center_{split}.csv', nrows=self.test_nrows)

    def tokenize(self, df):
        """Tokenize ``df['body']`` and bundle ids, masks and labels into a TensorDataset.

        Sequences are truncated to ``max_length`` and padded to the longest
        sequence in ``df`` (``padding=True``).
        """
        tokens = self.tokenizer(
            df['body'].tolist(),
            padding=True,
            max_length=self.max_length,
            truncation=True,
        )

        # convert the integer sequences to tensors
        seq = torch.tensor(tokens['input_ids'])
        mask = torch.tensor(tokens['attention_mask'])
        y = torch.tensor(df['label'].tolist())

        return TensorDataset(seq, mask, y)

    def setup(self, stage):
        """Load the data required for ``stage``.

        * ``"weight_calc"`` (custom stage): compute balanced class weights from
          the training labels and store them in ``self.weights``.
        * ``"fit"``: tokenize the train and validation splits.
        * ``"test"`` / ``"predict"``: tokenize the test split.

        Any other value leaves the module unchanged.
        """
        if stage == "weight_calc":
            df_train = self._read_split('train')
            class_weights = compute_class_weight(
                class_weight='balanced',
                classes=np.unique(df_train['label']),
                y=df_train['label'],
            )
            self.weights = torch.tensor(class_weights, dtype=torch.float)

        elif stage == "fit":
            self.train_data = self.tokenize(self._read_split('train'))
            self.val_data = self.tokenize(self._read_split('val'))

        elif stage == "test" or stage == "predict":
            self.test_data = self.tokenize(self._read_split('test'))

    def get_weights(self):
        """Return the class-weight tensor computed by ``setup("weight_calc")``.

        Raises:
            AttributeError: if the weights have not been computed yet.
        """
        if not hasattr(self, 'weights'):
            raise AttributeError('class weights are not available; call setup("weight_calc") first')
        return self.weights

    def _eval_loader(self, dataset):
        """Build an ordered dataloader that keeps every example of ``dataset``."""
        # drop_last=False: evaluation must cover the whole split, and predictions
        # must line up one-to-one with the rows of the source CSV.
        return DataLoader(dataset, drop_last=False, batch_size=self.batch_size)

    def train_dataloader(self):
        """Training loader: reshuffled every epoch, incomplete last batch dropped."""
        return DataLoader(self.train_data, shuffle=True, drop_last=True, batch_size=self.batch_size)

    def val_dataloader(self):
        """Validation loader over the full validation split."""
        return self._eval_loader(self.val_data)

    def test_dataloader(self):
        """Test loader over the full test split."""
        return self._eval_loader(self.test_data)

    def predict_dataloader(self):
        """Prediction loader; iterates the test split in its original order."""
        return self._eval_loader(self.test_data)