# Fine-tune BERT for personality classification

## Setup


In [None]:
!pip install -q -U watermark
!pip install -qq transformers
!pip install --quiet shap==0.39

!pip install tqdm
!pip install livelossplot --quiet
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

[K     |████████████████████████████████| 1.6 MB 5.0 MB/s 
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 68.0 MB/s 
[K     |████████████████████████████████| 120 kB 93.7 MB/s 
[K     |████████████████████████████████| 356 kB 5.1 MB/s 
[?25h  Building wheel for shap (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Python implementation: CPython
Python version       : 3.7.13
IPython version      : 7.9.0

numpy       : 1.21.6
pandas      : 1.3.5
torch       : 1.12.1+cu113
transformers: 4.21.2



In [None]:
#@title Setup & Config
from wordcloud import WordCloud
import transformers
from transformers import logging
logging.set_verbosity_error()
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import LongformerConfig, LongformerModel
from transformers import RobertaTokenizer
from transformers.utils.dummy_pt_objects import PreTrainedModel
import shap
import torch
from sklearn.model_selection import train_test_split
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

import pickle
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
import sklearn
import keras
from tensorflow.keras.layers import Conv2D, BatchNormalization, GlobalAveragePooling2D, \
Dense, Input, Activation, MaxPool2D
from tensorflow.keras import Model

import numpy as np
import pandas as pd
import re
import csv
# import preprocessor as p
import math
from torch.utils.data import TensorDataset, DataLoader

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm

from tensorflow import summary
import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast 
%load_ext tensorboard

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from livelossplot import PlotLosses

pd.options.display.max_colwidth = 1000
pd.set_option('display.expand_frame_repr', False)

import re
import imageio,glob

# --- Reproducibility -------------------------------------------------------
# Seed every random number generator used in this notebook (Python, NumPy,
# TensorFlow, PyTorch CPU and, when available, all CUDA devices) with the
# same value.
import random
seed = 0
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# --- Display / plotting defaults ------------------------------------------
%matplotlib inline
%config InlineBackend.figure_format='retina'
torch.set_printoptions(precision=3, sci_mode=False)

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom colour cycle installed as the seaborn default palette below.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8



# Device used by the dataset, models and loss functions defined below:
# the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

In [None]:
# declare global settings
# A batch size of 16 still fits on the GPU for BERT; 32 has not been tested yet.
# NOTE: `batch_size` is read as a global by `return_dataloader`.
batch_size = 16
# Checkpoint name passed to `BertModel.from_pretrained` / `BertTokenizer.from_pretrained`.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#DEVICE =torch.device('cpu')

# NOTE(review): DEVICE is assigned the same value in the setup cell as well;
# this second assignment is redundant unless the CPU override above is enabled.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

In [None]:
# Mount Google Drive (Colab only) and change into the project directory so
# that the relative paths used below (data/, plots/, pretrained_model/) resolve.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): the path is hard-coded to one user's Drive layout; the backslash
# presumably escapes the space for the `%cd` magic - TODO confirm.
PROJECT_PATH = 'drive/MyDrive/Colab\ Notebooks/application_project/personality-prediction'
%cd $PROJECT_PATH

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/application_project/personality-prediction


## Functions

In [None]:
def weights_init(m):
    """Initialise a single module in place; intended for ``Module.apply``.

    Modules are matched on their class name:

    * names containing ``'Linear'``: weight ~ N(0, 0.05), bias = 0
    * names containing ``'BatchNorm'``: weight ~ N(1, 0.02), bias = 0

    Every other module is left untouched. Parameters that do not exist
    (``Linear(bias=False)``, ``BatchNorm(affine=False)``) are skipped instead
    of raising.

    Args:
        m: the module to initialise.
    """
    classname = m.__class__.__name__
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)

    if classname.find('Linear') != -1:
        if weight is not None:
            torch.nn.init.normal_(weight, mean=0.0, std=0.05)
        # Linear(bias=False) stores ``bias`` as None.
        if bias is not None:
            torch.nn.init.zeros_(bias)
    elif classname.find('BatchNorm') != -1:
        # BatchNorm(affine=False) has neither a weight nor a bias.
        if weight is not None:
            torch.nn.init.normal_(weight, 1.0, 0.02)
        if bias is not None:
            torch.nn.init.zeros_(bias)

In [None]:
def visualize_layerwise_embeddings(hidden_states,masks,labels,epoch,title, acc,layers_to_visualize=[0,1,2,3,8,9,10,11]):
    """Plot 2-D t-SNE projections of mean-pooled hidden states, one panel per layer.

    For every requested layer the token embeddings of each sample are averaged
    over the positions selected by ``masks``, reduced to two dimensions with
    t-SNE and drawn as a scatter plot coloured by label. The figure is saved
    to ``plots/dim_reduction/{title}/{epoch}`` (PNG data, no file extension).

    Args:
        hidden_states: indexable collection of per-layer tensors; each entry is
            summed over ``dim=1`` and converted with ``.numpy()``, so it must be
            a CPU tensor (assumed shape (samples, tokens, hidden) - TODO confirm).
        masks: CPU tensor of per-token weights, one row per sample.
        labels: CPU tensor of labels; flattened to one value per sample.
        epoch: used in the figure title and as the output file name.
        title: used in the figure title and as the output sub-directory.
        acc: accuracy shown (rounded to 2 decimals) in the figure title.
        layers_to_visualize: indices into ``hidden_states``; never mutated here.
    """
    # Local imports: the notebook's import cell does not provide `os`.
    import math
    import os

    dim_reducer = TSNE(n_components=2, learning_rate = 'auto', init='pca')
    #dim_reducer = PCA(n_components=2)

    # Pure-Python replacement for the `!mkdir -p` shell magic.
    os.makedirs(os.path.join('plots', 'dim_reduction', str(title)), exist_ok=True)

    num_layers = len(layers_to_visualize)
    n_cols = 4
    # add_subplot needs integer grid sizes; round up so a partial last row still fits.
    n_rows = max(1, math.ceil(num_layers / n_cols))

    fig = plt.figure(figsize=(24, n_rows * 6)) #each subplot of size 6x6, each row will hold 4 plots
    ax = [fig.add_subplot(n_rows, n_cols, i + 1) for i in range(num_layers)]
    fig.suptitle(f"{title}, Epoch: {epoch}, Accuracy: {str(round(acc, 2))}")

    labels = labels.numpy().reshape(-1)
    # Guard against division by zero for rows whose mask is entirely zero.
    token_counts = masks.sum(dim=1, keepdim=True).clamp(min=1)

    for i,layer_i in enumerate(layers_to_visualize):
        layer_embeds = hidden_states[layer_i]

        # Masked mean: zero out masked-off positions before summing so the
        # numerator matches the token count used in the denominator.
        token_weights = masks.unsqueeze(-1).to(layer_embeds.dtype)
        layer_averaged_hidden_states = torch.div((layer_embeds * token_weights).sum(dim=1), token_counts)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            layer_dim_reduced_embeds = dim_reducer.fit_transform(layer_averaged_hidden_states.numpy())

        df = pd.DataFrame.from_dict({'x':layer_dim_reduced_embeds[:,0],'y':layer_dim_reduced_embeds[:,1],'label':labels})
        df.label = df.label.astype(int)

        sns.scatterplot(data=df,x='x',y='y',hue='label',ax=ax[i], palette="deep")
        ax[i].set_title(f"layer {layer_i+1}")

    # The figure is intentionally left open so the notebook still renders it inline.
    plt.savefig(f'plots/dim_reduction/{title}/{epoch}',format='png',pad_inches=0)

In [None]:
# https://www.kaggle.com/code/tanmay17061/transformers-bert-hidden-embeddings-visualization/notebook
# https://stackoverflow.com/questions/15638612/calculating-mean-and-standard-deviation-of-the-data-which-does-not-fit-in-memory
def prepare_visulaize(model, data_loader, epoch, trait_idx, hot_encoding, titel, acc, max_batches=15):
    """Collect hidden states for the first batches and plot them layer by layer.

    Puts ``model`` into eval mode, runs it without gradients on at most
    ``max_batches`` batches with ``output_hidden_states=True``, gathers the
    per-layer hidden states (the first returned entry is dropped), attention
    masks and labels on the CPU, and passes them to
    ``visualize_layerwise_embeddings``.

    Args:
        model: callable as ``model(input_ids, attention_mask,
            output_hidden_states=True)`` returning ``(logits, hidden_states)``.
        data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
        epoch: forwarded to the plot (title and file name).
        trait_idx: column of ``target`` to use as the label.
        hot_encoding: when true the label is one-hot encoded (2 classes) first.
        titel: forwarded to the plot as its title / output sub-directory.
        acc: accuracy value shown in the plot title.
        max_batches: number of batches to visualise (default 15, the value that
            was previously hard-coded).

    Note:
        The model is left in eval mode; the caller must switch back to training.
    """
    model.eval()  #toggle model in eval mode

    mask_batches, label_batches = [], []
    layer_batches = None  # one list of per-batch CPU tensors for every layer

    with torch.no_grad():
        for step, (author_ids, inputs_ids, attention_mask, target) in enumerate(data_loader):
            if step >= max_batches:
                # Stop instead of iterating the rest of the loader for nothing.
                break

            target = target[:,trait_idx]
            if hot_encoding:
                one_hot_encoding = tf.keras.utils.to_categorical(target, num_classes=2)
                target = torch.from_numpy(one_hot_encoding).float().to(DEVICE)
            else:
                target = target.to(DEVICE).unsqueeze(0).t()

            _, hidden_states = model(inputs_ids, attention_mask, output_hidden_states=True)
            hidden_states = hidden_states[1:]

            # Keep masks as float, as they were when concatenated onto a float buffer.
            mask_batches.append(attention_mask.cpu().float())
            label_batches.append(target.cpu().view(-1,1))

            if layer_batches is None:
                layer_batches = [[] for _ in hidden_states]
            for bucket, layer_hidden_states in zip(layer_batches, hidden_states):
                bucket.append(layer_hidden_states.cpu())

    if layer_batches is None:
        # Empty loader (or max_batches <= 0): nothing to plot.
        return

    # Concatenate once per layer instead of re-concatenating after every batch.
    train_hidden_states = tuple(torch.cat(bucket) for bucket in layer_batches)
    train_masks = torch.cat(mask_batches)
    train_ys = torch.cat(label_batches)

    visualize_layerwise_embeddings(train_hidden_states,train_masks,train_ys,epoch, titel, acc)

## Data Preprocessing

In [None]:
def preprocess_text(sentence):
    """Strip hyperlinks and ``|||`` separators, then collapse whitespace.

    Steps, in order:
      1. every ``http...`` run of non-whitespace characters becomes a space,
      2. every ``|||`` separator becomes a space,
      3. every run of whitespace is collapsed into a single space.

    Whitespace is collapsed last so that the spaces inserted by the first two
    steps cannot leave double spaces behind. Leading/trailing spaces are not
    stripped.

    Args:
        sentence: the raw text.

    Returns:
        The cleaned text.
    """
    # Alternative cleaning steps that were tried and are currently disabled:
    # sentence = p.clean(sentence)                              # hashtags, smileys, emojis
    # sentence = re.sub('[^a-zA-Z]', ' ', sentence)             # punctuation and numbers
    # sentence = re.sub('[^a-zA-Z.?!,]', ' ', sentence)
    # sentence = re.sub(r"\s+[a-zA-HJ-Z]\s+", ' ', sentence)    # single characters (except I)

    # Remove hyperlinks
    sentence = re.sub(r"http\S+", " ", sentence)
    # Remove the "|||" separators
    sentence = re.sub(r"\|\|\|", " ", sentence)
    # Removing multiple spaces
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


def load_essays_df(datafile):
    """Read the essays CSV into a DataFrame with binary trait labels.

    The first row of the file is treated as a header and skipped. For every
    remaining row, column 0 is the user id, column 1 the essay text and
    columns 2-6 the EXT, NEU, AGR, CON and OPN flags, where ``"y"`` (any case)
    maps to 1 and everything else to 0.

    Rows are collected in a list and the DataFrame is built once; appending to
    a DataFrame inside the loop is quadratic and ``DataFrame.append`` no longer
    exists in pandas 2.

    Args:
        datafile: path to the CSV file (comma separated, ``"`` as quote char).

    Returns:
        DataFrame with columns ``user, text, token_len, EXT, NEU, AGR, CON,
        OPN``; ``token_len`` is initialised to 0 for every row.
    """
    trait_columns = ["EXT", "NEU", "AGR", "CON", "OPN"]
    columns = ["user", "text", "token_len"] + trait_columns

    records = []
    with open(datafile, "rt") as csvf:
        csvreader = csv.reader(csvf, delimiter=",", quotechar='"')
        next(csvreader, None)  # skip the header row
        for line in csvreader:
            if not line:
                # csv.reader yields an empty list for blank lines.
                continue
            record = {"user": line[0], "text": line[1], "token_len": 0}
            for column_idx, trait in enumerate(trait_columns, start=2):
                record[trait] = 1 if line[column_idx].lower() == "y" else 0
            records.append(record)

    # Passing `columns` keeps the schema intact even when the file has no data rows.
    return pd.DataFrame(records, columns=columns)


def essays_embeddings(datafile, tokenizer, token_length, mode):
    """Tokenise every essay and return ids, attention masks and trait targets.

    The essays are loaded with ``load_essays_df``, their token counts (on the
    raw text) are stored in ``token_len`` and the frame is sorted by
    ``(token_len, user)``. The sorted user order is written to
    ``data/essays/author_id_order.csv`` and the mean token count is printed.
    All returned sequences follow that sorted order, so position ``k`` of
    every return value refers to the same essay.

    Args:
        datafile: path of the essays CSV.
        tokenizer: tokenizer providing ``tokenize`` and ``encode_plus``.
        token_length: padded/truncated sequence length. Ignored when ``mode``
            is a digit string, in which case ``int(mode)`` is used instead.
        mode: one of
            * a digit string, ``"normal"``, ``"512_head"``, ``"longformer"``:
              encode the essay from its start,
            * ``"512_tail"``: encode the last ``token_length - 2`` tokens,
            * ``"256_head_tail"``: encode the first and last
              ``token_length - 1`` tokens concatenated
              (NOTE(review): the concatenation can exceed ``token_length`` and
              is then truncated - confirm this is intended),
            * ``"docbert"``: split into up to 4 chunks of 512 tokens, encode
              each chunk and pad with all-zero rows to 4 rows.

    Returns:
        ``(author_ids, input_ids, attention_masks, targets)`` where
        ``author_ids`` is the array of DataFrame index labels, ``input_ids``
        and ``attention_masks`` are lists with one entry per essay and
        ``targets`` is a list of ``[EXT, NEU, AGR, CON, OPN]`` lists.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    head_modes = ("normal", "512_head", "longformer")
    other_modes = ("512_tail", "256_head_tail", "docbert")
    if not (mode.isdigit() or mode in head_modes or mode in other_modes):
        raise ValueError(f"unsupported mode: {mode!r}")

    if mode.isdigit():
        # A numeric mode is the sequence length itself.
        token_length = int(mode)

    def _encode(token_list):
        """Encode a token list to (input_ids, attention_mask) of length token_length."""
        encoding = tokenizer.encode_plus(
            token_list,
            add_special_tokens=True,
            max_length=token_length,
            # Explicit replacement for the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )
        return encoding['input_ids'], encoding['attention_mask']

    targets = []
    input_ids = []
    attention_masks = []
    trait_columns = ["EXT", "NEU", "AGR", "CON", "OPN"]

    df = load_essays_df(datafile)

    # sorting all essays in ascending order of their length
    for ind in df.index:
        df.at[ind, "token_len"] = len(tokenizer.tokenize(df.at[ind, "text"]))

    df = df.sort_values(by=["token_len", "user"], ascending=True)
    df["user"].to_csv("data/essays/author_id_order.csv", index_label="order")
    print(df["token_len"].mean())

    # Iterate over the sorted index labels (not range(len(df))) so that inputs
    # and targets line up with `author_ids`, which is taken from the same index.
    for ii in df.index:
        text = preprocess_text(df.at[ii, "text"])
        tokens = tokenizer.tokenize(text)

        if mode.isdigit() or mode in head_modes:
            ids, mask = _encode(tokens)
            input_ids.append(ids)
            attention_masks.append(mask)

        elif mode == "512_tail":
            ids, mask = _encode(tokens[-(token_length - 2) :])
            input_ids.append(ids)
            attention_masks.append(mask)

        elif mode == "256_head_tail":
            ids, mask = _encode(
                tokens[: (token_length - 1)] + tokens[-(token_length - 1) :]
            )
            input_ids.append(ids)
            attention_masks.append(mask)

        elif mode == "docbert":
            docmax_len = 2048
            subdoc_len = 512
            max_subdoc_num = docmax_len // subdoc_len
            subdoc_tokens = [
                tokens[i : i + subdoc_len] for i in range(0, len(tokens), subdoc_len)
            ][:max_subdoc_num]

            encoded = [_encode(x) for x in subdoc_tokens]
            # reshape keeps the arrays 2-D even when the essay has no tokens.
            token_ids = np.array([e[0] for e in encoded], dtype=int).reshape(-1, token_length)
            token_masks = np.array([e[1] for e in encoded], dtype=int).reshape(-1, token_length)

            # Pad with all-zero rows up to the fixed number of sub-documents.
            buffer_len = max_subdoc_num - token_ids.shape[0]
            filler = np.full(shape=(buffer_len, token_length), fill_value=0, dtype=int)

            input_ids.append(np.concatenate((token_ids, filler), axis=0))
            attention_masks.append(np.concatenate((token_masks, filler), axis=0))

        targets.append([df.at[ii, trait] for trait in trait_columns])

    author_ids = np.array(df.index)
    return author_ids, input_ids, attention_masks, targets

## Dataloader & dataset

In [None]:
# one target
class Bert_Dataset(Dataset):
    """Tensor-backed dataset of pre-tokenised essays and their trait labels.

    Args:
        author_ids: array-like of author identifiers, one per sample.
        input_ids: sequence of equally long token-id sequences.
        attention_masks: sequence of attention masks, aligned with ``input_ids``.
        targets: object exposing ``to_numpy()`` (e.g. a DataFrame of labels);
            stored as a float tensor.
    """

    def __init__(self, author_ids, input_ids, attention_masks, targets):
        # Convert every row first so a sequence of lists/arrays stacks into one array.
        ids_matrix = np.array([np.asarray(row) for row in input_ids])
        mask_matrix = np.array([np.asarray(row) for row in attention_masks])

        self.author_ids = torch.from_numpy(np.array(author_ids))
        self.input_ids = torch.from_numpy(ids_matrix)
        self.attention_masks = torch.from_numpy(mask_matrix)
        self.targets = torch.from_numpy(targets.to_numpy()).float()

    def __len__(self):
        return self.targets.shape[0]

    def __getitem__(self, idx):
        # Token ids and masks are moved to DEVICE here; author ids and targets
        # are returned as stored and moved by the training loops when needed.
        ids_on_device = self.input_ids[idx].to(DEVICE)
        mask_on_device = self.attention_masks[idx].to(DEVICE)
        return (self.author_ids[idx], ids_on_device, mask_on_device, self.targets[idx])

In [None]:
# base = 'drive/MyDrive/Colab Notebooks/application_project/personality-prediction/'
# The quoted block below is disabled code that re-tokenises the essays and
# saves the result; the arrays are loaded from previously saved .npy files instead.
'''
base = ''
token_length = 1024
#tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = token_length
#token_length = 512
mode = 'longformer'
datafile = base + "data/essays/essays.csv"
author_ids, input_ids, attention_masks, targets = essays_embeddings(
    datafile, tokenizer, token_length, mode
)
np.save('input_ids_distill_bert.npy', input_ids)
np.save('attention_masks_distill_bert.npy', attention_masks)
'''
# Pass the path to np.load so NumPy opens and closes the file itself;
# wrapping the path in open(...) left four file handles unclosed.
# NOTE(security): allow_pickle=True executes pickled code - only load files
# that were produced by this project.
author_ids = np.load('author_ids.npy', allow_pickle=True)
input_ids = np.load('input_ids.npy', allow_pickle=True)
attention_masks = np.load('attention_masks.npy', allow_pickle=True)
targets_arr = np.load('targets.npy', allow_pickle=True)

In [None]:
def return_dataloader(author_ids, input_ids, attention_masks, targets_arr,
                      test_size=0.1, val_size=0.15, small_subset_size=10):
    """Split the encoded essays and wrap the splits in DataLoaders.

    The data is split three times, in this order:

    1. a stratified hold-out split (``test_size``); the held-out part is
       currently discarded and never returned,
    2. a 50/50 unstratified split of the first ``small_subset_size`` remaining
       rows, giving a tiny train/val pair for smoke tests,
    3. a stratified train/validation split (``val_size``) of all remaining rows.

    The loaders use the notebook-level global ``batch_size``. No
    ``random_state`` is passed, so the splits depend on the global NumPy seed.

    Args:
        author_ids: per-essay identifiers.
        input_ids: per-essay token-id sequences.
        attention_masks: per-essay attention masks.
        targets_arr: per-essay labels with five columns
            (EXT, NEU, AGR, CON, OPN).
        test_size: fraction held out in step 1 (default 0.1).
        val_size: validation fraction in step 3 (default 0.15).
        small_subset_size: number of rows used for the small loaders (default 10).

    Returns:
        ``(train_dataloader, val_dataloader, train_dataloader_small,
        val_dataloader_small)``; only ``train_dataloader`` shuffles.
    """
    def _make_loader(inputs, targets, shuffle):
        """Build a Bert_Dataset from one split and wrap it in a DataLoader."""
        dataset = Bert_Dataset(
            inputs['author_ids'].to_numpy(),
            inputs['input_ids'].to_numpy(),
            inputs['attention_masks'].to_numpy(),
            targets,
        )
        return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)

    tokenized_df = pd.DataFrame(list(zip(author_ids, input_ids, attention_masks)),
                  columns =['author_ids', 'input_ids', 'attention_masks']).apply(np.asarray)
    target_df = pd.DataFrame(targets_arr, columns = ["EXT", "NEU", "AGR", "CON", "OPN"])

    # 1) hold-out split; the test part is not used any further
    df_inputs_train, _, df_targets_train, _ = train_test_split(
        tokenized_df, target_df, test_size=test_size, stratify=target_df
    )

    # 2) small subset for quick testing
    small_inputs = df_inputs_train.iloc[:small_subset_size,]
    small_targets = df_targets_train.iloc[:small_subset_size,]
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        small_inputs, small_targets, test_size=0.5
    )
    train_dataloader_small = _make_loader(inputs_train, targets_train, shuffle=False)
    val_dataloader_small = _make_loader(inputs_val, targets_val, shuffle=False)

    # 3) regular train / validation split
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        df_inputs_train, df_targets_train, test_size=val_size, stratify=df_targets_train
    )
    train_dataloader = _make_loader(inputs_train, targets_train, shuffle=True)
    val_dataloader = _make_loader(inputs_val, targets_val, shuffle=False)

    return train_dataloader, val_dataloader, train_dataloader_small, val_dataloader_small

#df_targets_test.value_counts()
#iter(train_dataloader_small).next()

In [None]:
#author_ids, inputs_ids, attention_mask, target = iter(train_dataloader_small).next()
#print(f'author_ids: \n{author_ids.cpu().numpy()}\ntargets: \n{target.cpu().numpy()[:,0]}\n inputs_ids: \n{inputs_ids.cpu().numpy()}')

## Model

In [None]:
class personet2output(nn.Module):
  """BERT encoder followed by an MLP head that ends in a softmax over ``n_classes``.

  Args:
    n_classes: size of the softmax output.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)

    # 768 -> 50 -> n_classes, probabilities over dim 1
    head_layers = [
      nn.Linear(768, 50),
      nn.ReLU(),
      nn.Dropout(0.1),
      nn.Linear(50, n_classes),
      nn.Softmax(dim=1),
    ]
    self.MLP = nn.Sequential(*head_layers)
    self.MLP.apply(weights_init)

    # To freeze the encoder, disable gradients on self.bert.parameters() here.

  def forward(self, input_ids, attention_mask):
    """Return the softmax output of the head for a batch.

    Args:
      input_ids: token ids passed to BERT.
      attention_mask: attention mask passed to BERT.
    """
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the BERT output is used as the sentence representation
    # (presumably the pooled [CLS] output - see the BertModel documentation).
    features = self.drop(encoded[1])
    return self.MLP(features)

class personet1output(nn.Module):
  """BERT encoder followed by an MLP head with a single raw (un-activated) output."""

  def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)

    # 768 -> 50 -> 1; no activation here, the training loops apply the sigmoid.
    head_layers = [
      nn.Linear(768, 50),
      nn.ReLU(),
      nn.Dropout(0.1),
      nn.Linear(50, 1),
    ]
    self.MLP = nn.Sequential(*head_layers)
    self.MLP.apply(weights_init)

    # To freeze the encoder, disable gradients on self.bert.parameters() here.

  def forward(self, input_ids, attention_mask, output_hidden_states=False):
    """Run the encoder and the head on a batch.

    Args:
      input_ids: token ids passed to BERT.
      attention_mask: attention mask passed to BERT.
      output_hidden_states: forwarded to BERT; when true, element 2 of the
        BERT output is returned alongside the head output.

    Returns:
      ``(output, hidden_states)`` where ``hidden_states`` is ``None`` unless
      ``output_hidden_states`` is true.
    """
    encoded = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      output_hidden_states=output_hidden_states,
    )
    hidden_states = encoded[2] if output_hidden_states else None

    # Element 1 of the BERT output is used as the sentence representation
    # (presumably the pooled [CLS] output - see the BertModel documentation).
    features = self.drop(encoded[1])
    return self.MLP(features), hidden_states

# predict number of subnets at once 
class allnet(nn.Module):
  """BERT encoder shared by several independent single-output MLP heads.

  The heads are stored in an ``nn.ModuleList`` so they are registered as
  sub-modules: they show up in ``parameters()`` / ``state_dict()`` and follow
  ``.to()``, ``.train()`` and ``.eval()`` of the parent model. (With a plain
  Python list none of that happened, which is why each head had to be moved
  to the device by hand.)

  Args:
    subnets: number of heads, i.e. number of output columns.
  """

  def __init__(self, subnets):
    super(allnet, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    #self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.drop = nn.Dropout(p=0.3)

    hidden_size = 50
    self.list_MLPS = nn.ModuleList(
      nn.Sequential(
        nn.Linear(768, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1),
      )
      for _ in range(subnets)
    )

    for head in self.list_MLPS:
      head.apply(weights_init)

  #@autocast()
  def forward(self, input_ids, attention_mask):
    """Return one raw output column per head, concatenated along dim 1.

    Args:
      input_ids: token ids passed to BERT.
      attention_mask: attention mask passed to BERT.
    """
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Element 1 of the BERT output is used as the shared sentence representation.
    features = self.drop(outputs[1])

    return torch.cat([head(features) for head in self.list_MLPS], 1)


# simple MLP same as was used in baseline paper
class MLP(nn.Module):
    """Two-layer perceptron (768 -> 50 -> ``n_classes``) ending in a softmax.

    Args:
        n_classes: size of the softmax output.
    """

    def __init__(self, n_classes):
        super().__init__()

        layers = [
            nn.Linear(768, 50),
            nn.ReLU(),
            nn.Linear(50, n_classes),
            nn.Softmax(dim=1),
        ]
        self.MLP = nn.Sequential(*layers)
        self.MLP.apply(weights_init)

    def forward(self, input):
        """Return the softmax output for a batch of 768-dimensional inputs."""
        return self.MLP(input)

In [None]:
def model_size(model):
    """Print the memory taken by a model's parameters and buffers, in MB.

    The size is the sum of ``nelement() * element_size()`` over all parameters
    and buffers, divided by 1024**2.

    Args:
        model: module providing ``parameters()`` and ``buffers()``.
    """
    tensors = list(model.parameters()) + list(model.buffers())
    total_bytes = sum(t.nelement() * t.element_size() for t in tensors)

    print('model size: {:.3f}MB'.format(total_bytes / 1024**2))

## Training

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, epoch, trait_idx, hot_encoding):
  """Train ``model`` for one pass over ``data_loader``.

  For every batch the selected trait column is used as target, the model
  output is passed through a sigmoid, ``loss_fn`` is evaluated on the
  probabilities and every optimizer in ``optimizer`` takes a step.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning
      ``(logits, hidden_states)``.
    data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
    loss_fn: loss applied to ``(probabilities, target)``.
    optimizer: iterable of optimizers; all are zeroed and stepped per batch.
    epoch: current epoch index (not used inside the function).
    trait_idx: column of ``target`` to train on.
    hot_encoding: when true the target is one-hot encoded (2 classes).

  Returns:
    ``(train_acc, mean_loss)``: binary accuracy in percent and the mean of the
    per-batch loss values.
  """
  model = model.train()  # the forward pass does not change the mode, so once is enough
  train_acc_metric = keras.metrics.BinaryAccuracy()
  sigmoid = nn.Sigmoid()
  losses = []

  for step, (author_ids, inputs_ids, attention_mask, target) in enumerate(data_loader):
      target = target[:,trait_idx]
      if hot_encoding:
          one_hot_encoding = tf.keras.utils.to_categorical(target, num_classes=2)
          target = torch.from_numpy(one_hot_encoding).float().to(DEVICE)
      else:
          # column vector (batch, 1) to match the model output
          target = target.to(DEVICE).unsqueeze(0).t()

      # basic training steps
      # (loop variable is not called `optim`, which would shadow torch.optim)
      for single_optimizer in optimizer:
          single_optimizer.zero_grad()

      logits, _ = model(inputs_ids, attention_mask)
      proba = sigmoid(logits)
      loss_value = loss_fn(proba, target)
      loss_value.backward()
      losses.append(loss_value.item())

      for single_optimizer in optimizer:
          single_optimizer.step()

      train_acc_metric.update_state(target.cpu(), proba.cpu().detach())

  # metrics gathering
  train_acc = (train_acc_metric.result() * 100).numpy()
  train_acc_metric.reset_states()

  return train_acc, np.mean(losses)

def val_epoch(model, data_loader, loss_fn, epoch, trait_idx, hot_encoding):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning
      ``(logits, hidden_states)``.
    data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
    loss_fn: loss applied to ``(probabilities, target)``.
    epoch: current epoch index (not used inside the function).
    trait_idx: column of ``target`` to evaluate.
    hot_encoding: when true the target is one-hot encoded (2 classes).

  Returns:
    ``(val_acc, mean_loss)``: binary accuracy in percent and the mean of the
    per-batch loss values.
  """
  model = model.eval()
  accuracy = keras.metrics.BinaryAccuracy()
  to_proba = nn.Sigmoid()
  batch_losses = []

  for _, inputs_ids, attention_mask, target in data_loader:
      labels = target[:,trait_idx]
      if not hot_encoding:
          # column vector (batch, 1) to match the model output
          labels = labels.to(DEVICE).unsqueeze(0).t()
      else:
          labels = torch.from_numpy(
              tf.keras.utils.to_categorical(labels, num_classes=2)
          ).float().to(DEVICE)

      with torch.no_grad():
          logits, _ = model(inputs_ids, attention_mask)
          proba = to_proba(logits)
          batch_losses.append(loss_fn(proba, labels).item())
          accuracy.update_state(labels.cpu(), proba.cpu())

  # metrics gathering
  val_acc = (accuracy.result() * 100).numpy()
  return val_acc, np.mean(batch_losses)

In [None]:
def predict(model, data_loader, loss_fn, epoch, trait_idx, hot_encoding):
  """Run ``model`` over ``data_loader`` and collect per-sample predictions.

  Accuracy and mean loss over all batches are printed at the end.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning
      ``(logits, hidden_states)``.
    data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
    loss_fn: loss applied to ``(probabilities, target)``.
    epoch: not used inside the function.
    trait_idx: column of ``target`` to evaluate.
    hot_encoding: when true the target is one-hot encoded (2 classes).

  Returns:
    A list of three lists: author ids, sigmoid probabilities and targets, in
    the order the loader produced them.
  """
  model = model.eval()
  accuracy = keras.metrics.BinaryAccuracy()
  to_proba = nn.Sigmoid()
  batch_losses = []
  collected_ids, collected_proba, collected_targets = [], [], []

  for author_ids, inputs_ids, attention_mask, target in data_loader:
      labels = target[:,trait_idx]
      if not hot_encoding:
          # column vector (batch, 1) to match the model output
          labels = labels.to(DEVICE).unsqueeze(0).t()
      else:
          labels = torch.from_numpy(
              tf.keras.utils.to_categorical(labels, num_classes=2)
          ).float().to(DEVICE)

      with torch.no_grad():
          logits, _ = model(inputs_ids, attention_mask)
          proba = to_proba(logits)

          collected_ids.extend(author_ids.cpu().tolist())
          collected_proba.extend(proba.cpu().tolist())
          collected_targets.extend(labels.cpu().tolist())

          batch_losses.append(loss_fn(proba, labels).item())
          accuracy.update_state(labels.cpu(), proba.cpu())

  # metrics gathering
  val_acc = (accuracy.result() * 100).numpy()
  print(f'val_acc  : {val_acc}')
  print(f'val_loss : {np.mean(batch_losses)}')

  return [collected_ids, collected_proba, collected_targets]

In [None]:
# TensorBoard writer with an empty run-name suffix.
# NOTE(review): `writer` is not referenced by the training code visible in this
# notebook section - confirm whether it is still needed.
writer = SummaryWriter(comment="")
# Disabled alternative: TensorFlow summary writer logging to a time-stamped directory.
#current_time = str(datetime.datetime.now().now().strftime("%Y%m%d-%H%M%S"))
#log_dir = 'logs/tensorboard/' + current_time
#summary_writer = summary.create_file_writer(log_dir)

def full_training(train_dataloader, val_dataloader, trait, trait_idx, hp):
    """Fine-tune a fresh ``personet1output`` model on one trait and save it.

    BERT and the MLP head get separate optimizers (AdamW / Adam) with their
    own learning rates. After the last epoch the weights are written to
    ``pretrained_model/pretrained_one_class/{trait}.bin``; the directory is
    created when missing.

    Args:
        train_dataloader: loader used for training.
        val_dataloader: loader used for validation after every epoch.
        trait: trait name, used for the checkpoint file name.
        trait_idx: column of the target tensor belonging to ``trait``.
        hp: hyper-parameter dict; the keys ``epochs``, ``lr_mlp``, ``lr_bert``,
            ``wd`` and ``hot_encoding`` are read.

    Returns:
        dict with the per-epoch lists ``train_acc``, ``train_loss``,
        ``val_acc`` and ``val_loss``.
    """
    # Local import: the notebook's import cell does not provide `os`.
    import os

    print(f'trait_idx: {trait_idx}')
    # hyperparameter
    n_epochs = hp['epochs']
    lr_mlp = hp['lr_mlp']
    lr_bert = hp['lr_bert']
    wd = hp['wd']
    hot_encoding = hp['hot_encoding']

    # model, optimizer, loss
    model = personet1output().to(DEVICE)
    optimizer_bert = torch.optim.AdamW(model.bert.parameters(), lr=lr_bert, weight_decay = wd)
    optimizer_mlp = torch.optim.Adam(model.MLP.parameters(), lr=lr_mlp, eps=1e-07, weight_decay = wd)
    optimizer = [optimizer_bert, optimizer_mlp]
    loss_fn = nn.BCELoss(reduction='sum').to(DEVICE)

    checkpoint_path = f'pretrained_model/pretrained_one_class/{trait}.bin'

    # metrics
    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []

    # training loop
    for epoch in range(n_epochs):
        # Optional visualisation hooks:
        #prepare_visulaize(model, train_dataloader, epoch, trait_idx, hot_encoding, 'Train', train_acc)
        #prepare_visulaize(model, val_dataloader, epoch, trait_idx, hot_encoding, 'Val', val_acc)

        train_acc, train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, epoch, trait_idx, hot_encoding)
        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)

        val_acc, val_loss = val_epoch(model, val_dataloader, loss_fn, epoch, trait_idx, hot_encoding)
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        # Save after the final epoch. This used to be hard-coded to epoch index 4,
        # which only matched a 5-epoch run.
        if epoch == n_epochs - 1:
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            print(f'Save model, val acc: {val_acc}')
            print(checkpoint_path)
            torch.save(model.state_dict(), checkpoint_path)

    print('')
    print(f'Final: ')
    if train_acc_list:  # max() would raise on an empty history (epochs == 0)
        train_acc_best = max(train_acc_list)
        print(f'Best train acc: {train_acc_best}, from: {train_acc_list}')
        val_acc_best = max(val_acc_list)
        print(f'Best val  acc: {val_acc_best}, from: {val_acc_list}')

    return {
        'train_acc': train_acc_list,
        'train_loss': train_loss_list,
        'val_acc': val_acc_list,
        'val_loss': val_loss_list,
    }

In [None]:
# Driver cell: tokenise the essays and fine-tune one model per selected trait.
# NOTE: `tokenizer`, `val_dataloader` and the other names assigned inside the
# loop stay defined at notebook level and are reused by later cells.
trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
# Sequence lengths to try; each entry is passed to essays_embeddings as `mode`.
token_length_list = ['512']
# Indices into trait_labels that are actually trained (4 -> "OPN").
to_do = [4]

hyperparameter = {
    'epochs': 5,
    'lr_mlp': 1e-4,
    'lr_bert': 2e-5,
    'batch_size': 16,
    'n_classes': 2,
    'hidden_dim': 768,
    'wd': 1e-6,
    'hot_encoding': False,
}

for trait_idx, trait in enumerate(trait_labels):
    if trait_idx not in to_do:
        continue
    for token_length in token_length_list:
        base = ''
        # Placeholder only: a numeric `mode` overrides the length inside essays_embeddings.
        token_l = 0
        tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        mode = token_length
        datafile = base + "data/essays/essays.csv"
        # NOTE(review): tokenisation does not depend on the trait, so it is
        # repeated for every selected trait - consider hoisting it.
        author_ids, input_ids, attention_masks, targets = essays_embeddings(
            datafile, tokenizer, token_l, mode
        )

        train_dataloader, val_dataloader, train_dataloader_small, val_dataloader_small = return_dataloader(author_ids, input_ids, attention_masks, targets)

        print(f'token_length: {token_length}')
        full_training(train_dataloader, val_dataloader, trait, trait_idx, hyperparameter)


    # predict
    #model = personet1output().to(DEVICE)
    #model.load_state_dict(torch.load(f'pretrained_model/pretrained_one_class/{trait}_1_output.bin'))

787.5030401297122


The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


token_length: 512
trait_idx: 4
Save model, val acc: 60.960960388183594
pretrained_model/pretrained_one_class/OPN.bin

Final: 
Best train acc: 83.99575805664062, from: [48.966614, 57.021725, 64.65289, 73.39693, 83.99576]
Best val  acc: 63.96396255493164, from: [51.95195, 60.96096, 60.66066, 63.963963, 60.96096]


In [None]:
# Reload the saved OPN model and score the validation loader created by the
# training cell above.
model = personet1output().to(DEVICE)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('pretrained_model/pretrained_one_class/OPN.bin', map_location=DEVICE))
hot_encoding = False
trait_idx = 4  # index of "OPN" in trait_labels
loss_fn = nn.BCELoss(reduction='sum').to(DEVICE)
predictions = predict(model, val_dataloader, loss_fn, 0, trait_idx, hot_encoding)

val_acc  : 60.960960388183594
val_loss : 12.738883222852435


In [None]:
# Turn the three parallel lists returned by `predict` into one table with a
# row per sample, add the rounded prediction and sort by probability.
df = (
    pd.DataFrame(predictions)
    .T
    .rename(columns={0: "author_ids", 1: "probabilities", 2: 'targets'})
    .assign(
        author_ids=lambda frame: frame['author_ids'].astype(int),
        # probabilities and targets arrive as one-element lists; unwrap them
        probabilities=lambda frame: frame['probabilities'].apply(lambda x: x[0]),
        targets=lambda frame: frame['targets'].apply(lambda x: x[0]),
        # evaluated after `probabilities` above, so it sees the unwrapped floats
        predictions=lambda frame: frame['probabilities'].apply(np.round),
    )
    .sort_values(by=['probabilities'])
)
#df.to_csv('predictions/bert_one_class_opn')
df

Unnamed: 0,author_ids,probabilities,targets,predictions
209,75,0.065019,1.0,0.0
89,476,0.065069,0.0,0.0
137,373,0.065114,0.0,0.0
214,901,0.065345,0.0,0.0
46,1622,0.065476,0.0,0.0
...,...,...,...,...
276,1709,0.952791,0.0,1.0
278,1267,0.953275,1.0,1.0
44,1968,0.956046,0.0,1.0
87,533,0.956334,1.0,1.0


In [None]:
def score_and_visualize(text):
  """Print the model's probability for `text` and render a SHAP text explanation.

  Uses the notebook-level `tokenizer`, `model` and `DEVICE`.

  Args:
    text: a single input string.
  """
  print(text)

  sigmoid = nn.Sigmoid()
  # Inference only: put the model in eval mode so repeated SHAP evaluations of
  # the same string give the same score.
  model.eval()

  def _predict_proba(texts):
    """Score a batch of strings; this is the callable SHAP perturbs and re-evaluates."""
    encoding = tokenizer(
                    [str(t) for t in texts],
                    add_special_tokens=True,
                    max_length=512,
                    # replaces the deprecated pad_to_max_length=True
                    padding='max_length',
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt',
                )
    input_ids = encoding['input_ids'].to(DEVICE)
    attention_mask = encoding['attention_mask'].to(DEVICE)
    with torch.no_grad():
      # NOTE(review): assumes the model returns a (logits, extra) pair of raw
      # logits, as the original unpacking and sigmoid imply — confirm for the
      # model that is loaded when this is called.
      logits, _ = model(input_ids, attention_mask)
    return sigmoid(logits).detach().cpu().numpy()

  proba = _predict_proba([text])
  print(proba)

  # The Text masker tells SHAP how to hide tokens of the input string.
  explainer = shap.Explainer(_predict_proba, shap.maskers.Text(tokenizer))
  shap_values = explainer([text])

  shap.plots.text(shap_values)

In [None]:
#pytorch for all traits
# Fine-tune `allnet`: its encoder (model.bert) and its MLP heads (model.list_MLPS)
# get separate optimizers so they can use different learning rates.
# `train_dataloader` / `val_dataloader` come from an earlier cell.
#trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
trait_labels = ["EXT", "OPN"]

writer = SummaryWriter(comment="")

# training
epochs = 8
# hyperparameter
lr_mlp = 1e-4
lr_bert = 1e-5
batch_size = 4
n_classes = 2
hidden_dim = 768
wd = 1e-6
hot_encoding = False
# NOTE(review): -1 presumably means "no single trait selected" — confirm against
# train_epoch_all / val_epoch_all.
trait_idx = -1

# model optimizer loss
model = allnet(2).to(DEVICE)
model_size(model)

optimizer_bert = torch.optim.AdamW(model.bert.parameters(), lr=lr_bert, weight_decay = wd)
# One parameter group per MLP head. The comprehension variable is named `head`
# (not `model`) so it cannot be confused with the network it iterates over.
params = [{'params': head.parameters()} for head in model.list_MLPS]
optimizer_mlp = torch.optim.Adam(params, lr=lr_mlp, eps=1e-07, weight_decay = wd)
optimizer = [optimizer_bert, optimizer_mlp]
loss_fn = nn.BCEWithLogitsLoss(reduction='sum').to(DEVICE)

# metrics
train_loss_list = []
train_acc_list = []
val_loss_list = []
val_acc_list = []

# training loop
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}')

    # training pass
    train_acc, train_loss = train_epoch_all(model, train_dataloader, loss_fn, optimizer, epoch, trait_idx, hot_encoding)
    train_acc_list.append(train_acc)
    train_loss_list.append(train_loss)

    # validation pass
    val_acc, val_loss = val_epoch_all(model, val_dataloader, loss_fn, epoch, trait_idx, hot_encoding)
    val_acc_list.append(val_acc)
    val_loss_list.append(val_loss)

    # val_acc was just appended, so this fires when the current epoch ties or
    # beats every earlier one and is at least 50%.
    if val_acc >= max(val_acc_list) and val_acc >= 50.0:
        print('Save model')
        # NOTE: checkpointing is disabled here, so nothing is written despite the message.
        #torch.save(model.state_dict(), f'pretrained_model/pretrained_one_class/{trait}_1_output.bin')

print('')
print('Final: ')
train_acc_best = max(train_acc_list)
print(f'Best train acc: {train_acc_best}, from: {train_acc_list}')
val_acc_best = max(val_acc_list)
print(f'Best val  acc: {val_acc_best}, from: {val_acc_list}')

model size: 417.649MB
Epoch: 1
train_acc : [50.236584, 54.064266]
val_acc   : [56.43315, 58.653843]
Save model
Epoch: 2
train_acc : [51.73023, 61.790253]
val_acc   : [54.945057, 56.13553]
Epoch: 3
train_acc : [55.70268, 67.680084]
val_acc   : [54.71612, 64.90385]
Save model
Epoch: 4
train_acc : [56.380646, 74.18785]
val_acc   : [56.13553, 62.225277]
Epoch: 5
train_acc : [57.817795, 86.705505]
val_acc   : [56.799454, 61.92765]
Epoch: 6
train_acc : [61.373585, 92.21398]
val_acc   : [56.272896, 58.356224]
Epoch: 7
train_acc : [65.86158, 97.192795]
val_acc   : [56.57051, 60.07326]
Epoch: 8
train_acc : [70.85098, 98.62288]
val_acc   : [55.608974, 60.737183]

Final: 
Best train acc: 84.73767852783203, from: [52.146263, 56.75676, 61.685215, 65.28882, 72.25755, 76.78855, 81.53153, 84.73768]
Best val  acc: 59.75975799560547, from: [57.507507, 55.555557, 59.759758, 59.15916, 59.30931, 57.20721, 58.25826, 58.10811]


In [None]:
#pytorch for all traits
# Small end-to-end run of `personet` on the first `test_on` rows of the training
# frames, logging to TensorBoard.
trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]

# train_epoch_all_traits reads the global `summary_writer` created here.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/tensorboard/' + current_time
summary_writer = summary.create_file_writer(log_dir)

# training
n_splits = 10
epochs = 10
# hyperparameter
lr_mlp = 1e-4
lr_bert = 2e-5
batch_size = 16
n_classes = 2
hidden_dim = 768
wd = 1e-6


# testing: only the first `test_on` rows are used
test_on = 10
inputs = df_inputs_train.iloc[:test_on]
targets = df_targets_train.iloc[:test_on]


# split
inputs_train, inputs_val, targets_train, targets_val = train_test_split(inputs, targets, test_size=0.15, stratify=targets)

# dataloader
train_dataset = Bert_Dataset(inputs_train['input_ids'].to_numpy(), inputs_train['attention_masks'].to_numpy(), targets_train)
val_dataset = Bert_Dataset(inputs_val['input_ids'].to_numpy(), inputs_val['attention_masks'].to_numpy(), targets_val)
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size = batch_size)

# model optimizer loss: separate optimizers give the encoder and the MLP their own learning rates
model = personet(n_classes).to(DEVICE)
optimizer_bert = torch.optim.AdamW(model.bert.parameters(), lr=lr_bert, weight_decay = wd)
optimizer_mlp = torch.optim.Adam(model.MLP.parameters(), lr=lr_mlp, eps=1e-07, weight_decay = wd)
optimizer = [optimizer_bert, optimizer_mlp]
loss_fn = nn.BCELoss(reduction='sum').to(DEVICE)

# metrics
train_loss_list = []
train_acc_list = []
val_loss_list = []
val_acc_list = []

# training loop
for epoch in range(epochs):

    print(f'Epoch: {epoch + 1}')

    # training pass
    train_acc, train_loss = train_epoch_all_traits(model, train_dataloader, loss_fn, optimizer, epoch)
    train_acc_list.append(train_acc)
    train_loss_list.append(train_loss)

    # validation pass
    val_acc, val_loss = val_epoch_all_traits(model, val_dataloader, loss_fn, epoch)
    val_acc_list.append(val_acc)
    val_loss_list.append(val_loss)


print('')
print('Final: ')
train_acc_best = max(train_acc_list)
print(f'Best train acc: {train_acc_best}, from these: {train_acc_list}')
val_acc_best = max(val_acc_list)
print(f'Best val  acc: {val_acc_best}, from these: {val_acc_list}')

Epoch: 1
Epoch: 2


In [None]:
# %%time
# Train a fresh SentimentClassifier per trait and record the best validation
# accuracy of each. `data_loader_train` / `data_loader_val` come from an earlier cell.

history = defaultdict(list)
EPOCHS = 10

best_acc = {}
# tqdm wraps the list (not the enumerate object) so the bar knows the total.
for trait_idx, trait in enumerate(tqdm(["EXT", "NEU", "AGR", "CON", "OPN"])):

  # model
  model = SentimentClassifier(n_classes = 2)
  model = model.to(DEVICE)

  # separate optimizers: the encoder and the MLP use different learning rates
  optimizer_bert = torch.optim.AdamW(model.bert.parameters(), lr=2e-5)
  optimizer_mlp = torch.optim.Adam(model.MLP.parameters(), lr=5e-4)
  optimizers = [optimizer_bert, optimizer_mlp]

  total_steps = len(data_loader_train) * EPOCHS

  # linear decay (no warmup) is attached to the encoder optimizer only
  scheduler = get_linear_schedule_with_warmup(
    optimizer_bert,
    num_warmup_steps=0,
    num_training_steps=total_steps
  )
  loss_fn = nn.BCELoss().to(DEVICE)

  train_loss_list = []
  train_acc_list = []
  val_loss_list = []
  val_acc_list = []
  for epoch in range(EPOCHS):

    # NOTE(review): these calls pass 8 and 6 arguments, while the train_epoch /
    # eval_model defined further down this notebook accept 5 and 4. This cell
    # relies on a different definition being active — confirm execution order.
    train_acc, train_loss = train_epoch(model, data_loader_train, loss_fn,
                                        optimizers, DEVICE, scheduler,
                                        len(data_loader_train), trait_idx
                                        )

    val_acc, val_loss = eval_model(model, data_loader_val, loss_fn,
                                    DEVICE, len(data_loader_val), trait_idx
                                  )

    train_acc_list.append(train_acc)
    train_loss_list.append(train_loss)
    val_acc_list.append(val_acc)
    val_loss_list.append(val_loss)

  print(f'Train loss: {train_loss_list}')
  print(f'Val   loss: {val_loss_list}')
  print(f'Train accuracy: {train_acc_list}')
  print(f'Val   accuracy: {val_acc_list}')

  best_acc[trait] = np.amax(val_acc_list)

print(best_acc)

0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

1it [17:27, 1047.58s/it]

Train loss: [0.7174489311873913, 0.728443194180727, 0.6808435693383217, 0.6695068795233965, 0.659148421138525, 0.6243377160280943, 0.5829568188637495, 0.5489284712821245, 0.5171634368598461, 0.5117265358567238]
Val   loss: [0.8292552194149374, 0.7403159034337928, 0.7494856226358483, 0.8856332049095373, 0.9812030301248427, 0.7822692169131135, 0.7867868722771569, 0.8179814742623474, 0.8120708201857779, 0.8602678201181425]
Train accuracy: [0.5065104166666666, 0.5182291666666666, 0.5533854166666666, 0.5989583333333334, 0.6822916666666666, 0.6783854166666666, 0.72265625, 0.7096354166666666, 0.7434895833333334, 0.7708333333333334]
Val   accuracy: [0.48218732706142786, 0.48488516878804655, 0.48713337022689546, 0.5024211400110681, 0.505672385168788, 0.5213060320973991, 0.5280506364139458, 0.5186081903707803, 0.529849197565025, 0.5267017155506365]


In [None]:
def _frame_sort_key(path):
    """Order frames numerically when the file stem is a number (so 10 follows 9,
    not 1); any other name sorts after the numeric ones, alphabetically."""
    stem = path.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    if stem.isdigit():
        return (0, int(stem), path)
    return (1, 0, path)


# Build one animated GIF per split from the frames saved under plots/dim_reduction/.
for frame_glob, gif_path in [
    ('plots/dim_reduction/Train/*', 'plots/dim_reduction/train.gif'),
    ('plots/dim_reduction/Val/*', 'plots/dim_reduction/val.gif'),
]:
    train_images = []
    for filename in sorted(glob.glob(frame_glob), key=_frame_sort_key):
        print('appending for',filename)
        train_images.append(imageio.imread(filename))
    # NOTE(review): duration grows with the number of frames (len/2); confirm that
    # is the intended timing.
    imageio.mimsave(gif_path, train_images,duration=len(train_images)/2)

appending for plots/dim_reduction/Train/0
appending for plots/dim_reduction/Train/1
appending for plots/dim_reduction/Train/2
appending for plots/dim_reduction/Train/3
appending for plots/dim_reduction/Train/4
appending for plots/dim_reduction/Val/0
appending for plots/dim_reduction/Val/1
appending for plots/dim_reduction/Val/2
appending for plots/dim_reduction/Val/3
appending for plots/dim_reduction/Val/4


In [None]:
class MLPDataset(Dataset):
    """Dataset over two NumPy arrays, stored as float tensors and served on DEVICE."""

    def __init__(self, inputs, targets):
        # Convert each array once up front; .float() casts to float32.
        feature_tensor = torch.from_numpy(inputs)
        self.inputs = feature_tensor.float()
        target_tensor = torch.from_numpy(targets)
        self.targets = target_tensor.float()

    def __len__(self):
        # The number of samples is taken from the targets.
        return len(self.targets)

    def __getitem__(self, idx):
        # Each item is moved to DEVICE at access time.
        sample = self.inputs[idx].to(DEVICE)
        label = self.targets[idx].to(DEVICE)
        return (sample, label)

In [None]:
def train_epoch_all_traits(model, data_loader, loss_fn, optimizer, epoch):
  """Train `model` for one epoch and log per-batch loss/accuracy to TensorBoard.

  Logging goes through the notebook-level `summary_writer`.

  Args:
    model: network called as model(inputs_ids, attention_mask).
    data_loader: sized iterable of (inputs_ids, attention_mask, target) batches.
    loss_fn: callable loss_fn(outputs, target) returning a scalar tensor.
    optimizer: iterable of optimizers; each is zeroed and stepped on every batch.
    epoch: zero-based epoch index, used only to compute the TensorBoard step.

  Returns:
    Tuple (accuracy over the epoch in percent, mean of the per-batch losses).
  """
  model = model.train()
  train_acc_metric = keras.metrics.BinaryAccuracy()
  local_acc_metric = keras.metrics.BinaryAccuracy()
  losses = []
  n_batches = len(data_loader)
  # Iterate the loader that was passed in, not a notebook global.
  for step, (inputs_ids, attention_mask, target) in enumerate(data_loader):
      # basic training steps
      for optim in optimizer:
          optim.zero_grad()
      logits = model(inputs_ids, attention_mask)
      loss_value = loss_fn(logits, target)
      loss_value.backward()
      losses.append(loss_value.item())
      for optim in optimizer:
          optim.step()

      batch_targets = target.cpu()
      batch_outputs = logits.cpu().detach()
      train_acc_metric.update_state(batch_targets, batch_outputs)
      # The local metric is reset every batch so it reports this batch alone.
      local_acc_metric.update_state(batch_targets, batch_outputs)
      local_acc = local_acc_metric.result() * 100
      local_acc_metric.reset_states()

      # Running batch index across epochs: unique and strictly increasing.
      global_step = epoch * n_batches + step
      with summary_writer.as_default():
          tf.summary.scalar('train_loss', loss_value.item(), step=global_step)
          tf.summary.scalar('train_acc', local_acc, step=global_step)

  # metrics gathering
  train_acc = (train_acc_metric.result() * 100).numpy()
  train_acc_metric.reset_states()

  return train_acc, np.mean(losses)

def val_epoch_all_traits(model, data_loader, loss_fn, epoch):
  """Evaluate `model` on every batch of `data_loader` without tracking gradients.

  Args:
    model: network called as model(inputs_ids, attention_mask).
    data_loader: iterable of (inputs_ids, attention_mask, target) batches.
    loss_fn: callable loss_fn(outputs, target) returning a scalar tensor.
    epoch: accepted for symmetry with train_epoch_all_traits; not used here.

  Returns:
    Tuple (accuracy in percent, mean of the per-batch losses).
  """
  model = model.eval()
  val_acc_metric = keras.metrics.BinaryAccuracy()
  losses = []

  with torch.no_grad():
      # Iterate the loader that was passed in, not a notebook global.
      for inputs_ids, attention_mask, target in data_loader:
          val_logits = model(inputs_ids, attention_mask)
          loss_value = loss_fn(val_logits, target)
          losses.append(loss_value.item())
          val_acc_metric.update_state(target.cpu(), val_logits.cpu())

  # metrics gathering
  val_acc = (val_acc_metric.result() * 100).numpy()

  return val_acc, np.mean(losses)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, n_examples):
  """Run one optimisation pass over `data_loader`.

  Each batch is indexable: element 0 holds the model inputs, element 1 the
  targets. `n_examples` is accepted but not used.

  Returns:
    Tuple (binary accuracy in percent, mean of the per-batch losses).
  """
  model = model.train()
  accuracy_metric = tf.keras.metrics.BinaryAccuracy()
  batch_losses = []

  for batch in data_loader:
    features, labels = batch[0], batch[1]
    predictions = model(features)

    batch_loss = loss_fn(predictions, labels)
    batch_losses.append(batch_loss.item())
    batch_loss.backward()

    accuracy_metric.update_state(labels.cpu(), predictions.cpu().detach())

    # Gradients are cleared after each step rather than before the backward pass.
    optimizer.step()
    optimizer.zero_grad()

  accuracy_pct = accuracy_metric.result().numpy() * 100
  accuracy_metric.reset_state()

  return accuracy_pct, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, n_examples):
  """Score `model` on `data_loader` with gradient tracking switched off.

  Each batch is indexable: element 0 holds the model inputs, element 1 the
  targets. `n_examples` is accepted but not used.

  Returns:
    Tuple (binary accuracy in percent, mean of the per-batch losses).
  """
  model = model.eval()
  accuracy_metric = tf.keras.metrics.BinaryAccuracy()
  batch_losses = []

  with torch.no_grad():
    for batch in data_loader:
      features, labels = batch[0], batch[1]
      predictions = model(features)

      batch_loss = loss_fn(predictions, labels)
      accuracy_metric.update_state(labels.cpu(), predictions.cpu().detach())
      batch_losses.append(batch_loss.item())

  accuracy_pct = accuracy_metric.result().numpy() * 100
  accuracy_metric.reset_state()

  return accuracy_pct, np.mean(batch_losses)

In [None]:
def get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer, n_hl=12):
    """Read pickled per-batch features and targets and flatten them for training.

    The file read is ``inp_dir + dataset-embed-embed_mode-mode.pkl``. It must
    contain an iterable of ``(author_ids, hidden_features, targets)`` tuples, one
    per mini-batch, where ``hidden_features`` has the layer axis first
    (``n_hl`` x samples x features).

    Args:
        inp_dir: directory prefix, concatenated as-is (include the trailing slash).
        dataset, embed, embed_mode, mode: file-name components.
        layer: ``"all"`` to average all layers with equal weight, otherwise the
            1-based number of the single layer to select.
        n_hl: number of layers stored along the first axis of each feature block.

    Returns:
        Tuple ``(inputs, full_targets)`` of NumPy arrays with one row per sample.
    """
    path = inp_dir + dataset + "-" + embed + "-" + embed_mode + "-" + mode + ".pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files you produced.
    # The context manager closes the file even if unpickling fails.
    with open(path, "rb") as file:
        data = pickle.load(file)
    _, data_x, data_y = list(zip(*data))  # all_author_ids, hidden_features, all_targets

    # alphaW holds the per-layer weights that pick which layer embedding is used
    if layer == "all":
        alphaW = np.full([n_hl], 1 / n_hl)

    # standard is 11
    else:
        alphaW = np.zeros([n_hl])
        alphaW[int(layer) - 1] = 1

    # Collapse the layer axis of every mini-batch with alphaW and concatenate the
    # mini-batches into flat per-sample lists.
    inputs = []
    targets = []
    for batch_x, batch_y in zip(data_x, data_y):
        inputs.extend(np.einsum("k,kij->ij", alphaW, batch_x))
        targets.extend(batch_y)

    return np.array(inputs), np.array(targets)

# Load the pickled essays features (layer selector 11) and the full target array
# from pkl_data/essays-bert-base-cls-normal.pkl.
inputs, full_targets = get_inputs('pkl_data/', 'essays', 'bert-base', 'cls', 'normal', 11)

In [None]:
from keras import backend as K

def acc(y_true, y_pred):
    """Element-wise hit indicator: 1.0 where the rounded prediction equals
    `y_true`, 0.0 elsewhere, in the backend float type."""
    # Rounding is idempotent, so a single round gives the same values.
    rounded = K.round(y_pred)
    hits = rounded == y_true
    hits_as_int = K.cast(hits, 'int32')
    return tf.cast(hits_as_int, K.floatx())

In [None]:
# keras
def training(dataset, inputs, full_targets):
    """Train an MLP for each trait with 10-fold stratified cross-validation.

    Reads the notebook-level globals `n_classes`, `hidden_dim`, `lr`, `epochs`
    and `batch_size`.

    Args:
        dataset: "kaggle" selects the four-letter trait labels; anything else
            selects the five Big-Five labels.
        inputs: feature array indexed by sample along the first axis.
        full_targets: 2-D array with one column of class labels per trait.

    Returns:
        DataFrame with columns "acc" (best validation accuracy of the fold, in
        percent), "trait" and "fold".
    """
    if dataset == "kaggle":
        trait_labels = ["E", "N", "F", "J"]
    else:
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]

    n_splits = 10
    expdata = {"acc": [], "trait": [], "fold": []}

    for trait_idx in range(full_targets.shape[1]):
        targets = full_targets[:, trait_idx]

        expdata["trait"].extend([trait_labels[trait_idx]] * n_splits)
        expdata["fold"].extend(np.arange(1, n_splits + 1))

        print(f'Trait: {trait_labels[trait_idx]}')
        skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
        for train_index, test_index in skf.split(inputs, targets):
            x_train, x_test = inputs[train_index], inputs[test_index]
            y_train, y_test = targets[train_index], targets[test_index]
            # converting to one-hot encoding to match the n_classes-wide softmax output
            y_train = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)
            y_test = tf.keras.utils.to_categorical(y_test, num_classes=n_classes)

            # a fresh model per fold so no weights carry over between folds
            model = tf.keras.models.Sequential([
                tf.keras.layers.Dense(50, input_dim=hidden_dim, activation="relu"),
                tf.keras.layers.Dense(n_classes),
                tf.keras.layers.Softmax(axis=-1)
            ])

            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=["accuracy"],
            )

            history = model.fit(
                x_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                validation_data=(x_test, y_test),
                verbose=0,
                shuffle=True
            )
            # The fold's score is its best epoch on the held-out split.
            best_acc_fold = max(history.history["val_accuracy"])
            print(f'Best acc fold: {best_acc_fold}, from these: {history.history["val_accuracy"]}')
            expdata["acc"].append(100 * best_acc_fold)
    print(expdata)
    df = pd.DataFrame.from_dict(expdata)
    return df


# Globals read inside training(). `batch_size` is also read there but is not set
# in this cell, so it must already exist from an earlier cell.
n_classes = 2
hidden_dim = 768
epochs = 10
lr = 5e-4
#tf.config.run_functions_eagerly(False)
# Run the cross-validation on the essays features loaded by get_inputs().
df = training('essays', inputs, full_targets)

Trait: EXT
Best acc fold: 0.5789473652839661, from these: [0.5425100922584534, 0.5789473652839661, 0.546558678150177, 0.5627530217170715, 0.5546558499336243, 0.5748987793922424, 0.5789473652839661, 0.5384615659713745, 0.5587044358253479, 0.5587044358253479]
Best acc fold: 0.5708501935005188, from these: [0.5506072640419006, 0.5546558499336243, 0.5708501935005188, 0.5668016076087952, 0.546558678150177, 0.5668016076087952, 0.5546558499336243, 0.5708501935005188, 0.5546558499336243, 0.5627530217170715]
Best acc fold: 0.6153846383094788, from these: [0.5425100922584534, 0.5708501935005188, 0.5344129800796509, 0.52226722240448, 0.5263158082962036, 0.5587044358253479, 0.5627530217170715, 0.6153846383094788, 0.5141700506210327, 0.5303643941879272]
Best acc fold: 0.5748987793922424, from these: [0.5101214647293091, 0.5627530217170715, 0.5101214647293091, 0.5384615659713745, 0.5425100922584534, 0.5748987793922424, 0.5303643941879272, 0.5384615659713745, 0.5020242929458618, 0.4979757070541382]
B

In [None]:
# tensorflow
import keras.initializers as initializers
def training(dataset, inputs, full_targets):
    """Cross-validate the MLP with a hand-written tf.GradientTape training loop.

    Only the first trait (column 0 of `full_targets`) is trained. Reads the
    notebook-level globals `n_classes`, `hidden_dim`, `lr`, `epochs` and
    `batch_size`.

    Args:
        dataset: accepted for signature parity with the Keras `fit` variant; unused.
        inputs: feature array indexed by sample along the first axis.
        full_targets: 2-D array with one column of class labels per trait.

    Returns:
        DataFrame with columns "acc" (best validation accuracy of the fold, in
        percent), "trait" and "fold".
    """
    trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]

    n_splits = 10
    expdata = {"acc": [], "trait": [], "fold": []}

    for trait_idx in range(full_targets.shape[1]):
        # Only the first trait is trained in this variant.
        if trait_idx != 0:
          continue
        targets = full_targets[:, trait_idx]

        expdata["trait"].extend([trait_labels[trait_idx]] * n_splits)
        expdata["fold"].extend(np.arange(1, n_splits + 1))

        skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
        for train_index, test_index in skf.split(inputs, targets):
            x_train, x_val = inputs[train_index], inputs[test_index]
            y_train, y_val = targets[train_index], targets[test_index]
            # converting to one-hot encoding to match the n_classes-wide softmax output
            y_train = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)
            y_val = tf.keras.utils.to_categorical(y_val, num_classes=n_classes)

            # Prepare the training dataset.
            train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
            train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

            # Prepare the validation dataset.
            val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
            val_dataset = val_dataset.batch(batch_size)

            # a fresh model per fold so no weights carry over between folds
            model = tf.keras.models.Sequential([
                tf.keras.layers.Dense(50, input_dim=hidden_dim, activation="relu", kernel_initializer=initializers.RandomNormal(stddev=0.05), bias_initializer=initializers.Zeros()),
                tf.keras.layers.Dense(n_classes, kernel_initializer=initializers.RandomNormal(stddev=0.05), bias_initializer=initializers.Zeros()),
                tf.keras.layers.Softmax(axis=-1)
            ])

            # Instantiate an optimizer to train the model.
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
            # Instantiate a loss function.
            loss_fn = tf.keras.losses.BinaryCrossentropy()

            # Prepare the metrics.
            train_acc_metric = keras.metrics.BinaryAccuracy()
            val_acc_metric = keras.metrics.BinaryAccuracy()

            train_acc_list = []
            val_acc_list = []
            for epoch in range(epochs):
                # Iterate over the batches of the dataset.
                for x_batch_train, y_batch_train in train_dataset:
                    with tf.GradientTape() as tape:
                        logits = model(x_batch_train, training=True)
                        loss_value = loss_fn(y_batch_train, logits)
                    grads = tape.gradient(loss_value, model.trainable_weights)
                    optimizer.apply_gradients(zip(grads, model.trainable_weights))

                    # Update training metric.
                    train_acc_metric.update_state(y_batch_train, logits)

                # Per-epoch training accuracy, in percent like the validation one.
                train_acc = (train_acc_metric.result() * 100).numpy()
                train_acc_list.append(train_acc)

                # Reset training metrics at the end of each epoch
                train_acc_metric.reset_states()

                # Run a validation loop at the end of each epoch.
                for x_batch_val, y_batch_val in val_dataset:
                    val_logits = model(x_batch_val, training=False)
                    # Update val metrics
                    val_acc_metric.update_state(y_batch_val, val_logits)
                val_acc = (val_acc_metric.result() * 100).numpy()
                val_acc_list.append(val_acc)
                val_acc_metric.reset_states()

            # The fold's score is its best epoch on the held-out split (already in percent).
            best_acc_fold = max(val_acc_list)
            print(f'Best acc fold: {best_acc_fold}, from these: {val_acc_list}')
            expdata["acc"].append(best_acc_fold)

    # Return the per-fold results so the caller's `df = training(...)` gets a frame.
    return pd.DataFrame.from_dict(expdata)
            


# Globals read inside training(). `batch_size` is also read there but is not set
# in this cell, so it must already exist from an earlier cell.
n_classes = 2
hidden_dim = 768
epochs = 10
lr = 5e-4
# Force tf.function-decorated code to run eagerly.
tf.config.run_functions_eagerly(True)
# Run the custom-loop cross-validation on the essays features loaded by get_inputs().
df = training('essays', inputs, full_targets)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                38450     
                                                                 
 dense_1 (Dense)             (None, 2)                 102       
                                                                 
 softmax (Softmax)           (None, 2)                 0         
                                                                 
Total params: 38,552
Trainable params: 38,552
Non-trainable params: 0
_________________________________________________________________
Best acc fold: 57.4898796081543, from these: [56.68016, 55.465584, 57.08502, 55.870445, 55.465584, 55.870445, 57.48988, 55.060726, 56.68016, 56.275303]


KeyboardInterrupt: ignored