# Fine-tune BERT for personality classification

## Setup


In [None]:
# Install the notebook's dependencies (Colab runtime). Only shap is pinned to a
# specific version; every other package resolves to the latest release.
# NOTE(review): consider pinning the remaining packages for reproducibility.
!pip install -q -U watermark
!pip install -qq transformers
!pip install --quiet shap==0.39
!pip install tweet-preprocessor

!pip install tqdm
!pip install livelossplot --quiet
# (Re)load the watermark extension and print the Python / library versions in use.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

[K     |████████████████████████████████| 1.6 MB 4.1 MB/s 
[K     |████████████████████████████████| 4.9 MB 4.1 MB/s 
[K     |████████████████████████████████| 120 kB 49.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 27.8 MB/s 
[K     |████████████████████████████████| 356 kB 4.3 MB/s 
[?25h  Building wheel for shap (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Python implementation: CPython
Python version       : 3.7.14
IPython version      : 7.9.0

numpy       : 1.21.6
pandas      : 1.3.5
torch       : 1.12.1+cu113
transformers: 4.22.0



In [None]:
#@title Setup & Config
from wordcloud import WordCloud
import transformers
from transformers import logging
logging.set_verbosity_error()
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import LongformerConfig, LongformerModel
from transformers import RobertaTokenizer
from transformers.utils.dummy_pt_objects import PreTrainedModel
import shap
import torch
from sklearn.model_selection import train_test_split
import warnings
import preprocessor as p

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

import pickle
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
import sklearn
import keras
from tensorflow.keras.layers import Conv2D, BatchNormalization, GlobalAveragePooling2D, \
Dense, Input, Activation, MaxPool2D
from tensorflow.keras import Model

import numpy as np
import pandas as pd
import re
import csv
# import preprocessor as p
import math
from torch.utils.data import TensorDataset, DataLoader

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm

from tensorflow import summary
import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast 
%load_ext tensorboard

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from livelossplot import PlotLosses

pd.options.display.max_colwidth = 1000
pd.set_option('display.expand_frame_repr', False)

import re
import imageio,glob

import random
# Seed every random number generator used in the notebook (Python, NumPy,
# TensorFlow, PyTorch) with the same value.
seed = 0
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Render matplotlib figures inline, in high-resolution ("retina") format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Print tensors with three decimals and without scientific notation.
torch.set_printoptions(precision=3, sci_mode=False)

# Global seaborn theme; the palette is replaced by the custom one just below.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size (width, height) in inches.
rcParams['figure.figsize'] = 12, 8



# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

In [None]:
# Global settings shared by the data loaders and models defined below.
# A batch size of 16 still fits on the GPU for BERT; 32 has not been tested yet.
batch_size = 16
# Hugging Face checkpoint name used for the tokenizer and the BERT-based models.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#DEVICE =torch.device('cpu')

# Same device selection as in the setup cell: first CUDA GPU if available, else CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

In [None]:
# Mount Google Drive (Colab only) and make the project folder the working
# directory, so that the relative paths used below (e.g. data/..., plots/...)
# resolve inside the project.
from google.colab import drive
drive.mount('/content/drive')
# The backslash escapes the space in "Colab Notebooks" for the %cd magic below.
PROJECT_PATH = 'drive/MyDrive/Colab\ Notebooks/application_project/personality-prediction'
%cd $PROJECT_PATH

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/application_project/personality-prediction


## Functions

In [None]:
def weights_init(m):
    """Initialise ``Linear`` and ``BatchNorm`` layers in place.

    Meant to be used as ``module.apply(weights_init)``. The layer kind is
    detected from the class name, so any class whose name contains ``Linear``
    or ``BatchNorm`` is affected; all other modules are left untouched.

    * Linear:    weight ~ N(0, 0.05), bias = 0
    * BatchNorm: weight ~ N(1, 0.02), bias = 0

    Layers created without a weight or bias (``Linear(..., bias=False)``,
    ``BatchNorm(..., affine=False)``) expose ``None`` for that attribute; those
    are skipped instead of crashing inside ``torch.nn.init``.

    Args:
        m: the module visited by ``nn.Module.apply``.
    """
    classname = m.__class__.__name__
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)

    if classname.find('Linear') != -1:
        if weight is not None:
            torch.nn.init.normal_(weight, mean=0.0, std=0.05)
        if bias is not None:
            torch.nn.init.zeros_(bias)
    elif classname.find('BatchNorm') != -1:
        if weight is not None:
            torch.nn.init.normal_(weight, 1.0, 0.02)
        if bias is not None:
            torch.nn.init.zeros_(bias)

In [None]:
def visualize_layerwise_embeddings(hidden_states,masks,labels,epoch,title, acc,layers_to_visualize=[0,1,2,3,8,9,10,11]):
    """Plot a 2-D t-SNE projection of mean-pooled hidden states, one panel per layer.

    For every requested layer the token embeddings of each sample are averaged
    over the non-padded positions, reduced to two dimensions with t-SNE and
    drawn as a scatter plot coloured by label. The figure is written to
    ``plots/dim_reduction/<title>/<epoch>`` in PNG format.

    Args:
        hidden_states: sequence indexable by layer; each entry is expected to be
            a CPU tensor of shape (samples, seq_len, hidden_dim).
        masks: CPU tensor of shape (samples, seq_len), non-zero for real tokens.
        labels: CPU tensor with one label per sample (any shape; it is flattened).
        epoch: used in the figure title and as the output file name.
        title: used in the figure title and as the output sub-directory.
        acc: accuracy shown (rounded to two decimals) in the figure title.
        layers_to_visualize: indices into ``hidden_states``. The default list is
            never mutated.
    """
    # Local imports keep this cell self-contained.
    import math
    import os

    dim_reducer = TSNE(n_components=2, learning_rate = 'auto', init='pca')
    #dim_reducer = PCA(n_components=2)

    # os.makedirs replaces the former `!mkdir -p`, which only works under
    # IPython and split titles containing spaces into several directories.
    os.makedirs(f'plots/dim_reduction/{title}', exist_ok=True)

    num_layers = len(layers_to_visualize)
    n_cols = 4
    # add_subplot requires integer grid sizes; round up so that layer counts
    # that are not a multiple of four still get a full row.
    n_rows = max(1, math.ceil(num_layers / n_cols))

    fig = plt.figure(figsize=(24, n_rows * 6))  # each subplot is 6x6 inches, four per row
    ax = [fig.add_subplot(n_rows, n_cols, i + 1) for i in range(num_layers)]
    fig.suptitle(f"{title}, Epoch: {epoch}, Accuracy: {str(round(acc, 2))}")

    labels = labels.numpy().reshape(-1)
    # Number of real tokens per sample; clamp avoids a division by zero for an
    # all-padding row.
    token_counts = masks.sum(dim=1, keepdim=True).clamp(min=1)
    token_mask = masks.unsqueeze(-1)

    for i, layer_i in enumerate(layers_to_visualize):
        layer_embeds = hidden_states[layer_i]

        # Zero out padded positions before summing so the mean really is taken
        # over the real tokens only (the divisor already counted only those).
        masked_sum = (layer_embeds * token_mask).sum(dim=1)
        layer_averaged_hidden_states = torch.div(masked_sum, token_counts)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            layer_dim_reduced_embeds = dim_reducer.fit_transform(layer_averaged_hidden_states.numpy())

        df = pd.DataFrame.from_dict({'x':layer_dim_reduced_embeds[:,0],'y':layer_dim_reduced_embeds[:,1],'label':labels})

        df.label = df.label.astype(int)
        sns.scatterplot(data=df,x='x',y='y',hue='label',ax=ax[i], palette="deep")
        ax[i].set_title(f"layer {layer_i+1}")

    plt.savefig(f'plots/dim_reduction/{title}/{epoch}',format='png',pad_inches=0)

In [None]:
# https://www.kaggle.com/code/tanmay17061/transformers-bert-hidden-embeddings-visualization/notebook
# https://stackoverflow.com/questions/15638612/calculating-mean-and-standard-deviation-of-the-data-which-does-not-fit-in-memory
def prepare_visulaize(model, data_loader, epoch, trait_idx, hot_encoding, titel, acc):
    """Collect hidden states for the first batches of a loader and plot them.

    Runs ``model`` in eval mode (without gradients) on at most 15 batches,
    gathers the per-layer hidden states, attention masks and labels on the CPU
    and hands them to ``visualize_layerwise_embeddings``.

    Args:
        model: called as ``model(input_ids, attention_mask,
            output_hidden_states=True)``; must return ``(logits, hidden_states)``.
        data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
        epoch: forwarded to the plot (title and file name).
        trait_idx: column of ``target`` used as the label.
        hot_encoding: kept for signature compatibility. Labels are always stored
            as class indices, because the plot needs exactly one label per
            sample (one-hot rows would double the label count).
        titel: forwarded to the plot as its title / output directory.
        acc: forwarded to the plot title.

    Nothing is plotted when the loader yields no batch.
    """
    max_batches = 15
    model.eval()  # toggle model in eval mode

    mask_batches = []
    label_batches = []
    layer_batches = None  # one list of per-batch CPU tensors per layer

    with torch.no_grad():
        for step, (author_ids, inputs_ids, attention_mask, target) in enumerate(data_loader):
            if step >= max_batches:
                # Stop instead of skipping: iterating the rest of the loader
                # would load every remaining batch for nothing.
                break

            batch_labels = target[:, trait_idx]

            _, hidden_states = model(inputs_ids, attention_mask, output_hidden_states=True)
            # Drop the first entry (presumably the embedding-layer output, so
            # that index 0 is the first encoder layer -- TODO confirm).
            hidden_states = hidden_states[1:]

            mask_batches.append(attention_mask.cpu().float())
            label_batches.append(batch_labels.cpu().float().view(-1, 1))

            if layer_batches is None:
                layer_batches = [[] for _ in hidden_states]
            for store, layer_hidden_state_batch in zip(layer_batches, hidden_states):
                store.append(layer_hidden_state_batch.cpu())

    if layer_batches is None:
        # Empty loader: there is nothing to visualise.
        return

    train_masks = torch.cat(mask_batches)
    train_ys = torch.cat(label_batches)
    # Concatenate once per layer instead of re-copying the running tensor on
    # every batch.
    train_hidden_states = tuple(torch.cat(store) for store in layer_batches)

    visualize_layerwise_embeddings(train_hidden_states,train_masks,train_ys,epoch, titel, acc)

## Data Preprocessing

In [None]:
def preprocess_text(sentence):
    """Clean a raw text before tokenisation.

    Steps, in order:
      1. ``p.clean`` from tweet-preprocessor (per the original author's note it
         removes hyperlinks, hashtags, smileys and emojis).
      2. Replace anything that still looks like ``http...`` by a space.
      3. Replace a single ASCII letter surrounded by whitespace by one space.
         Only the upper-case ``I`` is excluded by the character class.
      4. Replace the ``|||`` separator by a space.
      5. Collapse every run of whitespace into a single space.

    The separator is replaced *before* whitespace is collapsed, so the spaces it
    introduces are normalised as well (previously ``"a ||| b"`` came out with
    three consecutive spaces).

    Args:
        sentence: the raw text.

    Returns:
        The cleaned text.
    """
    sentence = p.clean(sentence)
    # Remove hyperlinks
    sentence = re.sub(r"http\S+", " ", sentence)
    # Remove punctuations and numbers
    # sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # sentence = re.sub('[^a-zA-Z.?!,]', ' ', sentence)
    # Single character removal (except upper-case I)
    sentence = re.sub(r"\s+[a-zA-HJ-Z]\s+", ' ', sentence)
    # Replace the "|||" separator
    sentence = re.sub(r"\|\|\|", " ", sentence)
    # Removing multiple spaces (last, so it also covers the spaces added above)
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


def load_essays_df(datafile):
    """Load the essays CSV into a DataFrame with binary trait labels.

    The first row is treated as a header and skipped. Each remaining row is read
    positionally: column 0 is the user id, column 1 the text and columns 2-6 the
    EXT, NEU, AGR, CON and OPN flags, where ``y`` (case-insensitive) maps to 1
    and anything else to 0. Completely empty rows are ignored.

    Rows are collected in a list and turned into a DataFrame once; growing a
    DataFrame with ``DataFrame.append`` inside the loop is quadratic and that
    method no longer exists in pandas 2.x.

    Args:
        datafile: path of the CSV file (comma separated, ``"`` as quote char).

    Returns:
        DataFrame with columns ``user, text, token_len, EXT, NEU, AGR, CON, OPN``;
        ``token_len`` is initialised to 0 for every row.
    """
    columns = ["user", "text", "token_len", "EXT", "NEU", "AGR", "CON", "OPN"]
    trait_columns = columns[3:]
    records = []

    with open(datafile, "rt") as csvf:
        csvreader = csv.reader(csvf, delimiter=",", quotechar='"')
        next(csvreader, None)  # skip the header row (no-op for an empty file)
        for line in csvreader:
            if not line:
                continue
            record = {"user": line[0], "text": line[1], "token_len": 0}
            # The trait flags start at CSV column 2, in the order of trait_columns.
            for position, trait in enumerate(trait_columns, start=2):
                record[trait] = 1 if line[position].lower() == "y" else 0
            records.append(record)

    # Passing `columns` keeps the schema even when the file has no data rows.
    df = pd.DataFrame(records, columns=columns)

    #print("EXT : ", df["EXT"].value_counts())
    #print("NEU : ", df["NEU"].value_counts())
    #print("AGR : ", df["AGR"].value_counts())
    #print("CON : ", df["CON"].value_counts())
    #print("OPN : ", df["OPN"].value_counts())

    return df


def _encode_padded(tokenizer, tokens, max_length):
    """Encode ``tokens`` with special tokens, truncated and padded to ``max_length``.

    Returns the ``(input_ids, attention_mask)`` pair produced by
    ``tokenizer.encode_plus``.
    """
    encoding = tokenizer.encode_plus(
        tokens,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is the documented
        # replacement, with truncation requested explicitly.
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    return encoding['input_ids'], encoding['attention_mask']


def _encode_docbert(tokenizer, tokens, token_length, docmax_len=2048, subdoc_len=512):
    """Split ``tokens`` into sub-documents and encode each one separately.

    Returns two integer arrays of shape ``(docmax_len // subdoc_len,
    token_length)``: input ids and attention masks. Rows beyond the last
    sub-document are filled with zeros.
    """
    max_subdoc_num = docmax_len // subdoc_len
    subdoc_tokens = [
        tokens[i : i + subdoc_len] for i in range(0, len(tokens), subdoc_len)
    ][:max_subdoc_num]

    token_ids = np.zeros((max_subdoc_num, token_length), dtype=int)
    token_masks = np.zeros((max_subdoc_num, token_length), dtype=int)
    for row, chunk in enumerate(subdoc_tokens):
        token_ids[row], token_masks[row] = _encode_padded(tokenizer, chunk, token_length)
    return token_ids, token_masks


def essays_embeddings(dataframe, tokenizer, token_length, mode):
    """Tokenise the ``text`` column of ``dataframe`` into fixed-length model inputs.

    Args:
        dataframe: must provide a ``text`` and a ``target`` column; its index is
            returned as the author ids.
        tokenizer: Hugging Face style tokenizer offering ``tokenize`` and
            ``encode_plus``.
        token_length: length every encoded sequence is truncated / padded to.
        mode: selects which part of a long text is kept:
            * a digit string (e.g. ``"128"``): like ``"normal"``, but the number
              overrides ``token_length``;
            * ``"normal"``, ``"512_head"``, ``"longformer"``: keep the head;
            * ``"512_tail"``: keep the last ``token_length - 2`` tokens;
            * ``"256_head_tail"``: keep the first and the last half of the
              ``token_length - 2`` token budget;
            * ``"docbert"``: up to four sub-documents of 512 tokens each, encoded
              separately and stacked into one 2-D array per text.

    Returns:
        ``(author_ids, input_ids, attention_masks, targets)``. The three lists
        always have one entry per row; every mode yields an attention mask.

    Raises:
        ValueError: if ``mode`` is not one of the values above. Previously an
            unknown mode silently produced no inputs while still collecting
            targets, i.e. misaligned results.
    """
    known_modes = ("normal", "512_head", "longformer", "512_tail", "256_head_tail", "docbert")
    if mode.isdigit():
        token_length = int(mode)
    elif mode not in known_modes:
        raise ValueError(f"unknown mode {mode!r}; expected a digit string or one of {known_modes}")

    targets = []
    input_ids = []
    attention_masks = []

    df = dataframe
    # Two positions are reserved for the special tokens added during encoding
    # (the original slicing used `token_length - 2`; presumably [CLS] and [SEP]).
    token_budget = max(token_length - 2, 0)

    for ii in range(len(df)):
        # Positional access: `ii` is a row position, not an index label, so this
        # also works for frames whose index is not 0..n-1 and stays aligned with
        # the `df.index` returned as author ids.
        text = preprocess_text(df["text"].iloc[ii])
        tokens = tokenizer.tokenize(text)

        if mode == "docbert":
            ids, mask = _encode_docbert(tokenizer, tokens, token_length)
        else:
            if mode == "512_tail":
                tokens = tokens[max(len(tokens) - token_budget, 0):]
            elif mode == "256_head_tail" and len(tokens) > token_budget:
                # Split the budget between head and tail. The former slice kept
                # almost token_length tokens from *each* end, so truncation
                # threw the tail away again.
                head_len = token_budget // 2
                tail_len = token_budget - head_len
                tokens = tokens[:head_len] + tokens[len(tokens) - tail_len:]
            ids, mask = _encode_padded(tokenizer, tokens, token_length)

        input_ids.append(ids)
        attention_masks.append(mask)
        targets.append(df["target"].iloc[ii])

    author_ids = np.array(df.index)
    return author_ids, input_ids, attention_masks, targets

In [None]:
def load_Kaggle_df(datafile):
    """Load the Kaggle MBTI CSV into a DataFrame with one binary column per axis.

    The first row is treated as a header and skipped. Each remaining row is read
    positionally: column 0 holds the four-letter type, column 1 the text and
    column 3 the user id. The type is split into four flags:
    ``E`` (first letter is ``E``), ``N`` (second is ``N``), ``F`` (third is
    ``F``) and ``J`` (fourth is ``J``). Completely empty rows are ignored.

    Rows are collected in a list and converted once; ``DataFrame.append`` in a
    loop is quadratic and no longer exists in pandas 2.x.

    The value counts of the four flags are printed as a class-balance overview.

    Args:
        datafile: path of the UTF-8 encoded CSV file.

    Returns:
        DataFrame with columns ``user, text, E, N, F, J``.
    """
    columns = ["user", "text", "E", "N", "F", "J"]
    records = []

    with open(datafile, "rt", encoding="utf-8") as csvf:
        csvreader = csv.reader(csvf, delimiter=",", quotechar='"')
        next(csvreader, None)  # skip the header row (no-op for an empty file)
        for line in csvreader:
            if not line:
                continue
            mbti_type = line[0]
            records.append(
                {
                    "user": line[3],
                    "text": line[1],
                    "E": 1 if mbti_type[0] == "E" else 0,
                    "N": 1 if mbti_type[1] == "N" else 0,
                    "F": 1 if mbti_type[2] == "F" else 0,
                    "J": 1 if mbti_type[3] == "J" else 0,
                }
            )

    # Passing `columns` keeps the schema even when the file has no data rows.
    df = pd.DataFrame(records, columns=columns)

    print("E : ", df["E"].value_counts())
    print("N : ", df["N"].value_counts())
    print("F : ", df["F"].value_counts())
    print("J : ", df["J"].value_counts())

    return df

def kaggle_embeddings(datafile, tokenizer, token_length):
    """Load the Kaggle MBTI data and tokenise every text to a fixed length.

    Each text is cleaned with ``preprocess_text``, tokenised and encoded with
    special tokens, truncated / padded to ``token_length``. The first ten
    tokens of the first ten texts and the average (untruncated) token count are
    printed as a sanity check.

    Args:
        datafile: CSV path passed to ``load_Kaggle_df``.
        tokenizer: Hugging Face style tokenizer offering ``tokenize`` and
            ``encode_plus``.
        token_length: length every encoded sequence is truncated / padded to.

    Returns:
        ``(author_ids, input_ids, attention_masks, targets)`` where
        ``author_ids`` is an integer NumPy array and ``targets`` holds one
        ``[E, N, F, J]`` list per text.
    """
    targets = []
    token_len = []
    input_ids = []
    author_ids = []
    attention_masks = []

    df = load_Kaggle_df(datafile)
    for cnt, ind in enumerate(df.index):

        text = preprocess_text(df["text"][ind])
        tokens = tokenizer.tokenize(text)
        token_len.append(len(tokens))
        encoding = tokenizer.encode_plus(
                    tokens,
                    add_special_tokens=True,
                    max_length=token_length,
                    # `pad_to_max_length=True` is deprecated (see the warning in
                    # the cell output); this is the documented replacement,
                    # with truncation requested explicitly.
                    padding="max_length",
                    truncation=True,
                    return_attention_mask=True,
                )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        if cnt < 10:
            print(tokens[:10])

        targets.append([df["E"][ind], df["N"][ind], df["F"][ind], df["J"][ind]])
        author_ids.append(int(df["user"][ind]))

    if token_len:
        # Guarded: the mean of an empty list is NaN and int(NaN) raises.
        print("average length : ", int(np.mean(token_len)))
    author_ids = np.array(author_ids)

    return author_ids, input_ids, attention_masks, targets

In [None]:
# Maximum sequence length (in tokens) every text is truncated / padded to.
token_length = 512
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = token_length
#token_length = 512
# NOTE(review): `mode` is not passed to kaggle_embeddings below; it is only a
# parameter of essays_embeddings.
mode = 'longformer'
# Path is relative to the project directory selected with %cd above.
datafile = "data/kaggle/kaggle.csv"
# Tokenise the whole Kaggle data set once; the results feed the data loaders.
author_ids, input_ids, attention_masks, targets = kaggle_embeddings(
    datafile, tokenizer, token_length
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

E :  0    6676
1    1999
Name: E, dtype: int64
N :  1    7478
0    1197
Name: N, dtype: int64
F :  1    4694
0    3981
Name: F, dtype: int64
J :  0    5241
1    3434
Name: J, dtype: int64
["'", 'and', 'int', '##j', 'moments', 'sports', '##cent', '##er', 'not', 'top']
["'", 'i', "'", 'm', 'finding', 'the', 'lack', 'of', 'me', 'in']
["'", 'good', 'one', '_', '_', '_', '_', '_', 'course', ',']
["'", 'dear', 'int', '##p', ',', 'i', 'enjoyed', 'our', 'conversation', 'the']
["'", 'you', "'", 're', 'fired', '.', 'that', "'", 's', 'another']


The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


["'", '18', '/', '37', '.', 'science', 'is', 'not', 'perfect', '.']
["'", 'no', ',', 'i', 'can', "'", 't', 'draw', 'on', 'my']
["'", 'i', 'tend', 'to', 'build', 'up', 'collection', 'of', 'things', 'on']
['i', "'", 'm', 'not', 'sure', ',', 'that', "'", 's', 'good']
["'", 'in', 'this', 'position', 'where', 'i', 'have', 'to', 'actually', 'let']
average length :  1699


## Dataloader & dataset

In [None]:
# one target
class Bert_Dataset(Dataset):
    """Tensor-backed dataset of tokenised texts and their targets.

    Everything is converted to tensors once, at construction time. Items are
    returned as ``(author_id, input_ids, attention_mask, target)``; the input
    ids and the attention mask are moved to ``DEVICE`` on access, while the
    author id and the target stay where they were created.
    """

    def __init__(self, author_ids, input_ids, attention_masks, targets):
        """Build the dataset.

        Args:
            author_ids: one id per sample.
            input_ids: one token-id sequence per sample (all of equal length).
            attention_masks: one mask sequence per sample (all of equal length).
            targets: object offering ``to_numpy()`` (e.g. a pandas DataFrame);
                stored as a float tensor.
        """
        id_rows = [np.asarray(row) for row in input_ids]
        mask_rows = [np.asarray(row) for row in attention_masks]

        self.author_ids = self._as_tensor(author_ids)
        self.input_ids = self._as_tensor(id_rows)
        self.attention_masks = self._as_tensor(mask_rows)
        self.targets = torch.from_numpy(targets.to_numpy()).float()

    @staticmethod
    def _as_tensor(values):
        """Convert a sequence (or sequence of arrays) to a tensor via NumPy."""
        return torch.from_numpy(np.array(values))

    def __len__(self):
        """Number of samples, i.e. the number of target rows."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return sample ``idx`` with its model inputs placed on ``DEVICE``."""
        sample_ids = self.input_ids[idx].to(DEVICE)
        sample_mask = self.attention_masks[idx].to(DEVICE)
        return (self.author_ids[idx], sample_ids, sample_mask, self.targets[idx])

In [None]:
def _build_bert_dataset(inputs, targets):
    """Wrap one split (inputs frame + targets frame) in a ``Bert_Dataset``."""
    return Bert_Dataset(
        inputs['author_ids'].to_numpy(),
        inputs['input_ids'].to_numpy(),
        inputs['attention_masks'].to_numpy(),
        targets,
    )


def return_dataloader(author_ids, input_ids, attention_masks, targets_arr):
    """Split the tokenised data and wrap it in training / validation loaders.

    10 % of the data is split off first (stratified on all four targets).
    NOTE(review): that held-out test split is currently discarded and not
    returned. From the remaining 90 % two pairs of loaders are built:

    * a tiny pair for smoke tests: the first 10 rows, split 50/50, with a
      non-shuffled training loader;
    * the regular pair: an 85/15 stratified train/validation split with a
      shuffled training loader.

    All loaders use the global ``batch_size``.

    Args:
        author_ids: one id per sample.
        input_ids: one token-id sequence per sample.
        attention_masks: one attention mask per sample.
        targets_arr: one row of four binary targets per sample, in the order
            E, N, F, J.

    Returns:
        ``(train_dataloader, val_dataloader, train_dataloader_small,
        val_dataloader_small)``.
    """
    tokenized_df = pd.DataFrame(list(zip(author_ids, input_ids, attention_masks)),
                  columns =['author_ids', 'input_ids', 'attention_masks']).apply(np.asarray)
    # Column names match the E/N/F/J flags built by load_Kaggle_df (they used to
    # be labelled E/S/T/J, which contradicted the data).
    target_df = pd.DataFrame(targets_arr, columns = ["E", "N", "F", "J"])

    df_inputs_train, _df_inputs_test, df_targets_train, _df_targets_test = train_test_split(
        tokenized_df, target_df, test_size=0.1, stratify=target_df)

    # --- tiny loaders for quick smoke tests --------------------------------
    test_on = 10
    small_inputs = df_inputs_train.iloc[:test_on,]
    small_targets = df_targets_train.iloc[:test_on,]

    small_inputs_train, small_inputs_val, small_targets_train, small_targets_val = train_test_split(
        small_inputs, small_targets, test_size=0.5)

    train_dataloader_small = DataLoader(
        _build_bert_dataset(small_inputs_train, small_targets_train),
        batch_size = batch_size, shuffle = False)
    val_dataloader_small = DataLoader(
        _build_bert_dataset(small_inputs_val, small_targets_val),
        batch_size = batch_size)

    # --- regular loaders -----------------------------------------------------
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        df_inputs_train, df_targets_train, test_size=0.15, stratify=df_targets_train)

    train_dataloader = DataLoader(
        _build_bert_dataset(inputs_train, targets_train),
        batch_size = batch_size, shuffle = True)
    val_dataloader = DataLoader(
        _build_bert_dataset(inputs_val, targets_val),
        batch_size = batch_size)

    return train_dataloader, val_dataloader, train_dataloader_small, val_dataloader_small

#df_targets_test.value_counts()
#iter(train_dataloader_small).next()

In [None]:
#author_ids, inputs_ids, attention_mask, target = iter(train_dataloader_small).next()
#print(f'author_ids: \n{author_ids.cpu().numpy()}\ntargets: \n{target.cpu().numpy()[:,0]}\n inputs_ids: \n{inputs_ids.cpu().numpy()}')

## Model

In [None]:
class personet2output(nn.Module):
  """BERT encoder followed by an MLP head that ends in a softmax over ``n_classes``.

  The head is 768 -> 50 -> ``n_classes`` with ReLU and dropout in between; its
  layers are initialised with ``weights_init``. All BERT weights stay trainable.
  """

  def __init__(self, n_classes):
    """Load the pretrained encoder and build the classification head."""
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)

    head_layers = [
      nn.Linear(768, 50),
      nn.ReLU(),
      nn.Dropout(0.1),
      nn.Linear(50, n_classes),
      nn.Softmax(dim=1),
    ]
    self.MLP = nn.Sequential(*head_layers)
    self.MLP.apply(weights_init)

  def forward(self, input_ids, attention_mask):
    """Return the softmax output of the head for a batch of token ids."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Index 1 of the encoder output is fed to the head (presumably BERT's
    # pooled output -- TODO confirm), after dropout.
    features = self.drop(encoded[1])
    return self.MLP(features)

class personet1output(nn.Module):
  """BERT encoder followed by an MLP head with a single raw output per sample.

  The head is 768 -> 50 -> 1 with ReLU and dropout in between and no final
  activation; its layers are initialised with ``weights_init``. All BERT
  weights stay trainable.
  """

  def __init__(self):
    """Load the pretrained encoder and build the one-output head."""
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)

    head_layers = [
      nn.Linear(768, 50),
      nn.ReLU(),
      nn.Dropout(0.1),
      nn.Linear(50, 1),
    ]
    self.MLP = nn.Sequential(*head_layers)
    self.MLP.apply(weights_init)

  def forward(self, input_ids, attention_mask, output_hidden_states=False):
    """Run the encoder and the head.

    Returns:
      ``(output, hidden_states)`` where ``output`` is the head's result and
      ``hidden_states`` is index 2 of the encoder output when
      ``output_hidden_states`` is true, otherwise ``None``.
    """
    encoded = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      output_hidden_states=output_hidden_states,
    )
    hidden_states = encoded[2] if output_hidden_states else None
    # Index 1 of the encoder output is fed to the head (presumably BERT's
    # pooled output -- TODO confirm), after dropout.
    features = self.drop(encoded[1])
    return self.MLP(features), hidden_states

# predict number of subnets at once 
class allnet(nn.Module):
  """BERT encoder shared by several independent one-output MLP heads.

  Each head is 768 -> 50 -> 1 with a ReLU in between and is initialised with
  ``weights_init``. The heads live in an ``nn.ModuleList`` so that they are
  registered as sub-modules: their weights show up in ``parameters()`` and
  ``state_dict()`` and follow ``.to()``, ``.train()`` and ``.eval()``. With the
  former plain Python list an optimizer built from ``model.parameters()`` never
  updated the heads and saved checkpoints did not contain them.
  """

  def __init__(self, subnets):
    """Load the pretrained encoder and create ``subnets`` heads."""
    super(allnet, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    #self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.drop = nn.Dropout(p=0.3)

    hidden_size = 50
    # All heads are created first and initialised afterwards, as before.
    self.list_MLPS = nn.ModuleList(
      nn.Sequential(
        nn.Linear(768, hidden_size),
        nn.ReLU(), #nn.Dropout(0.1),
        nn.Linear(hidden_size, 1)
      ).to(DEVICE)
      for _ in range(subnets)
    )

    for model in self.list_MLPS:
      model.apply(weights_init)

  #@autocast()
  def forward(self, input_ids, attention_mask):
    """Return the head outputs concatenated along dim 1, one column per head."""
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Index 1 of the encoder output is fed to every head (presumably BERT's
    # pooled output -- TODO confirm), after dropout.
    output = self.drop(outputs[1])

    head_outputs = [head(output) for head in self.list_MLPS]
    return torch.cat(head_outputs, 1)


# simple MLP same as was used in baseline paper
class MLP(nn.Module):
    """Stand-alone classifier: 768 -> 50 -> ``n_classes`` with ReLU and a final softmax.

    The linear layers are initialised with ``weights_init``.
    """

    def __init__(self, n_classes):
        """Build the network for ``n_classes`` output classes."""
        super().__init__()

        layers = (
            nn.Linear(768, 50),
            nn.ReLU(),
            nn.Linear(50, n_classes),
            nn.Softmax(dim=1),
        )
        self.MLP = nn.Sequential(*layers)
        self.MLP.apply(weights_init)

    def forward(self, input):
        """Return the softmax output for a batch of 768-dimensional inputs."""
        return self.MLP(input)

In [None]:
def model_size(model):
    """Print the memory taken by a model's parameters and buffers, in MB (1024**2 bytes).

    Args:
        model: object offering ``parameters()`` and ``buffers()`` iterators of
            tensors (e.g. an ``nn.Module``).
    """
    def _num_bytes(tensors):
        # element count times bytes per element, summed over all tensors
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _num_bytes(model.parameters()) + _num_bytes(model.buffers())
    print('model size: {:.3f}MB'.format(total_bytes / 1024**2))

## Training

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, epoch, trait_idx, hot_encoding):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
      model: called as ``model(input_ids, attention_mask)``; must return a
          ``(logits, extra)`` pair.
      data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
      loss_fn: applied to ``(sigmoid(logits), target)``.
      optimizer: an iterable of optimizers that are all stepped on every batch;
          a single ``torch.optim.Optimizer`` is accepted as well.
      epoch: unused; kept for signature compatibility.
      trait_idx: column of ``target`` to train on.
      hot_encoding: if true the selected target column is one-hot encoded into
          two classes, otherwise it is reshaped to a column vector.

  Returns:
      ``(accuracy_in_percent, mean_of_the_batch_losses)``.
  """
  # Generalisation: allow a bare optimizer instead of a list of optimizers.
  if isinstance(optimizer, torch.optim.Optimizer):
      optimizer = [optimizer]

  model = model.train()  # set once; nothing in the loop leaves training mode
  train_acc_metric = keras.metrics.BinaryAccuracy()
  sigmoid = nn.Sigmoid()
  losses = []
  for step, (author_ids, inputs_ids, attention_mask, target) in enumerate(data_loader):
      target = target[:,trait_idx]
      if hot_encoding:
          one_hot_encoding = tf.keras.utils.to_categorical(target, num_classes=2)
          target = torch.from_numpy(one_hot_encoding).float().to(DEVICE)
      else:
          # (batch,) -> (batch, 1)
          target = target.to(DEVICE).unsqueeze(0).t()

      # basic training steps
      # (loop variable is not called `optim`, which would shadow the imported
      # torch.optim module inside this function)
      for single_optimizer in optimizer:
          single_optimizer.zero_grad()
      logits, _ = model(inputs_ids, attention_mask)
      proba = sigmoid(logits)
      loss_value = loss_fn(proba, target)
      loss_value.backward()
      losses.append(loss_value.item())
      for single_optimizer in optimizer:
          single_optimizer.step()
      train_acc_metric.update_state(target.cpu(), proba.cpu().detach())

  # metrics gathering (the metric is local, so it needs no reset afterwards)
  train_acc = (train_acc_metric.result() * 100).numpy()
  #print(f'train_acc : {train_acc}')
  #print(f'train_loss: {np.mean(losses)}')

  return train_acc, np.mean(losses)

def val_epoch(model, data_loader, loss_fn, epoch, trait_idx, hot_encoding):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
      model: called as ``model(input_ids, attention_mask)``; must return a
          ``(logits, extra)`` pair.
      data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
      loss_fn: applied to ``(sigmoid(logits), target)``.
      epoch: unused; kept for signature compatibility.
      trait_idx: column of ``target`` to evaluate.
      hot_encoding: if true the selected target column is one-hot encoded into
          two classes, otherwise it is reshaped to a column vector.

  Returns:
      ``(accuracy_in_percent, mean_of_the_batch_losses)``.
  """
  model = model.eval()
  accuracy = keras.metrics.BinaryAccuracy()
  to_proba = nn.Sigmoid()
  batch_losses = []

  with torch.no_grad():
      for _step, (_author_ids, inputs_ids, attention_mask, batch_target) in enumerate(data_loader):
          trait_target = batch_target[:, trait_idx]
          if not hot_encoding:
              # (batch,) -> (batch, 1)
              trait_target = trait_target.to(DEVICE).unsqueeze(0).t()
          else:
              encoded = tf.keras.utils.to_categorical(trait_target, num_classes=2)
              trait_target = torch.from_numpy(encoded).float().to(DEVICE)

          logits, _ = model(inputs_ids, attention_mask)
          proba = to_proba(logits)
          batch_losses.append(loss_fn(proba, trait_target).item())
          accuracy.update_state(trait_target.cpu(), proba.cpu())

  # metrics gathering
  val_acc = (accuracy.result() * 100).numpy()
  return val_acc, np.mean(batch_losses)

In [None]:
def predict(model, data_loader, loss_fn, epoch, trait_idx, hot_encoding):
  """Run ``model`` over ``data_loader`` and collect its per-sample predictions.

  Accuracy (in percent) and the mean batch loss are printed once the whole
  loader has been processed.

  Args:
      model: called as ``model(input_ids, attention_mask)``; must return a
          ``(logits, extra)`` pair.
      data_loader: yields ``(author_ids, input_ids, attention_mask, target)``.
      loss_fn: applied to ``(sigmoid(logits), target)``.
      epoch: unused; kept for signature compatibility.
      trait_idx: column of ``target`` to predict.
      hot_encoding: if true the selected target column is one-hot encoded into
          two classes, otherwise it is reshaped to a column vector.

  Returns:
      A list of three lists: author ids, predicted probabilities and targets,
      in loader order.
  """
  model = model.eval()
  accuracy = keras.metrics.BinaryAccuracy()
  to_proba = nn.Sigmoid()
  batch_losses = []
  seen_authors, seen_probas, seen_targets = [], [], []

  with torch.no_grad():
      for _step, (author_ids, inputs_ids, attention_mask, batch_target) in enumerate(data_loader):
          trait_target = batch_target[:, trait_idx]
          if not hot_encoding:
              # (batch,) -> (batch, 1)
              trait_target = trait_target.to(DEVICE).unsqueeze(0).t()
          else:
              encoded = tf.keras.utils.to_categorical(trait_target, num_classes=2)
              trait_target = torch.from_numpy(encoded).float().to(DEVICE)

          logits, _ = model(inputs_ids, attention_mask)
          proba = to_proba(logits)

          seen_authors.extend(author_ids.cpu().tolist())
          seen_probas.extend(proba.cpu().tolist())
          seen_targets.extend(trait_target.cpu().tolist())

          batch_losses.append(loss_fn(proba, trait_target).item())
          accuracy.update_state(trait_target.cpu(), proba.cpu())

  # metrics gathering
  val_acc = (accuracy.result() * 100).numpy()
  print(f'val_acc  : {val_acc}')
  print(f'val_loss : {np.mean(batch_losses)}')

  return [seen_authors, seen_probas, seen_targets]

In [None]:
# TensorBoard writer (torch.utils.tensorboard); with no log_dir given it uses the
# library's default run directory, and the empty comment adds no suffix to it.
# NOTE(review): `writer` is not referenced in the visible part of full_training.
writer = SummaryWriter(comment="")
# Alternative logging through tensorflow.summary, currently disabled:
#current_time = str(datetime.datetime.now().now().strftime("%Y%m%d-%H%M%S"))
#log_dir = 'logs/tensorboard/' + current_time
#summary_writer = summary.create_file_writer(log_dir)

def full_training(train_dataloader, val_dataloader, trait, trait_idx, hp):
    """Train a fresh ``personet1output`` model on one trait and report accuracies.

    A new model is built and moved to ``DEVICE``. Two optimizers are created:
    AdamW over ``model.bert.parameters()`` (learning rate ``hp['lr_bert']``)
    and Adam over ``model.MLP.parameters()`` (learning rate ``hp['lr_mlp']``,
    ``eps=1e-07``), both with weight decay ``hp['wd']``. They are handed to
    ``train_epoch`` as a list. The loss is ``nn.BCELoss(reduction='sum')``.
    After every epoch ``val_epoch`` is run on ``val_dataloader``.

    Args:
        train_dataloader: passed unchanged to ``train_epoch``.
        val_dataloader: passed unchanged to ``val_epoch``.
        trait: trait label; only referenced by the disabled checkpoint path.
        trait_idx: index of the trait, forwarded to the epoch functions.
        hp: mapping providing ``'epochs'``, ``'lr_mlp'``, ``'lr_bert'``,
            ``'wd'`` and ``'hot_encoding'``. Other keys are ignored.

    Returns:
        ``(model, history)`` where ``history`` is a dict with the per-epoch
        lists ``'train_acc'``, ``'train_loss'``, ``'val_acc'`` and
        ``'val_loss'``. Callers that ignore the return value are unaffected.
    """
    print(f'trait_idx: {trait_idx}')

    # Hyperparameters actually consumed here.
    n_epochs = hp['epochs']
    lr_mlp = hp['lr_mlp']
    lr_bert = hp['lr_bert']
    wd = hp['wd']
    hot_encoding = hp['hot_encoding']

    # Model, optimizers, loss. The encoder and the MLP head get separate
    # optimizers so that they can use different learning rates.
    model = personet1output().to(DEVICE)
    optimizer_bert = torch.optim.AdamW(model.bert.parameters(), lr=lr_bert, weight_decay=wd)
    optimizer_mlp = torch.optim.Adam(model.MLP.parameters(), lr=lr_mlp, eps=1e-07, weight_decay=wd)
    optimizer = [optimizer_bert, optimizer_mlp]
    loss_fn = nn.BCELoss(reduction='sum').to(DEVICE)

    # Per-epoch metrics.
    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []

    for epoch in tqdm(range(n_epochs)):
        # visualization (disabled)
        #prepare_visulaize(model, train_dataloader, epoch, trait_idx, hot_encoding, 'Train', train_acc)
        #prepare_visulaize(model, val_dataloader, epoch, trait_idx, hot_encoding, 'Val', val_acc)

        train_acc, train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, epoch, trait_idx, hot_encoding)
        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)

        val_acc, val_loss = val_epoch(model, val_dataloader, loss_fn, epoch, trait_idx, hot_encoding)
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        # Report at the last epoch, whatever hp['epochs'] is (previously a
        # hard-coded `epoch == 4`, which only matched epochs == 5).
        if epoch == n_epochs - 1:
            print(f'Save model, val acc: {val_acc}')
            # NOTE: checkpointing itself is currently disabled; only the
            # message above is emitted.
            #print(f'pretrained_model/pretrained_one_class/{trait}.bin')
            #torch.save(model.state_dict(), f'pretrained_model/pretrained_one_class/{trait}.bin')

    history = {
        'train_acc': train_acc_list,
        'train_loss': train_loss_list,
        'val_acc': val_acc_list,
        'val_loss': val_loss_list,
    }

    print('')
    print('Final: ')
    if not val_acc_list:
        # max() would raise on empty lists when no epoch was run.
        print('No epochs were run.')
        return model, history

    train_acc_best = max(train_acc_list)
    print(f'Best train acc: {train_acc_best}, from: {train_acc_list}')
    val_acc_best = max(val_acc_list)
    print(f'Best val  acc: {val_acc_best}, from: {val_acc_list}')

    return model, history

In [None]:
trait_labels = ["E", "N", "F", "J"]
token_length_list = [512]
# Indices into trait_labels that should be trained in this run.
to_do = [1,2,3]

hyperparameter = {
    'epochs': 5,
    'lr_mlp': 1e-4,
    'lr_bert': 2e-5,
    'batch_size': 16,
    'n_classes': 2,
    'hidden_dim': 768,
    'wd': 1e-6,
    'hot_encoding': False,
}

# The tokenizer depends on neither the trait nor the token length, so it is
# loaded once instead of on every loop iteration.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Tokenised corpus per token length. kaggle_embeddings is only given the data
# file, the tokenizer and the token length (never the trait), so its result is
# reused across traits instead of re-tokenising the whole corpus each time.
# NOTE(review): assumes kaggle_embeddings reads no trait-dependent global state.
embeddings_by_length = {}

for trait_idx, trait in enumerate(trait_labels):
    if trait_idx not in to_do:
        continue
    print(f'Trait: {trait}')
    for token_length in token_length_list:
        # NOTE(review): base / token_l / mode are not read in this cell; they
        # are kept because other cells may read them as notebook globals.
        base = ''
        token_l = 0
        mode = token_length

        if token_length not in embeddings_by_length:
            # tuple(...) so the cached value can be unpacked repeatedly.
            embeddings_by_length[token_length] = tuple(kaggle_embeddings(
                datafile, tokenizer, token_length
            ))
        author_ids, input_ids, attention_masks, targets = embeddings_by_length[token_length]

        # Dataloaders are still rebuilt for every trait, as before.
        train_dataloader, val_dataloader, train_dataloader_small, val_dataloader_small = return_dataloader(author_ids, input_ids, attention_masks, targets)

        print(f'token_length: {token_length}')
        full_training(train_dataloader, val_dataloader, trait, trait_idx, hyperparameter)


    # predict
    #model = personet1output().to(DEVICE)
    #model.load_state_dict(torch.load(f'pretrained_model/pretrained_one_class/{trait}_1_output.bin'))

Trait: N
E :  0    6676
1    1999
Name: E, dtype: int64
N :  1    7478
0    1197
Name: N, dtype: int64
F :  1    4694
0    3981
Name: F, dtype: int64
J :  0    5241
1    3434
Name: J, dtype: int64
["'", 'and', 'int', '##j', 'moments', 'sports', '##cent', '##er', 'not', 'top']
["'", 'i', "'", 'm', 'finding', 'the', 'lack', 'of', 'me', 'in']
["'", 'good', 'one', '_', '_', '_', '_', '_', 'course', ',']
["'", 'dear', 'int', '##p', ',', 'i', 'enjoyed', 'our', 'conversation', 'the']
["'", 'you', "'", 're', 'fired', '.', 'that', "'", 's', 'another']
["'", '18', '/', '37', '.', 'science', 'is', 'not', 'perfect', '.']
["'", 'no', ',', 'i', 'can', "'", 't', 'draw', 'on', 'my']
["'", 'i', 'tend', 'to', 'build', 'up', 'collection', 'of', 'things', 'on']
['i', "'", 'm', 'not', 'sure', ',', 'that', "'", 's', 'good']
["'", 'in', 'this', 'position', 'where', 'i', 'have', 'to', 'actually', 'let']
average length :  1699
token_length: 512
trait_idx: 1


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 5/5 [30:21<00:00, 364.38s/it]

Save model, val acc: 88.05461120605469

Final: 
Best train acc: 96.26224517822266, from: [86.17935, 88.244156, 91.1982, 94.287865, 96.262245]
Best val  acc: 89.50511932373047, from: [86.177475, 83.105804, 89.16382, 89.50512, 88.05461]
Trait: F





E :  0    6676
1    1999
Name: E, dtype: int64
N :  1    7478
0    1197
Name: N, dtype: int64
F :  1    4694
0    3981
Name: F, dtype: int64
J :  0    5241
1    3434
Name: J, dtype: int64
["'", 'and', 'int', '##j', 'moments', 'sports', '##cent', '##er', 'not', 'top']
["'", 'i', "'", 'm', 'finding', 'the', 'lack', 'of', 'me', 'in']
["'", 'good', 'one', '_', '_', '_', '_', '_', 'course', ',']
["'", 'dear', 'int', '##p', ',', 'i', 'enjoyed', 'our', 'conversation', 'the']
["'", 'you', "'", 're', 'fired', '.', 'that', "'", 's', 'another']


The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


["'", '18', '/', '37', '.', 'science', 'is', 'not', 'perfect', '.']
["'", 'no', ',', 'i', 'can', "'", 't', 'draw', 'on', 'my']
["'", 'i', 'tend', 'to', 'build', 'up', 'collection', 'of', 'things', 'on']
['i', "'", 'm', 'not', 'sure', ',', 'that', "'", 's', 'good']
["'", 'in', 'this', 'position', 'where', 'i', 'have', 'to', 'actually', 'let']
average length :  1699
token_length: 512
trait_idx: 2


100%|██████████| 5/5 [30:19<00:00, 363.87s/it]

Save model, val acc: 83.70307159423828

Final: 
Best train acc: 95.55387878417969, from: [69.58553, 83.737755, 89.223816, 93.73022, 95.55388]
Best val  acc: 84.81228637695312, from: [81.399315, 84.81229, 84.21501, 83.532425, 83.70307]
Trait: J





E :  0    6676
1    1999
Name: E, dtype: int64
N :  1    7478
0    1197
Name: N, dtype: int64
F :  1    4694
0    3981
Name: F, dtype: int64
J :  0    5241
1    3434
Name: J, dtype: int64
["'", 'and', 'int', '##j', 'moments', 'sports', '##cent', '##er', 'not', 'top']
["'", 'i', "'", 'm', 'finding', 'the', 'lack', 'of', 'me', 'in']
["'", 'good', 'one', '_', '_', '_', '_', '_', 'course', ',']
["'", 'dear', 'int', '##p', ',', 'i', 'enjoyed', 'our', 'conversation', 'the']
["'", 'you', "'", 're', 'fired', '.', 'that', "'", 's', 'another']


The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


["'", '18', '/', '37', '.', 'science', 'is', 'not', 'perfect', '.']
["'", 'no', ',', 'i', 'can', "'", 't', 'draw', 'on', 'my']
["'", 'i', 'tend', 'to', 'build', 'up', 'collection', 'of', 'things', 'on']
['i', "'", 'm', 'not', 'sure', ',', 'that', "'", 's', 'good']
["'", 'in', 'this', 'position', 'where', 'i', 'have', 'to', 'actually', 'let']
average length :  1699
token_length: 512
trait_idx: 3


100%|██████████| 5/5 [30:19<00:00, 363.88s/it]

Save model, val acc: 76.36518859863281

Final: 
Best train acc: 91.30369567871094, from: [64.49133, 77.3474, 81.86887, 86.08892, 91.303696]
Best val  acc: 77.64505004882812, from: [75.682594, 77.38908, 77.30376, 77.64505, 76.36519]



