# Fine-tune BERT (Sentence-BERT) for personality classification

## Setup


In [None]:
!pip install git+https://github.com/allenai/longformer.git
!pip install -q -U watermark
!pip install -qq transformers
!pip install tqdm
!pip install sentence_transformers
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers
# !mkdir ./PRIMERA_model
# !wget https://storage.googleapis.com/primer_summ/PRIMER-large.tar.gz --directory-prefix ./PRIMERA_model
# !git clone https://github.com/allenai/PRIMER.git ./PRIMER
# %cd PRIMER && pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/allenai/longformer.git
  Cloning https://github.com/allenai/longformer.git to /tmp/pip-req-build-yrnzgrha
  Running command git clone -q https://github.com/allenai/longformer.git /tmp/pip-req-build-yrnzgrha
Collecting transformers@ git+http://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers
  Cloning http://github.com/ibeltagy/transformers.git (to revision longformer_encoder_decoder) to /tmp/pip-install-pmipfh1t/transformers_286e5007853a4ddd99b0b9054f7a1224
  Running command git clone -q http://github.com/ibeltagy/transformers.git /tmp/pip-install-pmipfh1t/transformers_286e5007853a4ddd99b0b9054f7a1224
  Running command git checkout -b longformer_encoder_decoder --track origin/longformer_encoder_decoder
  Switched to a new branch 'longformer_encoder_decoder'
  Branch 'longformer_encoder_decoder' set up to track remote branch

In [None]:
!nvidia-smi

Wed Aug  3 09:23:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#@title Setup & Config
import transformers
from transformers import logging
logging.set_verbosity_error()
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from joblib import dump, load


import numpy as np
import pandas as pd
import re
import csv
# import preprocessor as p
import math
import tensorflow as tf

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Plot styling: seaborn whitegrid theme with slightly enlarged fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
# Trait codes. load_essays_df_dict builds every label vector in this same
# order (EXT, NEU, AGR, CON, OPN).
personality_list = ["EXT", "NEU", "AGR", "CON", "OPN"]

# Replaces the 'muted' palette chosen in sns.set() above with the custom colours.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size as (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed the NumPy and PyTorch RNGs. Python's built-in `random` module is not
# seeded here.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

In [None]:
# declare global settings
# NOTE(review): train() further down builds its DataLoader with a hard-coded
# batch size of 128 and never reads `batch_size` or `lr`; neither name is
# referenced by the cells visible in this notebook.
batch_size = 16 # this is still possible on the gpu for Bert - 32 not tested yet
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
lr = 5e-4
# Module-level tokenizer used by tokenize(). The summarisation cell later
# rebinds this same name to the PRIMERA tokenizer, so tokenize() behaves
# differently depending on which cells have been run.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Same expression as the DEVICE assignment in the setup cell.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#DEVICE =torch.device('cpu')

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from google.colab import drive
# Mount Google Drive so the project folder becomes reachable from the runtime
# (requires the google.colab package, i.e. a Colab session).
drive.mount('/content/drive')
# Relative path of the project folder; the %cd magic below switches into it,
# and every relative data/model path in later cells is resolved from there.
PROJECT_PATH = 'drive/MyDrive/application_project/personality-prediction'
%cd $PROJECT_PATH

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/148ZL90NxBRahz0n-SPpqyLGayyjTqBbV/application_project/personality-prediction


## Data Preprocessing

We'll load the essays dataset (`data/essays/essays.csv`). Each row holds an author id, the essay text, and five binary personality labels (EXT, NEU, AGR, CON, OPN):

In [None]:
from os import name
import random
def preprocess_text(sentence):
    """Normalise a raw text string before tokenisation.

    Steps, in order:
      1. every hyperlink (``http`` followed by non-whitespace characters) is
         replaced by a space,
      2. every ``|||`` separator is replaced by a space,
      3. every run of whitespace is collapsed into a single space.

    The separator is replaced *before* whitespace is collapsed so that the
    spaces it leaves behind are squeezed too (previously "a ||| b" came out
    with three consecutive spaces).

    Args:
        sentence: the text to clean (str).

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    # Remove hyperlinks
    sentence = re.sub(r"http\S+", " ", sentence)
    # Remove the '|||' separators
    sentence = re.sub(r"\|\|\|", " ", sentence)
    # Collapse whitespace last, so no earlier substitution can reintroduce runs
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


def load_essays_df_dict(datafile):
    """Load the essays CSV into a single DataFrame with multi-label targets.

    The file is parsed with ``csv.reader`` (comma-delimited, ``"`` as quote
    character). The first row is treated as a header and skipped. In every
    other row, column 0 is the author/file id, column 1 the essay text and
    columns 2-6 the EXT, NEU, AGR, CON and OPN flags. A flag equal to "y"
    (case-insensitive) becomes 1, anything else becomes 0. Completely empty
    rows are skipped.

    All records are collected first and the DataFrame is built once; growing
    it with ``DataFrame.append`` per row is quadratic and no longer exists in
    pandas 2.x.

    Args:
        datafile: path of the CSV file.

    Returns:
        A one-element list ``[df]``. ``df`` has a 0..n-1 index and the columns
        ``user``, ``text1``, ``text2`` (never filled here, so NaN),
        ``token_len`` (initialised to 0) and ``label`` (a list of five 0/1
        ints ordered EXT, NEU, AGR, CON, OPN).

    Raises:
        IndexError: if a non-empty data row has fewer than seven columns.
    """
    columns = ["user", "text1", "text2", "token_len", "label"]
    # CSV column positions of the EXT, NEU, AGR, CON and OPN flags.
    trait_columns = (2, 3, 4, 5, 6)

    # newline="" lets the csv module handle line breaks inside quoted essays
    # itself, as the csv documentation recommends.
    with open(datafile, "rt", newline="") as csvf:
        rows = csv.reader(csvf, delimiter=",", quotechar='"')
        next(rows, None)  # skip the header row (no-op on an empty file)
        records = [
            {
                "user": row[0],
                "text1": row[1],
                "token_len": 0,
                "label": [1 if row[col].lower() == "y" else 0 for col in trait_columns],
            }
            for row in rows
            if row  # csv.reader yields [] for blank lines
        ]

    # "text2" is absent from the records, so the column is created as NaN.
    df = pd.DataFrame(records, columns=columns)
    return [df]

        
def tokenize(text, mode, token_length, input_ids, attention_masks):
    """Encode ``text`` with the module-level ``tokenizer`` and collect the ids.

    The function both returns the encoded ids and appends them to the
    caller-supplied accumulator lists; essays_embeddings() relies on the
    accumulators and ignores the return value.

    Modes:
      * ``"normal"`` / ``"512_head"``: one encoding padded to ``token_length``.
        The ids are appended to ``input_ids`` and the mask to
        ``attention_masks``. (Previously nothing was appended and the second
        ``return`` was unreachable, so callers got empty lists.)
      * ``"docbert"``: the tokens are split into chunks of 512, at most
        2048 // 512 = 4 chunks are kept, each chunk is encoded and padded to
        ``token_length``, and the stack is zero-padded to 4 rows. The array is
        appended to ``input_ids``; ``attention_masks`` is not touched.
      * any other value: nothing is appended and ``None`` is returned.

    Args:
        text: string to encode.
        mode: one of the modes above.
        token_length: ``max_length`` passed to the tokenizer, and the row
            width of the docbert array.
        input_ids: list that receives the encoded ids (mutated in place).
        attention_masks: list that receives the attention mask (mutated in
            place, normal/512_head modes only).

    Returns:
        The id list (normal/512_head), the ``(4, token_length)`` int array
        (docbert), or ``None`` for an unknown mode.

    NOTE(review): ``tokenizer`` is a notebook global; a later cell rebinds it
    to the PRIMERA tokenizer, so results depend on which cells have run.
    """
    tokens = tokenizer.tokenize(text)

    if mode == "normal" or mode == "512_head":
        encoding = tokenizer.encode_plus(
            tokens,
            add_special_tokens=True,
            max_length=token_length,
            pad_to_max_length=True,
            return_attention_mask=True,
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        return encoding['input_ids']

    elif mode == "docbert":
        docmax_len = 2048
        subdoc_len = 512
        max_subdoc_num = docmax_len // subdoc_len
        subdoc_tokens = [
            tokens[i : i + subdoc_len] for i in range(0, len(tokens), subdoc_len)
        ][:max_subdoc_num]
        token_ids = [
            tokenizer.encode(
                x,
                add_special_tokens=True,
                max_length=token_length,
                pad_to_max_length=True,
            )
            for x in subdoc_tokens
        ]
        # The explicit 2-D shape keeps an empty text (no chunks) usable: it
        # becomes a (0, token_length) array instead of a 1-D one that cannot
        # be concatenated with the zero padding below.
        token_ids = np.array(token_ids).astype(int).reshape(len(subdoc_tokens), token_length)

        # Zero rows needed to bring every document up to max_subdoc_num chunks.
        buffer_len = max_subdoc_num - token_ids.shape[0]
        tmp = np.full(shape=(buffer_len, token_length), fill_value=0, dtype=int)
        token_ids = np.concatenate((token_ids, tmp), axis=0)

        input_ids.append(token_ids)

        return token_ids


def essays_embeddings(df, tokenizer, token_length, mode, personality):
    """Tokenise both texts of every row and collect ids, masks and targets.

    Side effects:
      * ``df`` is modified in place: ``token_len`` is filled with the token
        count of ``text1`` and the frame is sorted by (``token_len``, ``user``).
      * the sorted ``user`` column is written to
        ``data/essays/author_id_order.csv``.

    Rows are visited by position in the sorted frame, so the returned lists
    line up with the saved author order. (Previously rows were looked up by
    index label, which ignored the sort.)

    Args:
        df: DataFrame with ``user``, ``text1``, ``text2``, ``token_len`` and
            either ``label`` or the five trait columns (see ``personality``).
        tokenizer: tokenizer used to measure ``token_len``.
        token_length: maximum sequence length forwarded to tokenize().
        mode: tokenisation mode forwarded to tokenize().
        personality: anything other than ``'all'`` takes the target from the
            ``label`` column; ``'all'`` builds it from the ``EXT``, ``NEU``,
            ``AGR``, ``CON`` and ``OPN`` columns.

    Returns:
        ``(input_ids1, attention_masks1, input_ids2, attention_masks2, targets)``.

    NOTE(review): tokenize() encodes with the notebook-global ``tokenizer``,
    so the ``tokenizer`` argument only influences ``token_len``.
    NOTE(review): frames from load_essays_df_dict have no trait columns and a
    NaN ``text2``; this function presumably expects a pair frame - confirm.
    """
    targets = []
    input_ids1, attention_masks1 = [], []
    input_ids2, attention_masks2 = [], []

    # Record the token count of every essay, then sort ascending by length.
    for ind in df.index:
        df.at[ind, "token_len"] = len(tokenizer.tokenize(df["text1"][ind]))

    df.sort_values(by=["token_len", "user"], inplace=True, ascending=True)
    df["user"].to_csv("data/essays/author_id_order.csv", index_label="order")

    print('Average token length: ', df["token_len"].mean())

    # iterrows() walks the frame in its current (sorted) row order.
    for _, row in df.iterrows():
        tokenize(preprocess_text(row["text1"]), mode, token_length, input_ids1, attention_masks1)
        tokenize(preprocess_text(row["text2"]), mode, token_length, input_ids2, attention_masks2)

        # Multi label targets or not
        if personality != 'all':
            targets.append(row['label'])
        else:
            targets.append([row['EXT'], row["NEU"], row["AGR"],
                            row["CON"], row["OPN"]])

    print("loaded all input_ids and targets from the data file!")

    return input_ids1, attention_masks1, input_ids2, attention_masks2, targets



In [None]:
# Optional prefix for the data path; empty because the working directory is
# already the project folder.
base = ''
datafile = base + "data/essays/essays.csv"

# load_essays_df_dict returns a one-element list; keep the DataFrame itself.
df_all = load_essays_df_dict(datafile)[0]

In [None]:
df_all

Unnamed: 0,user,text1,text2,token_len,label
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",,0,"[0, 1, 1, 0, 1]"
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",,0,"[0, 0, 1, 0, 0]"
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,,0,"[0, 1, 0, 1, 1]"
3,1997_568848.txt,I can't believe it! It's really happening! M...,,0,"[1, 0, 1, 1, 0]"
4,1997_688160.txt,"Well, here I go with the good old stream of co...",,0,"[1, 0, 1, 0, 1]"
...,...,...,...,...,...
2462,2004_493.txt,I'm home. wanted to go to bed but remembe...,,0,"[0, 1, 0, 1, 0]"
2463,2004_494.txt,Stream of consiousnesssskdj. How do you s...,,0,"[1, 1, 0, 0, 1]"
2464,2004_497.txt,"It is Wednesday, December 8th and a lot has be...",,0,"[0, 0, 1, 0, 0]"
2465,2004_498.txt,"Man this week has been hellish. Anyways, now i...",,0,"[0, 1, 0, 0, 1]"


In [None]:
# For each trait (in label order EXT, NEU, AGR, CON, OPN) print the shape of
# the sub-frame of essays whose label vector has that trait and no other.
for one_hot in ([1,0,0,0,0], [0,1,0,0,0], [0,0,1,0,0], [0,0,0,1,0], [0,0,0,0,1]):
    is_match = df_all['label'].apply(lambda x: x == one_hot)
    print(df_all.loc[is_match].shape)

(47, 5)
(141, 5)
(53, 5)
(43, 5)
(59, 5)


In [None]:
indices = []
indices.append(df_all.loc[df_all['label'].apply(lambda x: x == [1,0,0,0,0])][0:5].index.to_list())
indices.append(df_all.loc[df_all['label'].apply(lambda x: x == [0,1,0,0,0])][0:5].index.to_list())
indices.append(df_all.loc[df_all['label'].apply(lambda x: x == [0,0,1,0,0])][0:5].index.to_list())
indices.append(df_all.loc[df_all['label'].apply(lambda x: x == [0,0,0,1,0])][0:5].index.to_list())
indices.append(df_all.loc[df_all['label'].apply(lambda x: x == [0,0,0,0,1])][0:5].index.to_list())
indices

[[72, 134, 181, 236, 265],
 [12, 30, 33, 42, 67],
 [1, 6, 20, 100, 118],
 [110, 138, 221, 245, 250],
 [16, 36, 43, 68, 76]]

In [None]:
# summaries
from transformers import (
    AutoTokenizer,
    LEDConfig,
    LEDForConditionalGeneration,
)
# Load the PRIMERA multi-document summarisation checkpoint.
# NOTE(review): this rebinds the global `tokenizer` that tokenize() reads, so
# after this cell tokenize() no longer uses the BERT tokenizer created in the
# settings cell.
tokenizer = AutoTokenizer.from_pretrained('allenai/PRIMERA')
# `config` is not referenced again in the cells visible here.
config=LEDConfig.from_pretrained('allenai/PRIMERA')
model = LEDForConditionalGeneration.from_pretrained('allenai/PRIMERA')

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/20.0 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

In [None]:
# Build one multi-document input per trait: the five reference essays of the
# trait, cleaned and joined with PRIMERA's '<doc-sep>' marker.
texts = []
for trait_indices in indices:
  essays = [preprocess_text(df_all["text1"][i]) for i in trait_indices]
  texts.append('<doc-sep> '.join(essays))

# Summarise every trait's documents. The loop previously encoded texts[0] on
# each iteration, so all five "summaries" were copies of the first trait's.
summaries = []

for text in texts:
  inputs = tokenizer.encode(text, return_tensors="pt")
  # Global attention on the first token only.
  # NOTE(review): PRIMERA's reference usage presumably also puts global
  # attention on the <doc-sep> tokens - confirm against the model card.
  global_attention_mask = torch.zeros_like(inputs)
  global_attention_mask[:, 0] = 1
  summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=512)
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  summaries.append(summary)

In [None]:
len(summaries)

5

In [None]:
# With the summary text for each label --> Gotten from the selected texts
# This means we have 5 pairs for each input

datasets = []
# Reference essays (the ones the summaries were generated from) are excluded
# from both splits.
pair_indices = np.concatenate([np.asarray(sub_indices) for sub_indices in indices]).tolist()
all_indices = [i for i in range(len(df_all)) if i not in pair_indices]
# Hold out a random 10% of the remaining essays for testing.
test_indices = np.random.choice(all_indices, size=int(len(all_indices)/10), replace=False)
train_indices = np.setdiff1d(np.asarray(all_indices), test_indices)

print(len(df_all))
print(len(all_indices))
print(len(test_indices))
print(len(train_indices))

# The cleaned summaries do not depend on the essay, so clean them once.
summary_texts = [preprocess_text(summary) for summary in summaries]

for t in [train_indices, test_indices]:
  pair_dataset = []
  for i in t:
    text1 = preprocess_text(df_all["text1"][i])
    for j, text2 in enumerate(summary_texts):
      # Summary j stands for trait j, so the pair is positive exactly when the
      # essay carries trait j. The label is computed per pair; it used to be
      # reset only once per essay, so after the first positive trait every
      # later pair of that essay was labelled 1 as well.
      label = 1 if df_all["label"][i][j] == 1 else 0
      pair_dataset.append([text1, text2, label])

  pair_dataset = pd.DataFrame(pair_dataset, columns=['text1', 'text2', 'label'])
  datasets.append(pair_dataset)

2467
2442
244
2198


In [None]:
datasets[1]

Unnamed: 0,text1,text2,label
0,m exdcited. All classes seem interesting. I'm ...,I can't think or sleep or eat I don't know wha...,1
1,m exdcited. All classes seem interesting. I'm ...,I can't think or sleep or eat I don't know wha...,1
2,m exdcited. All classes seem interesting. I'm ...,I can't think or sleep or eat I don't know wha...,1
3,m exdcited. All classes seem interesting. I'm ...,I can't think or sleep or eat I don't know wha...,1
4,m exdcited. All classes seem interesting. I'm ...,I can't think or sleep or eat I don't know wha...,1
...,...,...,...
1215,I believe sometimes I think to much about what...,I can't think or sleep or eat I don't know wha...,1
1216,I believe sometimes I think to much about what...,I can't think or sleep or eat I don't know wha...,1
1217,I believe sometimes I think to much about what...,I can't think or sleep or eat I don't know wha...,1
1218,I believe sometimes I think to much about what...,I can't think or sleep or eat I don't know wha...,1


In [None]:
# With the original text for each label
# This means we have 5n pairs for each input

# Flatten the per-trait reference essay ids; these essays are kept out of
# both splits and serve as the second text of every pair.
reference_ids = np.concatenate([np.asarray(sub_indices) for sub_indices in indices])
pair_indices = reference_ids.tolist()
all_indices = [row for row in range(len(df_all)) if row not in pair_indices]
test_indices = np.random.choice(all_indices, size=int(len(all_indices)/10), replace=False)
train_indices = np.setdiff1d(np.asarray(all_indices), test_indices)

for n_rows in (len(df_all), len(all_indices), len(test_indices), len(train_indices)):
  print(n_rows)

datasets = []
for split in (train_indices, test_indices):
  pair_dataset = []
  for essay_id in split:
    essay_text = preprocess_text(df_all["text1"][essay_id])
    essay_label = df_all["label"][essay_id]
    for ref_id in reference_ids:
      ref_label = df_all["label"][ref_id]
      # Positive pair when the essay and the reference share at least one
      # trait that is set to 1 in both label vectors.
      shares_trait = any(
          own == 1 and own == ref_label[k] for k, own in enumerate(essay_label)
      )
      pair_dataset.append(
          [essay_text, preprocess_text(df_all["text1"][ref_id]), 1 if shares_trait else 0]
      )

  pair_dataset = pd.DataFrame(pair_dataset, columns=['text1', 'text2', 'label'])
  datasets.append(pair_dataset)

2467
2442
244
2198


## Dataloader & dataset

In [None]:
# NOTE(review): this cell does not match the load_essays_df_dict defined in
# this notebook, which accepts only `datafile` and returns a one-element list;
# the three-argument call and two-value unpacking below would fail against it.
# It presumably targets an earlier per-trait version of the helper - confirm,
# then update or remove the cell.
# It also rebinds `datasets` (the pair datasets built above) to an empty dict
# and overwrites `df_all`, which later cells still read.
datasets = {}
dfs = []
df_dicts = []
for pers in personality_list:
  print('creating %s dataset' %  pers)
  df_all, df_dict = load_essays_df_dict(datafile, pers, 30)
  dfs.append(df_all)
  df_dicts.append(df_dict)
  # map_dataset = SDataset(df_all, tokenizer, 512, DEVICE, 'normal', pers)
  # train_size = int(len(map_dataset) * 0.1)
  # val_size = len(map_dataset) - train_size
  # train_dataset, val_dataset = torch.utils.data.random_split(map_dataset,
  #                                                         [train_size, val_size])
  
  # datasets[pers] = (train_dataset, train_size, val_dataset, val_size)
  print('done!')

In [None]:
# Persist the current `datasets` object with joblib so the pair construction
# does not have to be repeated in a new session.
dump(datasets, 'data/essays/datasets')
# dump(df_dicts, 'data/essays/df_dicts')
# dump(datasets, 'data/essays/datasets')


['data/essays/datasets']

In [None]:
# Restore `datasets` from the joblib file written by the dump cell.
# joblib.load unpickles the file, so only load files you created yourself.
datasets =load('data/essays/datasets')
# df_dicts = load('data/essays/df_dicts')
# datasets = load('data/essays/datasets')

## SBert Model 

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Prefix for saved SBERT models. Later cells append the suffix directly
# (model_path_base + pers, model_path_base + 'all_pers'), so the models land
# in e.g. '../class_representation/models/sbertall_pers' - there is no path
# separator between 'sbert' and the suffix.
model_path_base = '../class_representation/models/sbert'

In [None]:
# Create dataloaders for the SBERT models
# for i in range(len(datasets)): 
#   for j in range(len(dfs[i])):
#     dfs[i][j] = dfs[i][j][['text1', 'text2', 'label']]
#     dfs[i][j] = dfs[i][j].values.tolist()
# Replace every pair DataFrame in `datasets` by a plain list of
# [text1, text2, label] rows, the format the SBERT cells iterate over.
# The replacement happens in place, so this cell expects DataFrames and must
# not be run twice on the same `datasets`.
pair_columns = ['text1', 'text2', 'label']
for i in range(len(datasets)):
    datasets[i] = datasets[i][pair_columns].values.tolist()

In [None]:
class SBert():
  """Thin wrapper around a SentenceTransformer that scores text similarity."""

  def __init__(self, model_path):
    """Load the SentenceTransformer identified by ``model_path``.

    Args:
      model_path: value passed unchanged to ``SentenceTransformer(...)``.
    """
    self.model = SentenceTransformer(model_path)

  def forward(self, text_1, text_2):
    """Encode both inputs and return their cosine similarities.

    Args:
      text_1: text(s) passed to ``self.model.encode`` as the first operand.
      text_2: text(s) passed to ``self.model.encode`` as the second operand.

    Returns:
      The tensor produced by ``util.cos_sim`` on the two embedding tensors.
    """
    embeddings_1 = self.model.encode(text_1, convert_to_tensor=True)
    embeddings_2 = self.model.encode(text_2, convert_to_tensor=True)

    cosine_scores = util.cos_sim(embeddings_1, embeddings_2)
    return cosine_scores


class SBertMultiLabel(SBert):
  """Multi-label counterpart of SBert.

  It used to be a verbatim copy of SBert; inheriting keeps the constructor and
  ``forward`` identical while leaving room to override them later.
  """


In [None]:
def train(train_x, train_y, model_path,
          base_model='distilbert-base-nli-mean-tokens',
          batch_size=128, epochs=1, warmup_steps=100):
  """Fine-tune a SentenceTransformer on text pairs and save it.

  The training settings that used to be hard-coded are now keyword arguments;
  their defaults are the previous values, so existing calls behave as before.

  Args:
    train_x: sequence of text pairs; each item becomes ``InputExample.texts``.
    train_y: sequence of pair labels, converted with ``float()``.
    model_path: location the fine-tuned model is saved to.
    base_model: name or path of the SentenceTransformer to start from.
    batch_size: batch size of the training DataLoader.
    epochs: number of epochs passed to ``model.fit``.
    warmup_steps: warm-up steps passed to ``model.fit``.

  Returns:
    None. The model is written to ``model_path``.
  """
  # Specific dataset for the sentence transformer
  train_xy = [
      InputExample(texts=pair, label=float(target))
      for pair, target in zip(train_x, train_y)
  ]

  # Define the model, either from scratch or by loading a pre-trained model
  model = SentenceTransformer(base_model)

  # Define the dataloader and the training loss
  train_dataloader = DataLoader(train_xy, shuffle=True, batch_size=batch_size)
  train_loss = losses.ContrastiveLoss(model)

  # Tune the model
  model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs, warmup_steps=warmup_steps)
  print('Training complete!')
  print('Saving the model to %s' % model_path)
  model.save(model_path)


def test(x, y, model_path):
  """Print the pair-level accuracy of a saved SBERT model.

  Each row of ``x`` is a (text1, text2) pair and ``y`` holds its label. The
  pair score is the cosine similarity of the two texts, rounded with
  ``torch.round`` and compared with the label; the fraction of matches is
  printed.

  Args:
    x: sequence of two-element text pairs.
    y: sequence of numeric labels, one per pair.
    model_path: model location passed to ``SBert``.
  """
  print('Testing...')
  encoder = SBert(model_path)

  pairs = np.asarray(x)
  labels = torch.FloatTensor(y).to(DEVICE)
  # forward() scores every first text against every second text; the diagonal
  # holds the score of each pair with its own partner.
  similarity = encoder.forward(pairs[:, 0], pairs[:, 1])
  scores = torch.diagonal(similarity)

  print('Testing results:')
  n_correct = torch.where(torch.round(scores) == labels)[0].shape[0]
  print(n_correct / labels.shape[0])


def test_similarity(x, pers_indices, model_path, threshold=.65, y_true=None):
  """Predict the five traits of each sample from its similarity to reference essays.

  For every sample and every trait, the sample text ``x[i][0]`` is compared
  with the trait's reference essays (rows ``pers_indices[k]`` of ``df_all``).
  The trait is predicted as present when the highest cosine similarity
  reaches ``threshold``.

  Fixes compared with the earlier version:
    * every sample is scored on its own text (``x[0][0]`` was used for all);
    * "total accuracy" is the share of samples whose five predictions all
      match (it used to divide matching elements by the number of samples,
      which can exceed 1);
    * ``sklearn.metrics`` is imported locally instead of relying on a later cell.

  Args:
    x: sequence of samples; element 0 of each sample is the text to classify.
    pers_indices: one list of ``df_all`` row positions per trait.
    model_path: model location passed to ``SBert``.
    threshold: minimum similarity for a positive prediction (default .65).
    y_true: optional sequence of five-element 0/1 label vectors aligned with
      ``x``. When omitted, labels are read from ``df_all.iloc[i]``.

  Returns:
    ``(pred, y)``: float tensors of shape (len(x), number of traits) on DEVICE.

  NOTE(review): the ``df_all.iloc[i]`` fallback takes the labels of the first
  ``len(x)`` rows of ``df_all``, not of the essays ``x`` was built from. Pass
  ``y_true`` to evaluate against the real labels.
  """
  import sklearn.metrics

  print('Testing...')
  sbert = SBert(model_path)
  x = np.asarray(x)

  # The reference essays of each trait are the same for every sample.
  reference_texts = [df_all.iloc[pers]['text1'].tolist() for pers in pers_indices]

  y = []
  pred = []
  # per data
  for i in tqdm(range(len(x))):
    y.append(df_all.iloc[i]['label'] if y_true is None else y_true[i])
    # one score per trait: the best match among the trait's reference essays
    scores = [torch.max(sbert.forward(x[i][0], texts)) for texts in reference_texts]
    pred.append([1 if float(score) >= threshold else 0 for score in scores])

  pred = torch.FloatTensor(pred).to(DEVICE)
  y = torch.FloatTensor(y).to(DEVICE)

  print('Testing results in total acccuracy:')
  print(torch.all(pred == y, dim=1).float().mean().item())
  print('Testing results in hamming loss:')
  print(sklearn.metrics.hamming_loss(y.to('cpu').numpy(), pred.to('cpu').numpy()))
  return pred, y


def test_similarity_2(x, summaries, model_path, threshold=.65, y_true=None):
  """Predict the five traits of each sample from its similarity to trait summaries.

  For every sample and every summary, the cosine similarity between the
  sample text ``x[i][0]`` and the summary is computed. The corresponding
  trait is predicted as present when the similarity reaches ``threshold``.

  Fixes compared with the earlier version:
    * every sample is scored on its own text (``x[0][0]`` was used for all);
    * "total accuracy" is the share of samples whose five predictions all
      match (it used to divide matching elements by the number of samples,
      which can exceed 1);
    * ``sklearn.metrics`` is imported locally instead of relying on a later cell.

  Args:
    x: sequence of samples; element 0 of each sample is the text to classify.
    summaries: one summary string per trait, in label order.
    model_path: model location passed to ``SBert``.
    threshold: minimum similarity for a positive prediction (default .65).
    y_true: optional sequence of five-element 0/1 label vectors aligned with
      ``x``. When omitted, labels are read from ``df_all.iloc[i]``.

  Returns:
    ``(pred, y)``: float tensors of shape (len(x), len(summaries)) on DEVICE.

  NOTE(review): the ``df_all.iloc[i]`` fallback takes the labels of the first
  ``len(x)`` rows of ``df_all``, not of the essays ``x`` was built from. Pass
  ``y_true`` to evaluate against the real labels.
  """
  import sklearn.metrics

  print('Testing...')
  sbert = SBert(model_path)
  x = np.asarray(x)

  y = []
  pred = []
  # per data
  for i in tqdm(range(len(x))):
    y.append(df_all.iloc[i]['label'] if y_true is None else y_true[i])
    # one similarity per trait summary (each is a single-element tensor)
    scores = [sbert.forward(x[i][0], summary) for summary in summaries]
    pred.append([1 if float(score) >= threshold else 0 for score in scores])

  pred = torch.FloatTensor(pred).to(DEVICE)
  y = torch.FloatTensor(y).to(DEVICE)

  print('Testing results in total acccuracy:')
  print(torch.all(pred == y, dim=1).float().mean().item())
  print('Testing results in hamming loss:')
  print(sklearn.metrics.hamming_loss(y.to('cpu').numpy(), pred.to('cpu').numpy()))
  return pred, y

In [None]:
# For seperate S-Bert models
# Train and evaluate one SBERT model per personality trait.
# NOTE(review): `dfs` is only assigned in the per-trait dataset cell above;
# this loop expects dfs[i] to hold a (train rows, test rows) pair of
# [text1, text2, label] lists - confirm that this is what that cell produces.
torch.cuda.empty_cache()
for i, pers in enumerate(personality_list):
  model_path = model_path_base + pers

  # Split every row into its text pair and its label.
  train_x = [data[0:2] for data in dfs[i][0]]
  train_y = [data[2] for data in dfs[i][0]]
  test_x = [data[0:2] for data in dfs[i][1]]
  test_y = [data[2] for data in dfs[i][1]]

  print(len(train_x))
  print(len(test_x))

  train(train_x, train_y, model_path)
  test(test_x, test_y, model_path)

In [None]:
!ls
torch.cuda.empty_cache()

# A single model for all traits.
model_path = model_path_base + 'all_pers'

# Training data: every [text1, text2, label] row of the train split.
train_x = [data[0:2] for data in datasets[0]]
train_y = [data[2] for data in datasets[0]]

# Test data: every fifth row of the test split.
test_rows = [datasets[1][i] for i in range(0, len(datasets[1]), 5)]
test_x = [data[0:2] for data in test_rows]
test_y = [data[2] for data in test_rows]

print(len(train_x))
print(len(test_x))


attention_masks.npy	LICENSE.md		     __pycache__
author_ids.npy		LM_extractor.py		     README.md
best_model_state.bin	logs			     requirements.txt
data			main.ipynb		     runs
end-to-end_training.py	model.py		     SHAP
explogs			pkl_data		     targets.npy
finetune_models		pretrained_models	     utils
imgs			PRIMERA_model
input_ids.npy		psycholinguistic_extractors
10990
244


In [None]:
import sklearn
# Fine-tune the all-traits model, then score the test texts against the five
# trait summaries. The (pred, y) tuple returned by the last call is displayed
# as the cell output.
train(train_x, train_y, model_path)
test_similarity_2(test_x, summaries, model_path)

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/86 [00:00<?, ?it/s]

Training complete!
Saving the model to ../class_representation/models/sbertall_pers
Testing...


100%|██████████| 244/244 [00:33<00:00,  7.35it/s]

Testing results in total acccuracy:
2.557377049180328
Testing results in hamming loss:
0.4885245901639344





(tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         ...,
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]], device='cuda:0'), tensor([[0., 1., 1., 0., 1.],
         [0., 0., 1., 0., 0.],
         [0., 1., 0., 1., 1.],
         ...,
         [0., 1., 0., 1., 1.],
         [0., 0., 1., 0., 1.],
         [0., 1., 1., 0., 1.]], device='cuda:0'))

In [None]:
# Re-evaluate the saved all-traits model without retraining: first against the
# reference essays of each trait, then against the trait summaries.
model_path = model_path_base + 'all_pers'
import sklearn
test_similarity(test_x, indices, model_path)
test_similarity_2(test_x, summaries, model_path)


Testing...


100%|██████████| 244/244 [00:37<00:00,  6.56it/s]


Testing results in total acccuracy:
2.557377049180328
Testing results in hamming loss:
0.4885245901639344
Testing...


100%|██████████| 244/244 [00:17<00:00, 13.84it/s]

Testing results in total acccuracy:
2.557377049180328
Testing results in hamming loss:
0.4885245901639344





(tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         ...,
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]], device='cuda:0'), tensor([[0., 1., 1., 0., 1.],
         [0., 0., 1., 0., 0.],
         [0., 1., 0., 1., 1.],
         ...,
         [0., 1., 0., 1., 1.],
         [0., 0., 1., 0., 1.],
         [0., 1., 1., 0., 1.]], device='cuda:0'))