
# Preparation



Installing and importing libraries

In [None]:
# install correct version of tensorflow
!pip install tensorflow==2.1
# install libraries if needed
!pip install sentence_transformers

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sentence_transformers import SentenceTransformer

Set dataset paths

In [None]:
# Dataset selection and file locations.
# `dataset` is the lowercase dataset key; its uppercase form is the name of the
# sub-directory (relative to `dataset_path`) that holds the dataset's files.
dataset = 'big5'
dataset_path = '../embeddings/'
# Input: CSV with the item-response matrix, e.g. "BIG5/big5_responses.csv".
responses_file_name = "{}/{}_responses.csv".format(dataset.upper(), dataset)
# Output: CSV that will receive the sentence-BERT embeddings of the questions.
output_embeddings_file_name = "{}/{}_questions_embeddings_SENTENCEBERT.csv".format(dataset.upper(), dataset)

Load the item texts (the output format should be the same for both the Big5 and the IPIP datasets).

In [None]:
# Load the item-response matrix; the first CSV row supplies the column names.
matrix = pd.read_csv(dataset_path + responses_file_name, header=0)
# Show the last five rows as a quick check that the file was parsed as expected.
print(matrix.tail())

# preprocess data
# ATTENTION: check if item has "I" or not! If not, manually add "I"
# Lowercase every item text (note that this also lowercases a leading "I").
matrix['item'] = [text.lower() for text in matrix['item']]

# extract data
# Identifier of each question, kept to label the embeddings later on.
question_ids = matrix['question-id']
# Drop the id column, index the rows by item text and transpose, so that each
# item becomes a column and each remaining CSV column becomes a row
# (presumably one row per respondent -- TODO confirm against the data file).
individual_ys = matrix.drop(columns=['question-id']).set_index('item').T
# Select the item texts by column name rather than by position, so a different
# column order in the CSV cannot silently feed the wrong column to the encoder.
items = matrix['item'].tolist()
print(items)
print(len(items))

In [None]:
# Display the (lowercased) item texts as the cell output for a visual check.
items

# Extract embeddings

Extracting sentence-BERT embeddings.

In [None]:
## sentence-BERT
# Instantiate the sentence-BERT encoder from the named pretrained checkpoint.
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
# Encode all item texts in one call; `vec` is the alias used when saving the
# embeddings, `sentence_embeddings` refers to the very same object.
vec = sentence_embeddings = model.encode(items)
# Print the shape of the result (presumably (number of items, embedding size)
# -- TODO confirm for the installed sentence-transformers version).
print(vec.shape)

# Save to files

In [None]:
# Report the shape of the embeddings that are about to be written.
print(vec.shape)

# Wrap the embeddings in a DataFrame and prepend the question ids as the first
# column; the remaining columns are labelled 0 .. vec.shape[1] - 1.
embeddings_df = pd.DataFrame(vec)
embeddings_df.insert(loc=0, column='question-id', value=question_ids)
embeddings_df.columns = ['question-id', *range(vec.shape[1])]

# Full path of the output CSV (dataset directory + embeddings file name).
embeddings_csv_path = dataset_path + output_embeddings_file_name
print("saving to file: " + embeddings_csv_path)
print(embeddings_df.head())
# Write a header row but no index column.
embeddings_df.to_csv(embeddings_csv_path, index=False, header=True)