# Preparation

Installing and importing libraries

In [3]:
# Optional one-time setup: uncomment the `!pip` lines below to install the dependencies.
# # install the pinned version of tensorflow
# !pip install tensorflow==2.1
# # install sentence_transformers if it is not already available
# !pip install sentence_transformers

In [13]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sentence_transformers import SentenceTransformer

Set dataset paths

In [28]:
# Dataset configuration: which dataset / questionnaire to process and the
# root folder that the file names below are relative to.
dataset = 'seven'
questionnaire = "rbq"
dataset_path = '../embeddings/'

# Both files sit under <dataset>/<QUESTIONNAIRE>/ and are prefixed with the
# lower-cased questionnaire name.
responses_file_name = "/".join(
    (dataset, questionnaire.upper(), questionnaire.lower() + "_responses_nonReversed.csv")
)
output_embeddings_file_name = "/".join(
    (dataset, questionnaire.upper(), questionnaire.lower() + "_questions_embeddings_SENTENCEBERT.csv")
)

Loading the item text (the output format should be consistent for both the Big5 dataset and the IPIP dataset)

In [29]:
# Load the item-response matrix. The CSV is expected to contain a
# 'question_id' column and an 'item' (item text) column; the remaining
# columns presumably hold the responses -- verify against the data file.
matrix = pd.read_csv(dataset_path + responses_file_name, header=0)

# Fail early, with a clear message, if the columns used below are missing.
missing_columns = {'question_id', 'item'} - set(matrix.columns)
if missing_columns:
    raise KeyError(
        "responses file is missing required column(s): " + ", ".join(sorted(missing_columns))
    )
print(matrix[-5:])

# preprocess data
# ATTENTION: check if item has "I" or not! If not, manually add "I"
matrix['item'] = [item.lower() for item in matrix['item']]

# extract data
question_ids = matrix['question_id']
# Transposed view: one row per response column, one column per item text.
individual_ys = matrix.drop(['question_id'], axis=1).set_index(["item"]).T
# Select the item text by column name rather than by position, so the texts
# that get embedded are always the ones lower-cased above, regardless of the
# column order in the CSV.
items = matrix['item'].tolist()
print(items)
print(len(items))

   question_id                                               item    0    1  \
63      rbq064           Concentrates on or works hard at a task.  8.0  3.0   
64      rbq065  Engages in physical activity. (e.g., works up ...  1.0  3.0   
65      rbq066  Acts in a self-indulgent manner. (e.g., spendi...  6.0  2.0   
66      rbq067  Exhibits physical discomfort or pain. (High pl...  1.0  1.0   
67      rbq068  Behaves in a stereotypically feminine style or...  2.0  1.0   

      2    3    4    5    6    7  ...  5966  5967  5968  5969  5970  5971  \
63  2.0  7.0  6.0  6.0  3.0  3.0  ...   5.0   9.0   9.0   4.0   9.0   5.0   
64  2.0  9.0  1.0  9.0  3.0  1.0  ...   2.0   2.0   9.0   3.0   2.0   2.0   
65  5.0  4.0  5.0  5.0  5.0  1.0  ...   6.0   4.0   8.0   8.0   1.0   9.0   
66  5.0  4.0  1.0  2.0  4.0  5.0  ...   6.0   7.0   2.0   4.0   2.0   1.0   
67  1.0  8.0  3.0  6.0  5.0  2.0  ...   5.0   5.0   2.0   3.0   1.0   1.0   

    5972  5973  5974  5975  
63   4.0   5.0   1.0   8.0  
64  

# Extract embeddings

Extracting sentence-BERT embeddings.

In [30]:
## sentence-BERT
# Alternative models, kept for reference (swap in by replacing the name below):
#   SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
#   SentenceTransformer('all-MiniLM-L12-v2')
model = SentenceTransformer('all-mpnet-base-v2')

# Encode every item text; `vec` is an alias of the same embeddings array and
# is the name the saving step reads.
vec = sentence_embeddings = model.encode(items)
print(vec.shape)

(68, 768)


# Save to files

In [31]:
# Write the embeddings to CSV: a leading 'question_id' column followed by one
# column per embedding dimension (labelled 0 .. vec.shape[1] - 1).
print(vec.shape)
embeddings_df = pd.DataFrame(vec)

# Insert the ids positionally (as a plain list) instead of as a Series:
# a Series would be aligned on its index labels, which silently yields NaN or
# misplaced ids if that index is ever not 0..n-1. With a list, a length
# mismatch between ids and embeddings raises a ValueError instead.
# After the insert the columns are already ['question_id', 0, 1, ...], so no
# separate column relabelling is needed.
embeddings_df.insert(0, 'question_id', list(question_ids))

output_file = dataset_path + output_embeddings_file_name
print("saving to file: " + output_file)
print(embeddings_df.head())
embeddings_df.to_csv(output_file, index=False, header=True)

(68, 768)
saving to file: ../embeddings/seven/RBQ/rbq_questions_embeddings_SENTENCEBERT.csv
  question_id         0         1         2         3         4         5  \
0      rbq001  0.044745  0.001965 -0.016740  0.034363 -0.089471 -0.012975   
1      rbq002  0.049429  0.072673  0.003481 -0.004440  0.022819  0.004320   
2      rbq003 -0.004291  0.089111  0.013607 -0.007228 -0.105615  0.006196   
3      rbq004  0.044867 -0.038076 -0.006002 -0.008826 -0.073893  0.001470   
4      rbq005  0.003389 -0.064028 -0.009089  0.009025 -0.040007 -0.016604   

          6         7         8  ...       758       759       760       761  \
0  0.043938 -0.048950 -0.006987  ...  0.082882 -0.057834  0.020197  0.022347   
1 -0.007476 -0.026199  0.015136  ...  0.020008 -0.010176 -0.057272  0.007057   
2  0.016123  0.017257 -0.004818  ...  0.018837  0.059451  0.009165  0.018991   
3 -0.022699  0.001197  0.010874  ... -0.032847  0.025618 -0.048261  0.047556   
4 -0.053681 -0.043588  0.002558  ... -0.05821