# SentenceTransformer example taken from [original docs](https://www.sbert.net/examples/applications/computing-embeddings/README.html)


In [1]:
from sentence_transformers import SentenceTransformer

# Example inputs: encode() takes the whole list in a single call.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# Load the model registered under the name "all-MiniLM-L6-v2" and embed
# every sentence; `embeddings` holds one result per input sentence.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(sentences)

  from .autonotebook import tqdm as notebook_tqdm


# Extract texts & labels and generate doc-embedding per text

Returns a list of `(doc-embedding-vector, list-of-true-labels-by-attribute)` tuples, one per text.


In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd


def get_bool_labels(
    df: pd.DataFrame, attr_names_list: list[str], current_row_index: int
) -> list[bool]:
    """
    Convert one row's attribute markers into booleans.

    Args:
        df: Frame containing one column per name in ``attr_names_list``.
        attr_names_list: Column names to read; the result follows this order.
        current_row_index: Zero-based *position* of the row in ``df``
            (not its index label).

    Returns:
        One bool per attribute: True where the cell equals "y", False otherwise.

    Raises:
        KeyError: If a name in ``attr_names_list`` is not a column of ``df``.
        IndexError: If ``current_row_index`` is outside the frame.
    """
    # .iloc is strictly positional, so the lookup stays correct when df carries
    # a non-default index (filtered, shuffled, or re-indexed frames). Plain
    # df[col][i] would resolve i as an index *label* instead.
    return [
        bool(df[attr_name].iloc[current_row_index] == "y")
        for attr_name in attr_names_list
    ]


def generate_embeddings_and_labels(
    file_path: str,
    encoding="ISO-8859-1",
    content_column_name="TEXT",
    attr_names_list=["cEXT", "cNEU", "cAGR", "cCON", "cOPN"],
) -> list[tuple[list[float], list[bool]]]:
    """
    Embed every text in a CSV file and pair each embedding with its true labels.

    1. Read the CSV at ``file_path`` and generate an embedding for every text
       in ``content_column_name``.
    2. Extract the true labels of every text and convert them to booleans
       (True where the cell equals "y", False otherwise).
    3. Return a list of ``(embedding, labels)`` tuples, so every item is the
       text's vector and its true labels in ``attr_names_list`` order.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding passed to ``pd.read_csv``.
        content_column_name: Name of the column that holds the texts.
        attr_names_list: Names of the label columns; never modified here.

    Returns:
        One ``(embedding, labels)`` tuple per CSV row, in file order. Each
        embedding is one row of the encoder's output (a NumPy array in this
        notebook's printed sample, despite the ``list[float]`` annotation).

    Raises:
        KeyError: If the content column or a label column is missing.
    """
    df = pd.read_csv(file_path, encoding=encoding)
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Hand encode() a plain list of texts rather than a pandas Series, so the
    # encoder only ever sees a built-in sequence and the frame's index
    # cannot affect how items are looked up.
    texts = df[content_column_name].tolist()
    embeddings = model.encode(texts)

    # One vectorised comparison replaces a Python-level lookup per cell.
    # list(...) guarantees pandas sees a list of column names (a tuple would
    # be treated as a single key).
    label_rows = df[list(attr_names_list)].eq("y").to_numpy().tolist()

    return list(zip(embeddings, label_rows))


# Location of the essays dataset and the five label columns to extract from it.
essays_filepath = "../data/essays.csv"
essays_attrs_names = ["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]

# Build the (embedding, labels) pairs for every essay in the file.
embeddings_with_labels = generate_embeddings_and_labels(
    file_path=essays_filepath,
    attr_names_list=essays_attrs_names,
)

# Explicit escapes make the near-empty second line and the six-space indent
# of the message visible in the source.
print(
    "Here's a sample of the embeddings_with_labels data (1 item).\n \n"
    "      As you can see, the item is a tuple of doc-embedding-vector and a list of the true labels of the Big Five traits:"
)
print(embeddings_with_labels[0])

Here's a sample of the embeddings_with_labels data (1 item).
 
      As you can see, the item is a tuple of doc-embedding-vector and a list of the true labels of the Big Five traits:
(array([ 1.18929811e-03, -3.41515988e-02,  1.45341754e-02,  7.72646861e-03,
        1.16681702e-01,  5.39377555e-02,  6.08672351e-02,  2.26857010e-02,
       -4.65348288e-02, -1.89967435e-02, -3.88598652e-03,  8.06692094e-02,
        1.47118038e-02,  3.55075449e-02, -3.67502160e-02, -1.04212342e-02,
        1.79847740e-02,  1.83973350e-02, -4.96940054e-02, -2.24592742e-02,
        4.12261672e-02, -5.77117503e-02, -2.36110520e-02,  6.25444725e-02,
        3.60451266e-02,  1.63510010e-01, -5.74698625e-03, -9.83862281e-02,
       -4.59853075e-02, -5.94336428e-02,  4.70443964e-02,  2.40970794e-02,
       -3.37781757e-02, -3.90312672e-02,  4.54586372e-03,  1.72685795e-02,
       -3.23326811e-02,  8.55467767e-02,  7.84962699e-02, -2.98641436e-02,
       -4.21924720e-04,  1.85214765e-02,  3.63461711e-02,  5.97724

# Split into train/test sets (0.8/0.2) and train 5 logistic regression models (one for each of the Big Five traits)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Seed the shuffle so the 80/20 split, and every score computed from it, is
# identical on each Restart & Run All. The seed matches the one given to
# LogisticRegression below.
train_items, test_data = train_test_split(
    embeddings_with_labels, test_size=0.2, random_state=0
)

# Separate the (vector, labels) pairs into a feature matrix and a label matrix
# (one row per training text, one label column per attribute).
train_labels = np.array([labels for _vector, labels in train_items])
train_data = np.array([vector for vector, _labels in train_items])

# clfs[i] is the classifier for essays_attrs_names[i].
clfs = []

for i in range(len(essays_attrs_names)):
    # Column i holds the true labels of the i-th attribute for every training text.
    single_attr_true_labels = train_labels[:, i]
    clfs.append(
        LogisticRegression(random_state=0).fit(train_data, single_attr_true_labels)
    )

# Evaluate against the test set

### As we can see, the per-trait logistic regression models achieve accuracy scores that are not too far from those of the baseline tools we've evaluated


In [4]:
# Separate the held-out (vector, labels) pairs into a feature matrix and a
# label matrix, mirroring how the training data was prepared.
_test_data = np.array([vector for vector, _labels in test_data])
_test_labels = np.array([labels for _vector, labels in test_data])

# Report each classifier's accuracy on the held-out texts, one line per attribute.
for trait_idx, trait_name in enumerate(essays_attrs_names):
    trait_true_labels = [row_labels[trait_idx] for row_labels in _test_labels]
    trait_accuracy = clfs[trait_idx].score(_test_data, trait_true_labels)
    print(trait_name, trait_accuracy)

cEXT 0.5566801619433198
cNEU 0.5991902834008097
cAGR 0.5506072874493927
cCON 0.5141700404858299
cOPN 0.6234817813765182
