# Train all traits' models

This notebook generalizes the process implemented in notebooks 7 & 9 for OPN and EXT. <br>
Those steps were written patch-over-patch; to scale well to other traits, they have to be rewritten.


## Read the dataset


In [1]:
import pandas as pd

file_path = "analysis/llm-dataset-generation/traits-definitions.xlsx"


def read_dataset(file_path, sheet_name) -> list[str]:
    """Return the first column of one worksheet as a plain Python list.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Name of the worksheet to load from that workbook.

    Returns:
        The values of the sheet's first column, in row order. ``pd.read_excel``
        is called with its defaults, so the sheet's first row is consumed as
        the header and is not part of the returned list.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    first_column = sheet.iloc[:, 0]
    return first_column.tolist()

## Filter unique texts

Embed the dataset and apply cosine similarity to filter out "too similar" texts


In [2]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import openpyxl
from torch.types import _TensorOrTensors

SIMILARITY_THRESHOLD = 0.95
MODEL = "intfloat/e5-large-v2"


def overwrite_sheet(file_path: str, sheet_name: str, texts: list[str]) -> None:
    """Replace a worksheet with one text per row in column A.

    Any existing sheet called ``sheet_name`` is removed first. A new sheet of
    that name is then created, filled starting at row 1 (no header row is
    written), and the workbook is saved back to ``file_path``.

    Args:
        file_path: Path of the workbook to load and save.
        sheet_name: Title of the sheet to (re)create.
        texts: Values to write, one per row, in order.
    """
    book = openpyxl.load_workbook(file_path)
    if sheet_name in book.sheetnames:
        stale_sheet = book[sheet_name]
        book.remove(stale_sheet)

    sheet = book.create_sheet(title=sheet_name)
    for row_number, text in enumerate(texts, start=1):
        sheet.cell(row=row_number, column=1, value=text)

    book.save(file_path)


def get_unique_paragraphs(
    texts: list[str], label: str, sheet_name: str
) -> tuple[list[tuple[_TensorOrTensors, str]], list[str]]:
    """Embed ``texts`` and keep only those with no near-duplicate in the set.

    A text is kept when its cosine similarity to every *other* text is below
    ``SIMILARITY_THRESHOLD``. The kept texts are also written to ``sheet_name``
    in the workbook at the module-level ``file_path`` (the sheet is replaced).

    NOTE(review): when two texts are near-duplicates of each other, BOTH are
    dropped -- no representative of the group is kept. If the intent is plain
    de-duplication, this discards more data than necessary; confirm.

    Args:
        texts: Paragraphs to filter.
        label: Class label attached to every kept embedding; also used in the
            printed summary.
        sheet_name: Worksheet that receives the kept paragraphs.

    Returns:
        ``(unique_embeddings, unique_paragraphs)`` where ``unique_embeddings``
        is a list of ``(embedding, label)`` tuples and ``unique_paragraphs`` is
        the list of kept texts, both in input order.
    """
    unique_paragraphs = []
    unique_embeddings = []

    # Skip the encoder entirely for empty input: there is nothing to compare,
    # and the (empty) result is still reported and written below.
    if texts:
        model = SentenceTransformer(MODEL)
        embeddings = model.encode(texts, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(embeddings, embeddings)
        # Compare the whole matrix against the threshold once and move the
        # result to plain Python booleans, instead of creating and comparing a
        # 0-d tensor for every (i, j) pair inside the loop.
        below_threshold = (similarities < SIMILARITY_THRESHOLD).tolist()

        for i in tqdm(range(len(texts))):
            # Ignore j == i: a text always matches itself.
            is_dissimilar = all(
                is_below
                for j, is_below in enumerate(below_threshold[i])
                if j != i
            )
            if is_dissimilar:
                unique_paragraphs.append(texts[i])
                unique_embeddings.append((embeddings[i], label))

    print(f"{label}: {len(unique_paragraphs)}/{len(texts)} Unique Paragraphs.")
    if not unique_paragraphs:
        print("No unique paragraphs found.")

    overwrite_sheet(
        file_path,
        sheet_name,
        unique_paragraphs,
    )
    return (unique_embeddings, unique_paragraphs)

In [3]:
# Class labels; each one also names its source sheet ("<label>-GPT3.5").
HIGH_EXT_LABEL = "HIGH_EXT"
LOW_EXT_LABEL = "LOW_EXT"
HIGH_OPN_LABEL = "HIGH_OPN"
LOW_OPN_LABEL = "LOW_OPN"
labels = [HIGH_EXT_LABEL, LOW_EXT_LABEL, HIGH_OPN_LABEL, LOW_OPN_LABEL]

# Per-label results of the similarity filter, keyed by label.
unique_vectors_with_labels = {}
unique_paragraphs = {}

for label in labels:
    source_sheet = f"{label}-GPT3.5"
    texts = read_dataset(file_path, source_sheet)
    label_vectors, label_paragraphs = get_unique_paragraphs(
        texts, label, f"{source_sheet}-filtered"
    )
    unique_vectors_with_labels[label] = label_vectors
    unique_paragraphs[label] = label_paragraphs

100%|██████████| 176/176 [00:00<00:00, 2214.06it/s]


HIGH_EXT: 167/176 Unique Paragraphs.


100%|██████████| 359/359 [00:00<00:00, 1185.08it/s]


LOW_EXT: 286/359 Unique Paragraphs.


100%|██████████| 149/149 [00:00<00:00, 2600.40it/s]


HIGH_OPN: 143/149 Unique Paragraphs.


100%|██████████| 215/215 [00:00<00:00, 2223.07it/s]


LOW_OPN: 129/215 Unique Paragraphs.


## Dataset Statistics


In [4]:
def _token_count_summary(paragraphs):
    """Describe the whitespace-separated token counts of ``paragraphs``."""
    frame = pd.DataFrame(paragraphs, columns=["Paragraph"])
    frame["Token Count"] = frame["Paragraph"].apply(lambda text: len(text.split()))
    return frame["Token Count"].describe()


# One describe() summary per label, stacked under the label as the outer index.
dfs = [_token_count_summary(unique_paragraphs[label]) for label in labels]

pd.concat(dfs, keys=labels)

HIGH_EXT  count    167.000000
          mean      80.922156
          std       34.531284
          min       38.000000
          25%       52.000000
          50%       77.000000
          75%       98.000000
          max      199.000000
LOW_EXT   count    286.000000
          mean      69.300699
          std       21.085801
          min       34.000000
          25%       53.000000
          50%       65.000000
          75%       82.000000
          max      142.000000
HIGH_OPN  count    143.000000
          mean      65.524476
          std       10.643938
          min       40.000000
          25%       59.500000
          50%       65.000000
          75%       72.000000
          max      105.000000
LOW_OPN   count    129.000000
          mean      60.302326
          std       12.095715
          min       39.000000
          25%       53.000000
          50%       59.000000
          75%       65.000000
          max      100.000000
Name: Token Count, dtype: float64

## Logistic Regression


### Train


In [5]:
from sklearn.linear_model import LogisticRegression
import pickle


def train_model(low_label, high_label):
    """Fit a logistic-regression classifier separating two labels.

    Training data comes from the module-level ``unique_vectors_with_labels``
    dict: the ``(embedding, label)`` pairs stored under ``low_label`` followed
    by those stored under ``high_label``.

    Args:
        low_label: Key of the first class in ``unique_vectors_with_labels``.
        high_label: Key of the second class in ``unique_vectors_with_labels``.

    Returns:
        The fitted ``LogisticRegression(random_state=0)`` estimator.

    Raises:
        KeyError: If either label has no entry in ``unique_vectors_with_labels``.
    """
    vectors_with_labels = (
        unique_vectors_with_labels[low_label] + unique_vectors_with_labels[high_label]
    )
    # scikit-learn converts its input through NumPy, which cannot read a tensor
    # that lives on an accelerator. Copy tensors to host arrays explicitly so
    # training also works when the embeddings were produced on a GPU. Anything
    # that is not a tensor is passed through untouched.
    train_vectors = [
        vector.detach().cpu().numpy() if hasattr(vector, "detach") else vector
        for vector, _ in vectors_with_labels
    ]
    train_labels = [pair_label for _, pair_label in vectors_with_labels]
    return LogisticRegression(random_state=0).fit(train_vectors, train_labels)


def persist_model(model, model_name):
    """Pickle ``model`` to ``models/step-10/<model_name>.pkl``.

    The target directory is created when it does not exist yet, so the first
    run on a fresh checkout does not fail with ``FileNotFoundError``.

    Args:
        model: Any picklable object (here: a fitted estimator).
        model_name: File stem of the pickle, without the ``.pkl`` extension.
    """
    # Imported locally so this function does not depend on a later cell's
    # `import os` having been executed.
    import os

    model_path = f"models/step-10/{model_name}.pkl"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

In [6]:
import os


def get_trained_trait_model(model_name, low_label, high_label):
    """Return the classifier for a trait, reusing a pickled copy when present.

    If ``models/step-10/<model_name>.pkl`` exists it is unpickled and returned
    as-is (the labels are not consulted). Otherwise a model is trained with
    ``train_model(low_label, high_label)``, saved via ``persist_model`` and
    returned.

    Args:
        model_name: File stem of the pickled model.
        low_label: Label passed to ``train_model`` on a cache miss.
        high_label: Label passed to ``train_model`` on a cache miss.
    """
    cached_path = f"models/step-10/{model_name}.pkl"

    if not os.path.exists(cached_path):
        fresh_model = train_model(low_label, high_label)
        persist_model(fresh_model, model_name)
        return fresh_model

    with open(cached_path, "rb") as handle:
        return pickle.load(handle)


# One classifier per trait, trained on the filtered GPT texts only. Each call
# reuses models/step-10/<name>.pkl when that file exists and trains + saves
# the model otherwise.
gpt_ext = get_trained_trait_model("gpt_ext", LOW_EXT_LABEL, HIGH_EXT_LABEL)
gpt_opn = get_trained_trait_model("gpt_opn", LOW_OPN_LABEL, HIGH_OPN_LABEL)

### Test


#### MyPersonality general processing


In [7]:
import pandas as pd
import pickle
import os


def process_my_personality_dataset(file_path, vectors_path):
    """Load a myPersonality CSV together with embeddings of its STATUS column.

    Embeddings are cached as a pickle at ``vectors_path``. When that file
    exists it is loaded and the sentence encoder is never instantiated; only on
    a cache miss is the encoder built, the ``STATUS`` column encoded, and the
    result pickled (creating the cache directory if needed).

    NOTE(review): the cache is not checked against the CSV. If the CSV changes
    after the vectors were cached, rows and vectors can go out of sync; delete
    the pickle to force re-encoding.

    Args:
        file_path: Path of the CSV file (read as ISO-8859-1).
        vectors_path: Path of the pickle holding / receiving the embeddings.

    Returns:
        ``(vectors, df)``: the embeddings and the DataFrame read from the CSV.
    """
    df = pd.read_csv(file_path, encoding="ISO-8859-1")

    if os.path.exists(vectors_path):
        # pickle can execute code on load: only point this at caches that were
        # written by this notebook.
        with open(vectors_path, "rb") as f:
            vectors = pickle.load(f)
    else:
        # Build the (expensive) encoder only when it is actually needed.
        model = SentenceTransformer("intfloat/e5-large-v2")
        vectors = model.encode(df["STATUS"], convert_to_tensor=True)

        cache_dir = os.path.dirname(vectors_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(vectors_path, "wb") as f:
            pickle.dump(vectors, f)

    return vectors, df


def evaluate_score_for_trait(model, vectors, df, trait_col, trait_value_map):
    """Print ``model.score`` for one trait column of a labelled dataset.

    The raw values in ``df[trait_col]`` are translated through
    ``trait_value_map`` into the labels the model was trained on. The mapping
    is applied to a local copy, so ``df`` is left untouched and the function
    can be called repeatedly on the same frame (mapping in place would turn
    the already-mapped column into NaN on a second call).

    Args:
        model: Fitted estimator exposing ``score(X, y)``.
        vectors: Feature vectors, one per row of ``df``.
        df: DataFrame holding the ground-truth column.
        trait_col: Name of the ground-truth column.
        trait_value_map: Mapping from raw column values to model labels.
    """
    true_labels = df[trait_col].map(trait_value_map)
    print("Model score for", trait_col, model.score(vectors, true_labels))

##### MyPersonality Concatenated


In [8]:
vectors, df = process_my_personality_dataset(
    file_path="./data/myPersonality-concatenated.csv",
    vectors_path="analysis/vectors/my_personality_concatenated.pkl",
)

# Score each GPT-trained classifier against its own ground-truth column,
# translating the dataset's y/n flags into the labels the model was trained on.
for trait_model, trait_col, value_map in (
    (gpt_ext, "cEXT", {"y": HIGH_EXT_LABEL, "n": LOW_EXT_LABEL}),
    (gpt_opn, "cOPN", {"y": HIGH_OPN_LABEL, "n": LOW_OPN_LABEL}),
):
    evaluate_score_for_trait(trait_model, vectors, df, trait_col, value_map)

Model score for cEXT 0.484
Model score for cOPN 0.464


##### MyPersonality


In [9]:
vectors, df = process_my_personality_dataset(
    file_path="./data/myPersonality.csv",
    vectors_path="analysis/vectors/my_personality.pkl",
)

# Score each GPT-trained classifier against its own ground-truth column,
# translating the dataset's y/n flags into the labels the model was trained on.
for trait_model, trait_col, value_map in (
    (gpt_ext, "cEXT", {"y": HIGH_EXT_LABEL, "n": LOW_EXT_LABEL}),
    (gpt_opn, "cOPN", {"y": HIGH_OPN_LABEL, "n": LOW_OPN_LABEL}),
):
    evaluate_score_for_trait(trait_model, vectors, df, trait_col, value_map)

Model score for cEXT 0.5005546032066149
Model score for cOPN 0.5387718059897146


#### Bard's unseen dataset


In [10]:
import pandas as pd


def load_and_encode_bard_dataset(file_path, low_label, high_label):
    """Read the two ``<label>-Bard`` sheets of a trait and embed their texts.

    Args:
        file_path: Path of the Excel workbook holding the Bard sheets.
        low_label: Label of the low class; its sheet is ``"<low_label>-Bard"``.
        high_label: Label of the high class; its sheet is ``"<high_label>-Bard"``.

    Returns:
        ``(embeddings, labels)``: the encoder output for the low texts followed
        by the high texts, and the matching list of labels in the same order.
    """

    def first_column_of(label):
        # Texts live in the first column of the "<label>-Bard" sheet.
        sheet = pd.read_excel(file_path, sheet_name=f"{label}-Bard")
        return sheet.iloc[:, 0].tolist()

    high_texts = first_column_of(high_label)
    low_texts = first_column_of(low_label)

    # Low-class rows come first, in both the texts and the labels.
    labels = [low_label] * len(low_texts) + [high_label] * len(high_texts)

    encoder = SentenceTransformer("intfloat/e5-large-v2")
    embeddings = encoder.encode(low_texts + high_texts, convert_to_tensor=True)
    return embeddings, labels

In [12]:
# Embed the Bard-generated texts of each trait. `file_path` is the workbook
# path defined in the first code cell; the sheets read are "<label>-Bard".
bard_ext_embeddings, bard_ext_labels = load_and_encode_bard_dataset(
    file_path, LOW_EXT_LABEL, HIGH_EXT_LABEL
)

bard_opn_embeddings, bard_opn_labels = load_and_encode_bard_dataset(
    file_path, LOW_OPN_LABEL, HIGH_OPN_LABEL
)

In [13]:
# Evaluate the GPT-only classifiers on the Bard texts. `score` is the
# estimator's own metric (mean accuracy for a scikit-learn classifier).
gpt_ext_on_bard_score = gpt_ext.score(bard_ext_embeddings, bard_ext_labels)
print("GPT_EXT score on Bard's unseen test set:", gpt_ext_on_bard_score)

gpt_opn_on_bard_score = gpt_opn.score(bard_opn_embeddings, bard_opn_labels)
print("GPT_OPN score on Bard's unseen test set:", gpt_opn_on_bard_score)

GPT_EXT score on Bard's unseen test set: 0.963302752293578
GPT_OPN score on Bard's unseen test set: 0.8723404255319149


### GPT & MyPersonality Model

Train a logistic regression model on GPT-Generated data combined with myPersonality data


In [14]:
def train_my_personality_with_gpt(trait_column, low_label, high_label):
    """Return a classifier trained on GPT texts plus myPersonality statuses.

    A pickled model at ``models/step-10/my_personality_gpt_<trait_column>.pkl``
    is reused when present. Otherwise the training set is assembled from

    * the filtered GPT embeddings stored in the module-level
      ``unique_vectors_with_labels`` under ``high_label`` and ``low_label``, and
    * the myPersonality status embeddings, labelled by mapping
      ``trait_column`` ("y" -> ``high_label``, "n" -> ``low_label``),

    a ``LogisticRegression(random_state=0)`` is fitted on it, saved with
    ``persist_model`` and returned.

    Args:
        trait_column: myPersonality ground-truth column, e.g. ``"cEXT"``.
        low_label: Label used for the "n" rows and for the low GPT class.
        high_label: Label used for the "y" rows and for the high GPT class.
    """
    model_name = f"my_personality_gpt_{trait_column}"
    model_path = f"models/step-10/{model_name}.pkl"
    if os.path.exists(model_path):
        with open(model_path, "rb") as f:
            return pickle.load(f)

    myPersonality_vecs, df = process_my_personality_dataset(
        file_path="./data/myPersonality.csv",
        vectors_path="analysis/vectors/my_personality.pkl",
    )
    my_personality_labels = df[trait_column].map({"y": high_label, "n": low_label})
    my_personality_vectors_with_labels = list(
        zip(myPersonality_vecs.tolist(), my_personality_labels)
    )

    vectors_with_labels = (
        unique_vectors_with_labels[high_label]
        + unique_vectors_with_labels[low_label]
        + my_personality_vectors_with_labels
    )

    # The GPT entries are tensors while the myPersonality entries are plain
    # lists. scikit-learn converts its input through NumPy, which cannot read
    # a tensor that lives on an accelerator, so copy tensors to host arrays
    # first; non-tensor rows are passed through unchanged.
    train_vectors = [
        vector.detach().cpu().numpy() if hasattr(vector, "detach") else vector
        for vector, _ in vectors_with_labels
    ]
    train_labels = [pair_label for _, pair_label in vectors_with_labels]

    model = LogisticRegression(random_state=0).fit(train_vectors, train_labels)
    persist_model(model, model_name)
    return model

In [15]:
# Classifiers trained on GPT texts combined with myPersonality statuses, one
# per trait; each call reuses its pickle under models/step-10/ when present.
gpt_and_my_personality_ext_model = train_my_personality_with_gpt(
    "cEXT", LOW_EXT_LABEL, HIGH_EXT_LABEL
)

gpt_and_my_personality_opn_model = train_my_personality_with_gpt(
    "cOPN", LOW_OPN_LABEL, HIGH_OPN_LABEL
)

In [16]:
# Score the combined (GPT + myPersonality) classifiers on the same Bard
# embeddings used for the GPT-only models, so the two results are comparable.
print(
    "GPT_EXT + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_ext_model.score(bard_ext_embeddings, bard_ext_labels),
)
print(
    "GPT_OPN + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_opn_model.score(bard_opn_embeddings, bard_opn_labels),
)

GPT_EXT + MyPersonality score on Bard's unseen test set: 0.8486238532110092
GPT_OPN + MyPersonality score on Bard's unseen test set: 0.526595744680851
