# Train all traits' models

This notebook generalizes the processes done in notebooks 7 & 9 for OPN and EXT <br>
Those steps were written in a patch-over-patch and to scale well to other traits, has to be re-written.


## Read the dataset


In [1]:
import pandas as pd

file_path = "analysis/llm-dataset-generation/traits-definitions.xlsx"


def read_dataset(file_path, sheet_name) -> list[str]:
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    return df.iloc[:, 0].tolist()

## Filter unique texts

Embed the dataset and apply cosine similarity to filter out "too similar" texts


In [2]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import openpyxl
from torch.types import _TensorOrTensors

SIMILARITY_THRESHOLD = 0.95
MODEL = "intfloat/e5-large-v2"


def overwrite_sheet(file_path: str, sheet_name: str, texts: list[str]) -> None:
    workbook = openpyxl.load_workbook(file_path)
    if sheet_name in workbook.sheetnames:
        workbook.remove(workbook[sheet_name])
    new_sheet = workbook.create_sheet(title=sheet_name)
    for i, paragraph in enumerate(texts):
        new_sheet.cell(row=i + 1, column=1, value=paragraph)
    workbook.save(file_path)


def get_unique_paragraphs(
    texts: list[str], label: str, sheet_name: str
) -> tuple[list[_TensorOrTensors], list[str]]:
    model = SentenceTransformer(MODEL)
    embeddings = model.encode(texts, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(embeddings, embeddings)
    unique_paragraphs = []
    unique_embeddings = []
    for i in tqdm(range(len(texts))):
        is_dissimilar = all(
            similarity < SIMILARITY_THRESHOLD
            for j, similarity in enumerate(similarities[i])
            if j != i
        )
        if is_dissimilar:
            unique_paragraphs.append(texts[i])
            unique_embeddings.append((embeddings[i], label))

    print(f"{label}: {len(unique_paragraphs)}/{len(texts)} Unique Paragraphs.")
    if not unique_paragraphs:
        print("No unique paragraphs found.")

    overwrite_sheet(
        file_path,
        sheet_name,
        unique_paragraphs,
    )
    return (unique_embeddings, unique_paragraphs)

In [3]:
HIGH_EXT_LABEL = "HIGH_EXT"
LOW_EXT_LABEL = "LOW_EXT"
HIGH_OPN_LABEL = "HIGH_OPN"
LOW_OPN_LABEL = "LOW_OPN"
HIGH_NEU_LABEL = "HIGH_NEU"
LOW_NEU_LABEL = "LOW_NEU"
HIGH_AGR_LABEL = "HIGH_AGR"
LOW_AGR_LABEL = "LOW_AGR"
HIGH_CON_LABEL = "HIGH_CON"
LOW_CON_LABEL = "LOW_CON"
labels = [
    HIGH_EXT_LABEL,
    LOW_EXT_LABEL,
    HIGH_OPN_LABEL,
    LOW_OPN_LABEL,
    HIGH_NEU_LABEL,
    LOW_NEU_LABEL,
    HIGH_AGR_LABEL,
    LOW_AGR_LABEL,
    HIGH_CON_LABEL,
    LOW_CON_LABEL,
]

unique_vectors_with_labels = {}
unique_paragraphs = {}

for label in labels:
    texts = read_dataset(file_path, f"{label}-GPT3.5")
    unique_vectors_with_labels[label], unique_paragraphs[label] = get_unique_paragraphs(
        texts, label, f"{label}-GPT3.5-filtered"
    )

100%|██████████| 176/176 [00:00<00:00, 2198.82it/s]


HIGH_EXT: 167/176 Unique Paragraphs.


100%|██████████| 359/359 [00:00<00:00, 1202.64it/s]


LOW_EXT: 286/359 Unique Paragraphs.


100%|██████████| 149/149 [00:00<00:00, 2587.49it/s]


HIGH_OPN: 143/149 Unique Paragraphs.


100%|██████████| 215/215 [00:00<00:00, 2309.54it/s]


LOW_OPN: 129/215 Unique Paragraphs.


100%|██████████| 192/192 [00:00<00:00, 2153.72it/s]


HIGH_NEU: 173/192 Unique Paragraphs.


100%|██████████| 187/187 [00:00<00:00, 2096.82it/s]


LOW_NEU: 181/187 Unique Paragraphs.


100%|██████████| 179/179 [00:00<00:00, 2196.99it/s]


HIGH_AGR: 157/179 Unique Paragraphs.


100%|██████████| 170/170 [00:00<00:00, 2235.20it/s]


LOW_AGR: 164/170 Unique Paragraphs.


100%|██████████| 179/179 [00:00<00:00, 2334.40it/s]


HIGH_CON: 157/179 Unique Paragraphs.


100%|██████████| 159/159 [00:00<00:00, 2305.02it/s]


LOW_CON: 149/159 Unique Paragraphs.


## Dataset Statistics


In [4]:
dfs = []
for label in labels:
    df = pd.DataFrame(unique_paragraphs[label], columns=["Paragraph"])
    df["Token Count"] = df["Paragraph"].apply(lambda x: len(x.split()))
    dfs.append(df["Token Count"].describe())

pd.concat(dfs, keys=labels)

HIGH_EXT  count    167.000000
          mean      80.922156
          std       34.531284
          min       38.000000
          25%       52.000000
                      ...    
LOW_CON   min       32.000000
          25%       51.000000
          50%       61.000000
          75%       73.000000
          max      123.000000
Name: Token Count, Length: 80, dtype: float64

## Logistic Regression


### Train


In [5]:
from sklearn.linear_model import LogisticRegression
import pickle


def train_model(low_label, high_label):
    vectors_with_labels = (
        unique_vectors_with_labels[low_label] + unique_vectors_with_labels[high_label]
    )
    train_vectors = [t[0] for t in vectors_with_labels]
    train_labels = [t[1] for t in vectors_with_labels]
    return LogisticRegression(random_state=0).fit(train_vectors, train_labels)


def persist_model(model, model_name):
    with open(f"models/step-10/{model_name}.pkl", "wb") as f:
        pickle.dump(model, f)

In [6]:
import os


def get_trained_trait_model(model_name, low_label, high_label):
    model_path = f"models/step-10/{model_name}.pkl"
    if os.path.exists(model_path):
        with open(model_path, "rb") as f:
            model = pickle.load(f)
    else:
        model = train_model(low_label, high_label)
        persist_model(model, model_name)
    return model


gpt_ext = get_trained_trait_model("gpt_ext", LOW_EXT_LABEL, HIGH_EXT_LABEL)
gpt_opn = get_trained_trait_model("gpt_opn", LOW_OPN_LABEL, HIGH_OPN_LABEL)
gpt_neu = get_trained_trait_model("gpt_neu", LOW_NEU_LABEL, HIGH_NEU_LABEL)
gpt_agr = get_trained_trait_model("gpt_agr", LOW_AGR_LABEL, HIGH_AGR_LABEL)
gpt_con = get_trained_trait_model("gpt_con", LOW_CON_LABEL, HIGH_CON_LABEL)

### Test


#### MyPersonality general processing


In [7]:
import pandas as pd
import pickle
import os


def process_my_personality_dataset(file_path, vectors_path):
    MODEL = "intfloat/e5-large-v2"
    model = SentenceTransformer(MODEL)
    df = pd.read_csv(file_path, encoding="ISO-8859-1")

    if os.path.exists(vectors_path):
        with open(vectors_path, "rb") as f:
            vectors = pickle.load(f)
    else:
        vectors = model.encode(df["STATUS"], convert_to_tensor=True)
        with open(vectors_path, "wb") as f:
            pickle.dump(vectors, f)

    return vectors, df


def evaluate_score_for_trait(model, vectors, df, trait_col, trait_value_map):
    df[trait_col] = df[trait_col].map(trait_value_map)
    print("Model score for", trait_col, model.score(vectors, df[trait_col]))

##### MyPersonality Concatenated


In [8]:
vectors, df = process_my_personality_dataset(
    file_path="./data/myPersonality-concatenated.csv",
    vectors_path="analysis/vectors/my_personality_concatenated.pkl",
)

evaluate_score_for_trait(
    gpt_ext, vectors, df, "cEXT", {"y": HIGH_EXT_LABEL, "n": LOW_EXT_LABEL}
)
evaluate_score_for_trait(
    gpt_opn, vectors, df, "cOPN", {"y": HIGH_OPN_LABEL, "n": LOW_OPN_LABEL}
)
evaluate_score_for_trait(
    gpt_neu, vectors, df, "cNEU", {"y": HIGH_NEU_LABEL, "n": LOW_NEU_LABEL}
)
evaluate_score_for_trait(
    gpt_agr, vectors, df, "cAGR", {"y": HIGH_AGR_LABEL, "n": LOW_AGR_LABEL}
)
evaluate_score_for_trait(
    gpt_con, vectors, df, "cCON", {"y": HIGH_CON_LABEL, "n": LOW_CON_LABEL}
)

Model score for cEXT 0.484
Model score for cOPN 0.464
Model score for cNEU 0.536
Model score for cAGR 0.456
Model score for cCON 0.496


##### MyPersonality


In [9]:
vectors, df = process_my_personality_dataset(
    file_path="./data/myPersonality.csv",
    vectors_path="analysis/vectors/my_personality.pkl",
)

evaluate_score_for_trait(
    gpt_ext, vectors, df, "cEXT", {"y": HIGH_EXT_LABEL, "n": LOW_EXT_LABEL}
)
evaluate_score_for_trait(
    gpt_opn, vectors, df, "cOPN", {"y": HIGH_OPN_LABEL, "n": LOW_OPN_LABEL}
)
evaluate_score_for_trait(
    gpt_neu, vectors, df, "cNEU", {"y": HIGH_NEU_LABEL, "n": LOW_NEU_LABEL}
)
evaluate_score_for_trait(
    gpt_agr, vectors, df, "cAGR", {"y": HIGH_AGR_LABEL, "n": LOW_AGR_LABEL}
)
evaluate_score_for_trait(
    gpt_con, vectors, df, "cCON", {"y": HIGH_CON_LABEL, "n": LOW_CON_LABEL}
)

Model score for cEXT 0.5005546032066149
Model score for cOPN 0.5387718059897146
Model score for cNEU 0.5565191086013915
Model score for cAGR 0.47584955127558737
Model score for cCON 0.5417969143894323


#### Bard's unssen dataset


In [10]:
import pandas as pd


def load_and_encode_bard_dataset(file_path, low_label, high_label):
    df = pd.read_excel(file_path, sheet_name=f"{high_label}-Bard")
    high_texts = df.iloc[:, 0].tolist()
    high_labels = [high_label for i in high_texts]

    df = pd.read_excel(file_path, sheet_name=f"{low_label}-Bard")
    low_texts = df.iloc[:, 0].tolist()
    low_labels = [low_label for i in low_texts]

    MODEL = "intfloat/e5-large-v2"
    model = SentenceTransformer(MODEL)
    embeddings = model.encode(low_texts + high_texts, convert_to_tensor=True)
    labels = low_labels + high_labels
    return embeddings, labels

In [11]:
bard_ext_embeddings, bard_ext_labels = load_and_encode_bard_dataset(
    file_path, LOW_EXT_LABEL, HIGH_EXT_LABEL
)
bard_opn_embeddings, bard_opn_labels = load_and_encode_bard_dataset(
    file_path, LOW_OPN_LABEL, HIGH_OPN_LABEL
)
bard_neu_embeddings, bard_neu_labels = load_and_encode_bard_dataset(
    file_path, LOW_NEU_LABEL, HIGH_NEU_LABEL
)
bard_agr_embeddings, bard_agr_labels = load_and_encode_bard_dataset(
    file_path, LOW_AGR_LABEL, HIGH_AGR_LABEL
)
bard_con_embeddings, bard_con_labels = load_and_encode_bard_dataset(
    file_path, LOW_CON_LABEL, HIGH_CON_LABEL
)

In [12]:
gpt_ext_on_bard_score = gpt_ext.score(bard_ext_embeddings, bard_ext_labels)
print("GPT_EXT score on Bard's unseen test set:", gpt_ext_on_bard_score)

gpt_opn_on_bard_score = gpt_opn.score(bard_opn_embeddings, bard_opn_labels)
print("GPT_OPN score on Bard's unseen test set:", gpt_opn_on_bard_score)

gpt_neu_on_bard_score = gpt_neu.score(bard_neu_embeddings, bard_neu_labels)
print("GPT_NEU score on Bard's unseen test set:", gpt_neu_on_bard_score)

gpt_agr_on_bard_score = gpt_agr.score(bard_agr_embeddings, bard_agr_labels)
print("GPT_AGR score on Bard's unseen test set:", gpt_agr_on_bard_score)

gpt_con_on_bard_score = gpt_con.score(bard_con_embeddings, bard_con_labels)
print("GPT_CON score on Bard's unseen test set:", gpt_con_on_bard_score)

GPT_EXT score on Bard's unseen test set: 0.963302752293578
GPT_OPN score on Bard's unseen test set: 0.8723404255319149
GPT_NEU score on Bard's unseen test set: 0.991044776119403
GPT_AGR score on Bard's unseen test set: 0.959731543624161
GPT_CON score on Bard's unseen test set: 0.9340277777777778


### GPT & MyPerosnality Model

Train a logistic regression model on GPT-Generated data combined with myPersonality data


In [13]:
def train_my_personality_with_gpt(trait_column, low_label, high_label):
    model_name = f"my_personality_gpt_{trait_column}"
    model_path = f"models/step-10/{model_name}.pkl"
    if os.path.exists(model_path):
        with open(model_path, "rb") as f:
            model = pickle.load(f)
    else:
        myPersonality_vecs, df = process_my_personality_dataset(
            file_path="./data/myPersonality.csv",
            vectors_path="analysis/vectors/my_personality.pkl",
        )

        value_map = {"y": high_label, "n": low_label}
        df[trait_column] = df[trait_column].map(value_map)
        my_personality_vectors_with_labels = list(
            zip(myPersonality_vecs.tolist(), df[trait_column])
        )

        vectors_with_labels = (
            unique_vectors_with_labels[high_label]
            + unique_vectors_with_labels[low_label]
            + my_personality_vectors_with_labels
        )

        train_vectors = [t[0] for t in vectors_with_labels]
        train_labels = [t[1] for t in vectors_with_labels]

        model = LogisticRegression(random_state=0).fit(train_vectors, train_labels)
        persist_model(model, model_name)
    return model

In [14]:
gpt_and_my_personality_ext_model = train_my_personality_with_gpt(
    "cEXT", LOW_EXT_LABEL, HIGH_EXT_LABEL
)

gpt_and_my_personality_opn_model = train_my_personality_with_gpt(
    "cOPN", LOW_OPN_LABEL, HIGH_OPN_LABEL
)

gpt_and_my_personality_neu_model = train_my_personality_with_gpt(
    "cNEU", LOW_NEU_LABEL, HIGH_NEU_LABEL
)

gpt_and_my_personality_agr_model = train_my_personality_with_gpt(
    "cAGR", LOW_AGR_LABEL, HIGH_AGR_LABEL
)

gpt_and_my_personality_con_model = train_my_personality_with_gpt(
    "cCON", LOW_CON_LABEL, HIGH_CON_LABEL
)

In [15]:
print(
    "GPT_EXT + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_ext_model.score(bard_ext_embeddings, bard_ext_labels),
)
print(
    "GPT_OPN + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_opn_model.score(bard_opn_embeddings, bard_opn_labels),
)

print(
    "GPT_NEU + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_neu_model.score(bard_neu_embeddings, bard_neu_labels),
)

print(
    "GPT_AGR + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_agr_model.score(bard_agr_embeddings, bard_agr_labels),
)

print(
    "GPT_CON + MyPersonality score on Bard's unseen test set:",
    gpt_and_my_personality_con_model.score(bard_con_embeddings, bard_con_labels),
)

GPT_EXT + MyPersonality score on Bard's unseen test set: 0.8486238532110092
GPT_OPN + MyPersonality score on Bard's unseen test set: 0.526595744680851
GPT_NEU + MyPersonality score on Bard's unseen test set: 0.7313432835820896
GPT_AGR + MyPersonality score on Bard's unseen test set: 0.9395973154362416
GPT_CON + MyPersonality score on Bard's unseen test set: 0.9548611111111112
