# Extroversion Analysis with LLM Generated Dataset


## Read the dataset


In [1]:
import pandas as pd

file_path = "../analysis/llm-dataset-generation/traits-definitions.xlsx"
high_ext_sheet_name = "High-EXT-GPT3.5"
low_ext_sheet_name = "Low-EXT-GPT3.5"

df = pd.read_excel(file_path, sheet_name=high_ext_sheet_name)
high_ext_texts_gpt = df.iloc[:, 0].tolist()

df = pd.read_excel(file_path, sheet_name=low_ext_sheet_name)
low_ext_texts_gpt = df.iloc[:, 0].tolist()

## Filter unique texts

Embed the dataset and apply cosine similarity to filter out "too similar" texts


In [2]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

SIMILARITY_THRESHOLD = 0.95
MODEL = "intfloat/e5-large-v2"


def get_unique_paragraphs(texts: list[str], label: str):
    model = SentenceTransformer(MODEL)
    embeddings = model.encode(texts, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(embeddings, embeddings)
    unique_paragraphs = []
    unique_embeddings = []
    for i in tqdm(range(len(texts))):
        is_dissimilar = all(
            similarity < SIMILARITY_THRESHOLD
            for j, similarity in enumerate(similarities[i])
            if j != i
        )
        if is_dissimilar:
            unique_paragraphs.append(texts[i])
            unique_embeddings.append((embeddings[i], label))

    print(f"{len(unique_paragraphs)}/{len(texts)} Unique Paragraphs.")
    if not unique_paragraphs:
        print("No unique paragraphs found.")
    return unique_embeddings, unique_paragraphs

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import openpyxl


def overwrite_sheet(file_path: str, sheet_name: str, texts: list[str]):
    # workbook = openpyxl.load_workbook(file_path)
    # if sheet_name in workbook.sheetnames:
    #     workbook.remove(workbook[sheet_name])
    # new_sheet = workbook.create_sheet(title=sheet_name)
    # for i, paragraph in enumerate(texts):
    #     new_sheet.cell(row=i + 1, column=1, value=paragraph)
    # workbook.save(file_path)
    pass

### High Extroversion

99/176 texts left


In [4]:
(
    unique_high_ext_vectors_with_labels,
    unique_high_ext_paragraphs_gpt,
) = get_unique_paragraphs(high_ext_texts_gpt, label="HIGH_EXT")

# overwrite_sheet(
#     file_path,
#     f"High-EXT-GPT3.5-filtered-{SIMILARITY_THRESHOLD}",
#     unique_high_ext_paragraphs_gpt,
# )

100%|██████████| 176/176 [00:00<00:00, 2349.52it/s]

167/176 Unique Paragraphs.





### Low Extroversion

102/359 texts left


In [5]:
(
    unique_low_ext_vectors_with_labels,
    unique_low_ext_paragraphs_gpt,
) = get_unique_paragraphs(low_ext_texts_gpt, label="LOW_EXT")

# overwrite_sheet(
#     file_path,
#     f"Low-EXT-GPT3.5-filtered-{SIMILARITY_THRESHOLD}",
#     unique_low_ext_paragraphs_gpt,
# )

100%|██████████| 359/359 [00:00<00:00, 1249.51it/s]


286/359 Unique Paragraphs.


## Dataset Statistics


### High Extroversion


In [6]:
df_unique_high_ext = pd.DataFrame(unique_high_ext_paragraphs_gpt, columns=["Paragraph"])
df_unique_high_ext["Token Count"] = df_unique_high_ext["Paragraph"].apply(
    lambda x: len(x.split())
)
df_unique_high_ext["Token Count"].describe()

count    167.000000
mean      80.922156
std       34.531284
min       38.000000
25%       52.000000
50%       77.000000
75%       98.000000
max      199.000000
Name: Token Count, dtype: float64

### Low Extroversion


In [7]:
df_unique_low_ext = pd.DataFrame(unique_low_ext_paragraphs_gpt, columns=["Paragraph"])
df_unique_low_ext["Token Count"] = df_unique_low_ext["Paragraph"].apply(
    lambda x: len(x.split())
)
df_unique_low_ext["Token Count"].describe()

count    286.000000
mean      69.300699
std       21.085801
min       34.000000
25%       53.000000
50%       65.000000
75%       82.000000
max      142.000000
Name: Token Count, dtype: float64

## Logistic Regression


### Train


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

vectors_with_labels = (
    unique_high_ext_vectors_with_labels + unique_low_ext_vectors_with_labels
)

train_data, test_data = train_test_split(vectors_with_labels, test_size=0.2)
train_vectors = [t[0] for t in train_data]
train_labels = [t[1] for t in train_data]
test_vectors = [t[0] for t in test_data]
test_labels = [t[1] for t in test_data]


gpt_only_model = LogisticRegression(random_state=0).fit(train_vectors, train_labels)
print(gpt_only_model.score(test_vectors, test_labels))

0.945054945054945


### Test


#### MyPersonality Concatenated


In [13]:
import pandas as pd

MODEL = "intfloat/e5-large-v2"
model = SentenceTransformer(MODEL)
myPersonality_df = pd.read_csv(
    "../data/myPersonality-concatenated.csv",
    usecols=["STATUS", "cEXT"],
    encoding="ISO-8859-1",
)
value_map = {"y": "HIGH_EXT", "n": "LOW_EXT"}
myPersonality_df["cEXT"] = myPersonality_df["cEXT"].map(value_map)
myPersonality_embeddings = model.encode(
    myPersonality_df["STATUS"], convert_to_tensor=True
)

print(
    "GPT-Only EXT Score:",
    gpt_only_model.score(myPersonality_embeddings, myPersonality_df["cEXT"]),
)

GPT-Only EXT Score: 0.512


#### MyPersonality


In [14]:
import pandas as pd

MODEL = "intfloat/e5-large-v2"
model = SentenceTransformer(MODEL)
myPersonality_df = pd.read_csv(
    "../data/myPersonality.csv",
    usecols=["STATUS", "cEXT"],
    encoding="ISO-8859-1",
)
value_map = {"y": "HIGH_EXT", "n": "LOW_EXT"}
myPersonality_df["cEXT"] = myPersonality_df["cEXT"].map(value_map)
myPersonality_embeddings = model.encode(
    myPersonality_df["STATUS"], convert_to_tensor=True
)

print(
    "GPT-Only EXT Score:",
    gpt_only_model.score(myPersonality_embeddings, myPersonality_df["cEXT"]),
)

GPT-Only EXT Score: 0.5085207219925381


#### Bard's unssen dataset


In [16]:
import pandas as pd

df = pd.read_excel(file_path, sheet_name="High-EXT-Bard-Testset")
high_ext_texts_bard = df.iloc[:, 0].tolist()
high_ext_labels = ["HIGH_EXT" for i in high_ext_texts_bard]

df = pd.read_excel(file_path, sheet_name="Low-EXT-Bard-Testset")
low_ext_texts_bard = df.iloc[:, 0].tolist()
low_ext_labels = ["LOW_EXT" for i in low_ext_texts_bard]

In [17]:
MODEL = "intfloat/e5-large-v2"
model = SentenceTransformer(MODEL)
bard_embeddings = model.encode(
    high_ext_texts_bard + low_ext_texts_bard, convert_to_tensor=True
)
gpt_only_model.score(bard_embeddings, high_ext_labels + low_ext_labels)

0.9541284403669725

### GPT & MyPerosnality Model


In [18]:
print(
    "Train a logistic regression model on GPT-Generated data combined with myPersonality data"
)
MODEL = "intfloat/e5-large-v2"
model = SentenceTransformer(MODEL)
myPersonality_df = pd.read_csv(
    "../data/myPersonality.csv", usecols=["STATUS", "cEXT"], encoding="ISO-8859-1"
)
value_map = {"y": "HIGH_EXT", "n": "LOW_EXT"}
myPersonality_df["cEXT"] = myPersonality_df["cEXT"].map(value_map)
# myPersonality_embeddings = model.encode(
#     myPersonality_df["STATUS"], convert_to_tensor=True
# )
my_personality_vectors_with_labels = list(
    zip(myPersonality_embeddings.tolist(), myPersonality_df["cEXT"])
)

vectors_with_labels = (
    unique_high_ext_vectors_with_labels
    + unique_low_ext_vectors_with_labels
    + my_personality_vectors_with_labels
)

train_vectors = [t[0] for t in vectors_with_labels]
train_labels = [t[1] for t in vectors_with_labels]

gpt_and_myPersonality_model = LogisticRegression(random_state=0).fit(
    train_vectors, train_labels
)

Train a logistic regression model on GPT-Generated data combined with myPersonality data


In [20]:
gpt_and_myPersonality_model.score(bard_embeddings, high_ext_labels + low_ext_labels)

0.8486238532110092

In [None]:
import pickle

with open("../models/gpt_and_myPersonality_ext_95.pkl", "wb") as file:
    pickle.dump(gpt_and_myPersonality_model, file)

In [26]:
high_bard_embeddings = model.encode(high_ext_texts_bard, convert_to_tensor=True)
low_bard_embeddings = model.encode(low_ext_texts_bard, convert_to_tensor=True)

In [73]:
high_ext_predictions = gpt_only_model.predict(high_bard_embeddings)
low_ext_predictions = gpt_only_model.predict(low_bard_embeddings)

In [74]:
from sklearn.metrics import precision_score, recall_score

# HIGHs
true_labels = ["HIGH_EXT"] * len(high_ext_texts_bard)
precision = precision_score(true_labels, high_ext_predictions, average='weighted')
recall = recall_score(true_labels, high_ext_predictions, average='weighted')
correct_predictions = sum([1 for pred in high_ext_predictions if pred == "HIGH_EXT"])

precision, recall, f"{correct_predictions}/{len(high_ext_texts_bard)}"

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(1.0, 0.908256880733945, '99/109')

In [66]:
# LOWs
true_labels = ["LOW_EXT"] * len(low_ext_predictions)
precision = precision_score(true_labels, low_ext_predictions, average='weighted')
recall = recall_score(true_labels, low_ext_predictions, average='weighted')
correct_predictions = sum([1 for pred in low_ext_predictions if pred == "LOW_EXT"])

precision, recall, f"{correct_predictions}/{len(low_ext_predictions)}"

(1.0, 1.0, '109/109')