# Load the dataset with vectors (E5-Large)


## 1. Load & Process


### 1.1 Load as DF and filter out by tokens threshold (40)


In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm

df = pd.DataFrame(columns=["community", "user_name", "user_texts", "#_of_long_texts"])


def process_files(community, df):
    threshold = 40
    directory = f"../data/detailed_filtered_data/texts/{community}"
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith(".json"):
            user_texts = []
            with open(os.path.join(directory, filename), "r") as f:
                data = json.load(f)
                for key in data:
                    for submission in data[key].get("submissions", []):
                        title = submission.get("title", "")
                        body = submission.get("body", "")
                        if len(title.split()) >= threshold:
                            user_texts.append(title)
                        if len(body.split()) >= threshold:
                            user_texts.append(body)
                    for submission in data[key].get("comments", []):
                        body = submission.get("body", "")
                        if len(body.split()) >= threshold:
                            user_texts.append(body)
            if len(user_texts) == 0:
                continue
            user_name = filename.split(".json")[0]
            new_row = pd.DataFrame(
                {
                    "community": community,
                    "user_name": [user_name],
                    "user_texts": [user_texts],
                    "#_of_long_texts": [len(user_texts)],
                }
            )
            df = pd.concat([df, new_row], ignore_index=True)
    return df


communities = [
    "classicalmusic",
    "electronicmusic",
    "hiphopheads",
    "indieheads",
    "Metal",
]

for vector_file_name in communities:
    df = pd.concat([df, process_files(vector_file_name, df)])
print("Done processing texts :)")

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:01<00:00, 523.97it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:01<00:00, 823.92it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:01<00:00, 627.96it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:01<00:00, 834.89it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:01<00:00, 732.79it/s]

Done processing texts :)





### 1.2 De-Duplication


In [2]:
def deduplicate_texts(df):
    exploded_df = df.explode("user_texts")
    exploded_df = exploded_df.drop_duplicates(subset="user_texts")
    return exploded_df


df = deduplicate_texts(df)
print("After dedeuplication there are", format(len(df), ","), "unique texts in total")

After dedeuplication there are 575,816 unique texts in total


### 1.3 Split DF by Genre


In [3]:
classicalmusic_df = df[df["community"] == "classicalmusic"]
electronicmusic_df = df[df["community"] == "electronicmusic"]
hiphopheads_df = df[df["community"] == "hiphopheads"]
indieheads_df = df[df["community"] == "indieheads"]
metal_df = df[df["community"] == "Metal"]

communities_dfs = [
    classicalmusic_df,
    electronicmusic_df,
    hiphopheads_df,
    indieheads_df,
    metal_df,
]

print(
    "Number of texts left after deduplication per community:",
    "\nClassicalmusic:",
    format(len(classicalmusic_df), ","),
    "\nElectronicmusic:",
    format(len(electronicmusic_df), ","),
    "\nHiphopheads:",
    format(len(hiphopheads_df), ","),
    "\nIndieheads:",
    format(len(indieheads_df), ","),
    "\nMetal:",
    format(len(metal_df), ","),
)

Number of texts left after deduplication per community: 
Classicalmusic: 170,251 
Electronicmusic: 97,063 
Hiphopheads: 121,538 
Indieheads: 84,314 
Metal: 102,650


### 1.4 Text-Count per user


In [4]:
classicalmusic_counts = classicalmusic_df["user_name"].value_counts()
electronicmusic_counts = electronicmusic_df["user_name"].value_counts()
hiphopheads_counts = hiphopheads_df["user_name"].value_counts()
indieheads_counts = indieheads_df["user_name"].value_counts()
metal_counts = metal_df["user_name"].value_counts()


print("Median and Mean of texts-count per user in each community after deduplication:")
print(
    "classicalmusic: Median =",
    classicalmusic_counts.median(),
    " | Mean =",
    round(classicalmusic_counts.mean(), 2),
)
print(
    "electronicmusic: Median =",
    electronicmusic_counts.median(),
    " | Mean =",
    round(electronicmusic_counts.mean(), 2),
)
print(
    "hiphopheads: Median =",
    hiphopheads_counts.median(),
    " | Mean =",
    round(hiphopheads_counts.mean(), 2),
)
print(
    "indieheads: Median =",
    indieheads_counts.median(),
    " | Mean =",
    round(indieheads_counts.mean(), 2),
)
print(
    "metal: Median =", metal_counts.median(), " | Mean =", round(metal_counts.mean(), 2)
)

Median and Mean of texts-count per user in each community after deduplication:
classicalmusic: Median = 118.5  | Mean = 173.37
electronicmusic: Median = 60.5  | Mean = 109.55
hiphopheads: Median = 83.0  | Mean = 122.39
indieheads: Median = 52.0  | Mean = 96.8
metal: Median = 58.0  | Mean = 107.94


## 2. Vectorize all Redditors texts


### 2.1 Vectorize


#### 2.1.1 Vectorization method


In [5]:
from sentence_transformers import SentenceTransformer
from typing import List, Union
from torch import Tensor
from numpy import ndarray


def vectorize_text(
    text, model: SentenceTransformer, batch_size=12, show_progress_bar=False
) -> Union[List[Tensor], ndarray, Tensor]:
    return model.encode(
        text, show_progress_bar=show_progress_bar, batch_size=batch_size
    )

#### 2.1.2 Vectorize & persist or reload (method only)


In [6]:
import h5py
import numpy as np

MODEL = "intfloat/e5-large-v2"

model = SentenceTransformer(MODEL)


def vectorize_or_load_community_data(community_name: str, df: pd.DataFrame):
    file_path = f"../data/detailed_filtered_data/vectors/{community_name}.h5"
    if os.path.exists(file_path):
        print(f"Loading vectors for {community_name} from file")
        with h5py.File(file_path, "r") as f:
            dataset = f["vectors"]
            df["vector"] = dataset[:].tolist()
    else:
        print(f"Vectorizing texts for {community_name}...")
        compression = "lzf"
        estimated_rows = len(df)

        with h5py.File(file_path, "a") as f:
            if "vectors" not in f:
                dataset = f.create_dataset(
                    "vectors",
                    (estimated_rows, model.get_sentence_embedding_dimension()),
                    dtype=np.float32,
                    compression=compression,
                )
                print("Dataset shape:", dataset.shape)
            else:
                dataset = f["vectors"]

            i = 0
            progress_bar = tqdm(total=estimated_rows, desc="Vectorizing texts")
            for _, row in df.iterrows():
                vector = vectorize_text(row["user_texts"], model)
                dataset[i] = vector
                i += 1
                progress_bar.update(1)
            progress_bar.close()

#### 2.1.3 Load vectors (or vectorize)


In [7]:
community_names = ["classical", "electronic", "hiphopheads", "indieheads", "metal"]
for community_name, df in zip(community_names, communities_dfs):
    vectorize_or_load_community_data(community_name, df)
df = pd.concat(communities_dfs, ignore_index=True)

Loading vectors for classical from file
Loading vectors for electronic from file
Loading vectors for hiphopheads from file
Loading vectors for indieheads from file
Loading vectors for metal from file


### 2.4 Show Sample


In [8]:
df.head()

Unnamed: 0,community,user_name,user_texts,#_of_long_texts,vector
0,classicalmusic,Radaxen,I remember doing this with WC3 Dota 1 Lifestea...,339,"[-0.0013457498280331492, -0.06375082582235336,..."
1,classicalmusic,Radaxen,Why not? Though there's no AD in League. I ki...,339,"[0.018474051728844643, -0.01666010357439518, 0..."
2,classicalmusic,Radaxen,"Haha I was kind of joking. That aside, why mos...",339,"[-0.008903076872229576, -0.05069487541913986, ..."
3,classicalmusic,Radaxen,"Mana is often a less limiting factor, unless y...",339,"[0.04738666117191315, -0.053208597004413605, 0..."
4,classicalmusic,Radaxen,"It's first pick worthy, but late in first phas...",339,"[0.007831799797713757, -0.0637780949473381, 0...."


## 3. Predict personality traits presence


### 3.1 Load the traits models


In [9]:
import pickle
from sklearn.linear_model import LogisticRegression

traits_and_models = {"opn": None, "ext": None, "neu": None, "agr": None, "con": None}

for trait in traits_and_models.keys():
    with open(f"../models/step-10/gpt_{trait}.pkl", "rb") as f:
        traits_and_models[trait] = pickle.load(f)

### 3.2 Predict-Probablily per trait


In [10]:
def predict_trait_probab(trait: str, df: pd.DataFrame, model: LogisticRegression):
    df[f"{trait}_proba"] = df["vector"].apply(
        lambda x: model.predict_proba(np.array(x).reshape(1, -1))
    )
    return df

In [11]:
for trait, model in traits_and_models.items():
    df = predict_trait_probab(trait, df, model)

# That's it!

The dataset is loaded, along with the texts' vectors and the presence probability prediction per personality trait


In [16]:
# Print the first row of each community as example
df.groupby("community").head(1)

Unnamed: 0,community,user_name,user_texts,#_of_long_texts,vector,opn_proba,ext_proba,neu_proba,agr_proba,con_proba
0,classicalmusic,Radaxen,I remember doing this with WC3 Dota 1 Lifestea...,339,"[-0.0013457498280331492, -0.06375082582235336,...","[[0.4633694992256616, 0.5366305007743384]]","[[0.4047406305005551, 0.5952593694994449]]","[[0.5133525548832986, 0.48664744511670144]]","[[0.2907748902422237, 0.7092251097577763]]","[[0.3720289946657592, 0.6279710053342408]]"
170251,electronicmusic,headphase,"This Monday, June 12, r/BedStuy will become un...",305,"[0.011666242964565754, -0.035366132855415344, ...","[[0.4260546229680149, 0.5739453770319851]]","[[0.6087019486538947, 0.39129805134610535]]","[[0.5780140586341542, 0.4219859413658457]]","[[0.24227405375973832, 0.7577259462402617]]","[[0.23508181739111556, 0.7649181826088844]]"
267314,hiphopheads,thanks_bruh,Whats good yâ€™all. Iâ€™m looking to make friends ...,50,"[0.028840450569987297, -0.06617984920740128, 0...","[[0.5274351693681876, 0.4725648306318124]]","[[0.5410485410530944, 0.4589514589469056]]","[[0.36379346487378283, 0.6362065351262172]]","[[0.37324202653229677, 0.6267579734677032]]","[[0.31871618922553135, 0.6812838107744686]]"
388852,indieheads,simco1974,"2008 or so, I was digging around at a local re...",1,"[0.024371720850467682, -0.05200977995991707, 0...","[[0.5114953603142274, 0.4885046396857727]]","[[0.4437402015227965, 0.5562597984772035]]","[[0.42615630341430943, 0.5738436965856906]]","[[0.37897041332216275, 0.6210295866778373]]","[[0.33153541480610726, 0.6684645851938927]]"
473166,Metal,sam1oq,I found out that apparently I was still paying...,309,"[0.024809353053569794, -0.036989957094192505, ...","[[0.38509045003141096, 0.614909549968589]]","[[0.5761465262055332, 0.42385347379446686]]","[[0.6205980989623865, 0.3794019010376135]]","[[0.21988913688937062, 0.7801108631106294]]","[[0.20360275726503696, 0.796397242734963]]"


## How to vectorize (new) texts & predict 
presence of each personality traits in them? <br>
Just use the following code ðŸ‘‡

In [15]:
MODEL = "intfloat/e5-large-v2"


def embed_and_perdict_personality_trait_presence(texts: list[str]):
    """Text could be a single text or a list of texts."""
    model = SentenceTransformer(MODEL)
    embeddings = model.encode(texts, convert_to_tensor=True)
    result = {}
    for trait, model in traits_and_models.items():
        proba = model.predict_proba(embeddings)
        result.update({trait: proba})
    return result


texts = ["This is an example text 1"]
print(embed_and_perdict_personality_trait_presence(texts))

{'opn': array([[0.51758315, 0.48241685]]), 'ext': array([[0.40808484, 0.59191516]]), 'neu': array([[0.41384857, 0.58615143]]), 'agr': array([[0.38006169, 0.61993831]]), 'con': array([[0.45266537, 0.54733463]])}


### Final experiment:
Predict music genre from personality probability vector <br>
The baseline is 0.2 (because we have 5 classes). So >0.2 beats the baseline.

In [None]:
df['personality_vector'] = df.apply(lambda row: [float(row['proba_for_high_opn']), float(row['proba_for_high_ext']), float(row['proba_for_high_neu']), float(row['proba_for_high_agr']), float(row['proba_for_high_con'])], axis=1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=433)
model = LogisticRegression()
model.fit(train_df['personality_vector'].tolist(), train_df['community'])
community_prediction = model.predict(test_df['personality_vector'].tolist())
accuracy = accuracy_score(test_df['community'], community_prediction)
print(f"Accuracy: {accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

print("Classification Report:")
print(classification_report(test_df['community'], community_prediction, target_names=train_df['community'].unique()))
# Print Classification Report
classification_report = classification_report(test_df['community'], community_prediction, target_names=train_df['community'].unique(), output_dict=True)
classification_report_df = pd.DataFrame(classification_report).transpose()
classification_report_df = classification_report_df.drop(columns=['support'])

# Set a larger figure size
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Plot with a different colormap, e.g., 'viridis'
classification_report_df.plot(kind='bar', cmap='viridis')

plt.xlabel('Class')
plt.ylabel('Score')
plt.title('Classification Report')
plt.legend(loc='lower right')
plt.show()

conf_matrix = confusion_matrix(test_df['community'], community_prediction)
print("Confusion Matrix:")

class_labels = ['Metal', 'Classical', 'Electronic', 'Indie', 'HipHop']
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from tqdm import tqdm
vals = []

X = np.array(df['personality_vector'].tolist())
y = np.array(df['community'])

kfold = KFold(n_splits=10, shuffle=True, random_state=307)
model = LogisticRegression()

accuracies = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

for i, accuracy in enumerate(accuracies, start=1):
    print(f"Fold {i}: Accuracy = {accuracy}")

print(f"Average Accuracy: {np.mean(accuracies)}")

In [None]:
print("Classification Report:")
print(classification_report(test_df['community'], community_prediction, target_names=train_df['community'].unique()))
# Print Classification Report
classification_report = classification_report(test_df['community'], community_prediction, target_names=train_df['community'].unique(), output_dict=True)
classification_report_df = pd.DataFrame(classification_report).transpose()
classification_report_df = classification_report_df.drop(columns=['support'])

# Set a larger figure size
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Plot with a different colormap, e.g., 'viridis'
classification_report_df.plot(kind='bar', cmap='viridis')

plt.xlabel('Class')
plt.ylabel('Score')
plt.title('Classification Report')
plt.legend(loc='lower right')
plt.show()

conf_matrix = confusion_matrix(test_df['community'], community_prediction)
print("Confusion Matrix:")

class_labels = ['Metal', 'Classical', 'Electronic', 'Indie', 'HipHop']
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()