In [None]:
import os, json, time, pickle
import pandas as pd, numpy as np 
import matplotlib.pyplot as plt

# Every relative path used further down is resolved against this working directory.
os.chdir('/home/jovyan/work/')

# Input / output directories below 'personas' for the Qwen run ...
in_qwen, out_qwen = (
    os.path.join('personas', 'qwendata_llm_1_2'),
    os.path.join('personas', 'qwen_study_1_2'),
)

# ... and for the Mistral run.
in_mistral, out_mistral = (
    os.path.join('personas', 'mistraldata_llm_1_2'),
    os.path.join('personas', 'mistral_study_1_2'),
)

#### Persona Embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util
import accelerate, torch, pickle

use off-the-shelf model

In [None]:
# Pretrained sentence-transformers checkpoint used below to embed the persona texts.
persona_model = SentenceTransformer('all-MiniLM-L12-v2')

load personas

In [None]:
# Load the cleaned persona table; its 'persona' column holds the text that is embedded below.
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_cleaned.pkl')
pd_personas.head(3)

embed personas

In [None]:
# Encode all persona texts in one call and time it; convert_to_tensor=True asks for a
# single torch tensor rather than a list/array of vectors.
%time persona_embeddings = persona_model.encode(list(pd_personas['persona']), convert_to_tensor=True)

In [None]:
# Sanity check: the first dimension should equal the number of personas (see next cell).
persona_embeddings.shape

In [None]:
# Number of personas, for comparison with the embedding tensor's first dimension.
len(pd_personas)

In [None]:
# Move the tensor to host memory before converting it to lists and pickling it below.
persona_embeddings = persona_embeddings.cpu()

In [None]:
# One embedding (as a plain Python list) per row; row order matches the encode() input order.
pd_personas['embed'] = persona_embeddings.tolist()

In [None]:
# Persist the persona table including the new 'embed' column; the clustering section reloads it.
pd_personas.to_pickle('personas/data_ext/pd_personas_embedded.pkl')

In [None]:
# Additionally store the (CPU) embedding tensor on its own as a pickle.
with open('personas/data_ext/persona_embeddings.pkl', 'wb') as f:
    pickle.dump(persona_embeddings,f)

#### Cluster Persona Embeddings

In [None]:
# Raise the per-cell character limit so long persona texts are not truncated when displayed.
pd.set_option('max_colwidth', 800)

def show_cluster(pd_personas, id_):
    """Return the 'personaId' and 'persona' columns of all rows whose 'clusterId' equals `id_`.

    pd_personas: DataFrame with at least the columns 'clusterId', 'personaId', 'persona'.
    id_: cluster id to select.
    """
    in_cluster = pd_personas['clusterId'].eq(id_)
    return pd_personas.loc[in_cluster, ['personaId', 'persona']]

def get_cluster_solutions(embedding, t, min_community_size=25):
    """Run community detection once and summarise the result.

    Used to explore how many clusters (and how many clustered personas) a given
    similarity threshold produces.

    embedding: embeddings passed straight to `util.community_detection`.
    t: similarity threshold passed as `threshold`.
    min_community_size: smallest community that is kept (default 25, the value
        used for the final clustering).

    Returns a tuple `(number_of_clusters, number_of_items_in_any_cluster)`.
    """
    clusters = util.community_detection(embedding, min_community_size=min_community_size, threshold=t)

    # Count directly instead of building a DataFrame: summing a whole frame that
    # contains a column of lists concatenates every list just to read one number.
    n_assigned = sum(len(c) for c in clusters)

    return len(clusters), n_assigned

load persona embeddings onto gpu

In [None]:
# Reload the persona table with its 'embed' column so this section can start from disk.
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_embedded.pkl')

In [None]:
# Stack the per-persona embedding lists into one float32 tensor. Use the GPU when one is
# available and fall back to the CPU otherwise, so the notebook still runs without CUDA.
# `.tolist()` hands torch plain nested lists instead of a pandas Series of objects.
_embed_device = 'cuda' if torch.cuda.is_available() else 'cpu'
persona_embeddings = torch.tensor(pd_personas['embed'].tolist(), dtype=torch.float32).to(_embed_device)

In [None]:
# Sanity check of the rebuilt tensor: one row per persona is expected.
persona_embeddings.shape

check number of clusters and number of assigned personas for different thresholds

In [None]:
# Sweep the similarity threshold; each entry is [threshold, n_clusters, n_clustered_personas].
cluster_solutions = []

for t in (0.5, 0.55, 0.6, 0.65, 0.7, 0.75):
    n, s = get_cluster_solutions(persona_embeddings, t)
    cluster_solutions += [[t, n, s]]

In [None]:
# Tabulate the sweep: one row per threshold with the cluster count and the number of clustered personas.
pd_cluster_solutions = pd.DataFrame(cluster_solutions, columns=['thresh', 'n_clusters', 'n_personas'])
pd_cluster_solutions

fix threshold at 0.6 and produce final clusters

In [None]:
# Final clustering at the chosen threshold (same min_community_size=25 as in get_cluster_solutions).
# Each cluster is a list of row positions into `persona_embeddings`.
clusters = util.community_detection(persona_embeddings, min_community_size=25, threshold=0.6)

In [None]:
# Lookup tables between the 'personaIndex' and 'personaId' columns, in both directions.
index2id = dict(zip(pd_personas['personaIndex'], pd_personas['personaId']))
id2index = dict(zip(pd_personas['personaId'], pd_personas['personaIndex']))

In [None]:
# One row per detected cluster: running id, size and member list.
pd_clusters = pd.DataFrame(list(range(len(clusters))), columns=['clusterId'])
pd_clusters['clusterSize'] = list(map(len, clusters))
pd_clusters['personaIndex'] = clusters # clusters with personaIndex

# Translate every member from personaIndex to personaId.
# NOTE(review): assumes 'personaIndex' equals the row position used for the embeddings - confirm.
true_clusters = [[index2id[member] for member in members] for members in clusters] # clusters with personaId
pd_clusters['personaIds'] = true_clusters

In [None]:
# Number of clusters in the final solution.
len(pd_clusters)

In [None]:
# Total number of personas assigned to some cluster. Sum only the size column:
# summing the whole frame would also concatenate the list-valued member columns.
pd_clusters['clusterSize'].sum()

In [None]:
# Persist the cluster table; the term-extraction section reloads it from this file.
pd_clusters.to_pickle('personas/data_ext/pd_clusters.pkl')

merge cluster info to personas

In [None]:
# Flatten the cluster table into one [personaId, clusterId] pair per clustered persona.
personas_in_clusters = [
    [pid, cluster_id]
    for cluster_id, member_ids in zip(pd_clusters['clusterId'], pd_clusters['personaIds'])
    for pid in member_ids
]

pd_personas_in_clusters = (
    pd.DataFrame(personas_in_clusters, columns=['personaId','clusterId'])
    .sort_values('personaId')
    .reset_index(drop=True)
)

# Left merge keeps every persona; personas outside any cluster end up with NaN in 'clusterId'.
pd_personas = pd_personas.merge(pd_personas_in_clusters, on='personaId', how='left')

set unassigned personas to cluster -1

In [None]:
# Personas without a cluster carry NaN after the left merge: label them -1 and restore
# the integer dtype. The rows are addressed by index label and the column by name, so
# this does not depend on 'clusterId' being the last column or on a 0..n-1 index.
ix_none = list(pd_personas[pd_personas['clusterId'].isnull()].index)
pd_personas.loc[ix_none, 'clusterId'] = -1
pd_personas = pd_personas.astype({'clusterId': int})

In [None]:
# Number of personas that belong to a real cluster (clusterId > -1).
len(pd_personas[pd_personas['clusterId']>-1])

In [None]:
# Persist the persona table including the 'clusterId' column (-1 = not in any cluster).
pd_personas.to_pickle('personas/data_ext/pd_personas_clustered.pkl')

inspect clusters

In [None]:
# Show the first few personas of cluster 209.
show_cluster(pd_personas, 209).head()

#### Extract ctfidf-terms for Persona Clusters

In [None]:
# Presumably `ctfidf` is a module living inside the 'personas' directory, hence the
# temporary directory switch for the import. Restore the previous working directory
# even if an import fails, so later relative paths keep resolving.
_prev_cwd = os.getcwd()
os.chdir('personas')
try:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from ctfidf import CTFIDFVectorizer
finally:
    os.chdir(_prev_cwd)

aggregate persona descriptions based on cluster id 

In [None]:
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_clustered.pkl')

# One "document" per cluster: all persona texts of the cluster joined by spaces.
pd_personas_in_cluster = pd_personas[['personaId','persona','clusterId']]
pd_personas_per_cluster = pd_personas_in_cluster.groupby(['clusterId'], as_index=False).agg({'persona': ' '.join})
# Drop the "unassigned" pseudo-cluster by its id (-1) rather than by row position, so a
# real cluster is never removed when no persona is unassigned.
pd_personas_per_cluster = pd_personas_per_cluster[pd_personas_per_cluster['clusterId'] != -1]

create tfidf-matrix on document level

In [None]:
# Fit TF-IDF on the per-cluster documents: one row per cluster, one column per vocabulary term.
# NOTE(review): this uses the plain TfidfVectorizer; CTFIDFVectorizer is imported above but
# not used in this section - confirm that is intended.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(pd_personas_per_cluster['persona'])

# Dense copy of the matrix; find_ctfidf_terms argsorts individual rows of it.
tfidf_dense = tfidf.todense()

In [None]:
# (number of cluster documents, vocabulary size)
tfidf.shape

In [None]:
# Persist the dense TF-IDF matrix of the persona clusters.
with open('personas/data_ext/pclusters_tfidf_dense.pkl', 'wb') as f:
    pickle.dump(tfidf_dense, f)

utility functions to find ctfidf-terms per cluster

In [None]:
# Vocabulary lookups in both directions: term -> column index and column index -> term.
word2index = dict(tfidf_vectorizer.vocabulary_)
index2word = {column: term for term, column in word2index.items()}

def find_ctfidf_terms(clusterId, ctfidf_mx, n, index_to_word=None):
    """Return the `n` highest-scoring terms of one row, best first.

    clusterId: row of `ctfidf_mx` to inspect (used as a positional row index, so
        row i is assumed to belong to cluster i).
    ctfidf_mx: dense 2-D score matrix; both `numpy.ndarray` and `numpy.matrix` work.
    n: number of terms to return; `n <= 0` yields an empty list.
    index_to_word: optional mapping column index -> term. Defaults to the
        notebook-level `index2word`.
    """
    if index_to_word is None:
        index_to_word = index2word
    if n <= 0:
        # a slice of [-0:] would otherwise return the entire vocabulary
        return []

    # Flatten to 1-D so ndarray rows and (1, V) matrix rows are handled the same way.
    row_scores = np.asarray(ctfidf_mx[clusterId]).ravel()
    top_ixs = row_scores.argsort()[-n:][::-1].tolist()
    return [index_to_word[ix] for ix in top_ixs]

In [None]:
# Reload the cluster table saved by the clustering section.
pd_clusters = pd.read_pickle('personas/data_ext/pd_clusters.pkl')

augment clusters with ctfidf terms

In [None]:
from sklearn.feature_extraction import text
# scikit-learn's built-in English stop-word collection, materialised as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)

def remove_stopwords(terms, stopwords, n_keep):
    """Filter stop words out of `terms` and keep the first `n_keep` survivors.

    terms: ordered iterable of candidate terms (order is preserved).
    stopwords: collection of terms to discard.
    n_keep: maximum number of terms to return.

    Returns a list with at most `n_keep` terms.
    """
    blocked = set(stopwords)  # constant-time membership tests
    kept = [term for term in terms if term not in blocked]
    # honour the caller's limit instead of a fixed cut-off of 10
    return kept[:n_keep]

In [None]:
# Per cluster: take the 25 top-scoring terms, then keep the first 10 that are not stop words.
candidate_terms = []
definite_terms = []
for clusterId in pd_clusters['clusterId']:
    top_terms = find_ctfidf_terms(clusterId, tfidf_dense, 25)
    candidate_terms.append(top_terms)
    definite_terms.append(remove_stopwords(top_terms, stop_words, 10))

In [None]:
# Store the filtered term list next to each cluster (list order follows pd_clusters['clusterId']).
pd_clusters['cluster_terms'] = definite_terms

augment clusters with 10 random personas

In [None]:
# Attach 10 sampled persona texts per cluster; random_state=1 makes the sample repeatable.
# sample(10) needs at least 10 rows per cluster (clusters were built with min_community_size=25).
pd_clusters['cluster_personas'] = [list(show_cluster(pd_personas, clusterId).sample(10, random_state=1)['persona']) for clusterId in pd_clusters['clusterId']]

In [None]:
# Preview one augmented cluster row (terms and sampled personas).
pd_clusters.head(1)

save

In [None]:
# Write the augmented cluster table twice: as a spreadsheet and as a pickle.
pd_clusters.to_excel('personas/data_ext/pd_clusters_augmented.xlsx')
pd_clusters.to_pickle('personas/data_ext/pd_clusters_augmented.pkl')