In [None]:
import os, json, time, pickle
import pandas as pd, numpy as np 
import matplotlib.pyplot as plt

# NOTE(review): hard-coded absolute working directory. Every relative path in
# this notebook resolves against it, so the notebook only runs on a machine
# where this directory exists.
os.chdir('/home/jovyan/work/')

# Not referenced in the cells of this part of the notebook.
modelname = 'mistral_'
# Input directory; its 'runs' sub-folder holds the per-run pickles read below.
indatadir = os.path.join('personas', 'mistraldata_llm_3_4')
# Output directory for the intermediate pickles written/read by this notebook.
outdatadir = os.path.join('personas', 'mistral_study_3')
# Not referenced in the cells of this part of the notebook.
resultsdir = os.path.join('personas', 'mistralresults')

### Load 200,000 persona annotations and create the label embedding space

In [None]:
# Persona table; later cells rely on its 'personaId' and 'clusterId' columns.
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_clustered.pkl')

In [None]:
# Preview the first two persona rows.
pd_personas.head(2)

In [None]:
# Build one (personaId, labels) frame per run file and concatenate them once at
# the end. Growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
runs_dir = os.path.join(indatadir, 'runs')
label_frames = []

# os.listdir returns entries in arbitrary order; sorting makes the read order
# (and therefore the row order before the later sort) reproducible.
for file_ in sorted(os.listdir(runs_dir)):
    pd_in = pd.read_pickle(os.path.join(runs_dir, file_))

    # Column-wise sum over everything from the 4th column onwards, computed once
    # and reused for both the ids and the codes.
    # NOTE(review): each summed value is iterated element by element below, so
    # the cells are presumably strings or lists that concatenate under sum() —
    # confirm against the run files.
    persona_codes = pd_in.iloc[:, 3:].sum()

    # Column names are expected to look like '<prefix>_<personaId>'.
    pd_out = pd.DataFrame(
        [int(p.split('_')[1]) for p in persona_codes.index],
        columns=['personaId'],
    )
    pd_out['labels'] = [[int(i) for i in code_] for code_ in persona_codes.values]
    label_frames.append(pd_out)

# pd.concat raises on an empty list; fall back to an empty frame so an empty
# runs directory still yields an empty DataFrame.
pd_personas_labels = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
# Order the collected labels by personaId and renumber the rows 0..n-1
# (the per-file indices are discarded), then preview two rows.
pd_personas_labels = pd_personas_labels.sort_values('personaId').reset_index(drop=True)
pd_personas_labels.head(2)

In [None]:
# Left join: every persona row is kept; personas without an entry in
# pd_personas_labels get NaN in 'labels'.
# NOTE(review): this rebinds pd_personas, so the cell is not idempotent —
# running it a second time merges 'labels' again and pandas suffixes the
# duplicate columns (labels_x / labels_y).
pd_personas = pd_personas.merge(pd_personas_labels, on='personaId', how='left')
pd_personas.head()

In [None]:
# Saving is disabled. A later cell reads 'pd_personas_labels.pkl' from
# outdatadir, so this line must be run once whenever that file is missing.
# pd_personas.to_pickle(os.path.join(outdatadir,'pd_personas_labels.pkl'))

#### Augment clusters with average cluster labels

In [None]:
# Cluster table (augmented variant); its 'clusterId' column is used below to
# look up the personas belonging to each cluster.
pd_clusters = pd.read_pickle('personas/data_ext/pd_clusters_augmented.pkl')
pd_clusters.head(2)

In [None]:
def get_labels(pd_personas, clusterId):
    pd_ = pd_personas[pd_personas['clusterId']==clusterId]
    label_mx = []
    for l in pd_['labels'].values:
        label_mx.append(l)
    label_mx = np.matrix(label_mx)
    return np.round(label_mx.mean(axis=0),2)

In [None]:
# One averaged label vector per cluster row.
# NOTE(review): get_labels returns an np.matrix, and indexing a matrix with [0]
# still yields a 1 x n matrix (matrices stay 2-D), not a flat 1-D vector.
pd_clusters['cluster_labels'] = [get_labels(pd_personas, clusterId)[0] for clusterId in pd_clusters['clusterId']]

In [None]:
# Preview the cluster table including the new 'cluster_labels' column.
pd_clusters.head(2)

In [None]:
# Saving is disabled; uncomment to write the label-augmented cluster table
# to outdatadir.
# pd_clusters.to_pickle(os.path.join(outdatadir,'pd_clusters_labels.pkl'))

### Analysis

#### Persona Clusters --- Intra- and Inter-Cluster Distances

In [None]:
import torch

In [None]:
# Reload the persona table with labels from outdatadir. This rebinds
# pd_personas and requires 'pd_personas_labels.pkl' to exist already (the
# to_pickle call that writes it is commented out earlier in this notebook).
pd_personas = pd.read_pickle(os.path.join(outdatadir,'pd_personas_labels.pkl'))
pd_personas.head(2)

load label and persona embeddings

In [None]:
# Convert the 'embed' and 'labels' columns into float32 CPU tensors.
# NOTE(review): assumes every cell is an equal-length numeric sequence so the
# column converts to a 2-D tensor — TODO confirm against the pickle.
persona_embeddings = torch.tensor(pd_personas['embed']).float().cpu()
label_embeddings = torch.tensor(pd_personas['labels']).float().cpu()

load persona clusters

In [None]:
# Rebinds pd_clusters to the non-augmented cluster table ('pd_clusters.pkl'),
# replacing the augmented frame loaded earlier in this notebook.
pd_clusters = pd.read_pickle('personas/data_ext/pd_clusters.pkl')
pd_clusters.head(2)

Helper functions to calculate intra- and inter-cluster distances

##### Calculate

In [None]:
from sklearn.metrics.pairwise import cosine_distances

def distance_between_clusters(vectors_i, vectors_j):
    """Return the mean squared cosine distance between two sets of vectors.

    All cross pairs (one row of ``vectors_i`` against one row of
    ``vectors_j``) are compared with ``cosine_distances``; each distance is
    squared and the squares are averaged into a single scalar.
    """
    pairwise = cosine_distances(vectors_i, vectors_j)
    return np.mean(pairwise ** 2)

def create_distance_matrix(dict_clusters):
    """Build the upper-triangular matrix of cluster-to-cluster distances.

    Parameters
    ----------
    dict_clusters : dict
        Maps a cluster key to that cluster's vectors. Keys must be mutually
        comparable with ``>``.

    Returns
    -------
    list of list
        One row per cluster in dict iteration order. The entry for keys
        (k1, k2) is ``distance_between_clusters`` of the two clusters when
        ``k1 <= k2`` and ``None`` when ``k1 > k2``.
    """
    def _row(key_a, vectors_a):
        # Entries where the row key exceeds the column key are left as None.
        return [
            None if key_a > key_b else distance_between_clusters(vectors_a, vectors_b)
            for key_b, vectors_b in dict_clusters.items()
        ]

    return [_row(key_a, vectors_a) for key_a, vectors_a in dict_clusters.items()]

def create_full_matrix(mx):
    """Mirror an upper-triangular distance table into a full symmetric matrix.

    Parameters
    ----------
    mx : list of list
        Square table whose entries on and above the diagonal are numbers and
        whose entries below the diagonal are placeholders (``None``), as
        produced by ``create_distance_matrix``. ``mx`` is not modified.

    Returns
    -------
    numpy.matrix
        float64 matrix with ``result[i, j] == result[j, i] == mx[min][max]``.

    Notes
    -----
    The table is converted with an explicit float dtype: letting numpy infer
    the dtype from a list that contains ``None`` produces an object-dtype
    matrix, which is not usable as a numeric array downstream.
    """
    # None placeholders become NaN here and are overwritten below.
    full = np.array(mx, dtype=float)
    if full.ndim == 2:
        # Copy every strictly-lower entry from its mirrored upper entry.
        rows, cols = np.tril_indices(full.shape[0], k=-1)
        full[rows, cols] = full[cols, rows]
    return np.matrix(full)

def calculate_cluster_distances(pd_clusters_, persona_embeddings, label_embeddings, lclusters=False):
    """Compute full cluster-distance matrices in persona space and label space.

    Parameters
    ----------
    pd_clusters_ : pandas.DataFrame
        With ``lclusters=False``: one row per cluster; the cell at column
        position 2 is used to index the embeddings (presumably the member
        persona indices — verify against the cluster table).
        With ``lclusters=True``: one row per persona with 'clusterId_label'
        and 'personaIndex' columns; clusters are ``0 .. max(clusterId_label)``.
    persona_embeddings, label_embeddings
        Indexable containers of vectors, indexed by the values described above.
    lclusters : bool, default False
        Selects which of the two table layouts is expected.

    Returns
    -------
    tuple
        ``(persona_distances, label_distances)``, each the output of
        ``create_full_matrix`` applied to ``create_distance_matrix``.
    """
    if lclusters:
        cluster_ids = pd_clusters_['clusterId_label']
        members = {
            cid: list(pd_clusters_[cluster_ids == cid]['personaIndex'])
            for cid in range(cluster_ids.max() + 1)
        }
    else:
        members = {pos: pd_clusters_.iloc[pos, 2] for pos in range(len(pd_clusters_))}

    # The same member indices select the vectors in both embedding spaces.
    personas_by_cluster = {cid: persona_embeddings[idx] for cid, idx in members.items()}
    labels_by_cluster = {cid: label_embeddings[idx] for cid, idx in members.items()}

    persona_distances = create_full_matrix(create_distance_matrix(personas_by_cluster))
    label_distances = create_full_matrix(create_distance_matrix(labels_by_cluster))

    return persona_distances, label_distances

In [None]:
# Timed run over all clusters. lclusters keeps its default (False), so each row
# of pd_clusters is treated as one cluster.
%time all_dists_personas_full, all_dists_labels_full = calculate_cluster_distances(pd_clusters, persona_embeddings, label_embeddings)

In [None]:
# Bundle both distance matrices and cache them in outdatadir so later sessions
# can skip the computation above.
all_dists_full = {'personas': all_dists_personas_full, 'labels': all_dists_labels_full}

with open(os.path.join(outdatadir,'all_dists_full.pkl'), 'wb') as f:
    pickle.dump(all_dists_full, f)

In [None]:
# Reload the cached distance matrices. pickle.load can execute arbitrary code
# from the file, so only load pickles written by this notebook.
with open(os.path.join(outdatadir,'all_dists_full.pkl'), 'rb') as f:
    all_dists_full = pickle.load(f)

# Rebind the two matrices from the loaded bundle.
all_dists_personas_full = all_dists_full['personas']
all_dists_labels_full = all_dists_full['labels']

In [None]:
# Sanity check: expected to be (n_clusters, n_clusters).
all_dists_personas_full.shape

#### Correlations Between Pairwise Distances in Persona- and Label Spaces

In [None]:
from sentence_transformers import util
from scipy import stats

In [None]:
# Reload the persona table with labels so this section can run on its own.
pd_personas = pd.read_pickle(os.path.join(outdatadir,'pd_personas_labels.pkl'))

##### Calculate 200k correlation coefficients (one per persona, between its pairwise similarities in label space and persona space)

In [None]:
# Preview; the next cell selects columns by position, so check the column order here.
pd_personas.head(2)

In [None]:
# Positional selection: the last column is taken as the labels and the third
# from last as the persona vectors.
# NOTE(review): presumably these are the 'labels' and 'embed' columns used by
# name elsewhere in this notebook — confirm against the column order, since a
# reordered or extended table would silently select different columns.
labels = pd_personas.iloc[:,-1]
personas = pd_personas.iloc[:,-3]

In [None]:
# Materialise each column as a plain list of per-row vectors, convert it to a
# float32 tensor, and drop the Series reference straight away to free memory.
tensor_labels = torch.tensor(list(labels)).float()
del labels

tensor_personas = torch.tensor(list(personas)).float()
del personas

In [None]:
# All-pairs cosine similarity in label space. Despite the 'dists_' name,
# util.cos_sim returns similarities, not distances.
# NOTE(review): the result is n x n; with the ~200k personas mentioned in this
# notebook's headings that is ~4e10 float entries — confirm it fits in memory.
%time dists_labels = util.cos_sim(tensor_labels, tensor_labels)
del tensor_labels

In [None]:
# All-pairs cosine similarity in persona-embedding space (similarities, not
# distances, despite the variable name); the result is n x n.
%time dists_personas = util.cos_sim(tensor_personas, tensor_personas)
del tensor_personas

In [None]:
# Sanity check: should be square and match dists_personas.shape.
dists_labels.shape

In [None]:
# Sanity check: should be square and match dists_labels.shape.
dists_personas.shape

In [None]:
def calc_coeff(func, dists_labels, dists_personas):
    """Apply a correlation test row by row to two aligned matrices.

    Parameters
    ----------
    func : callable
        Called as ``func(row_of_labels, row_of_personas)``; its result must
        expose ``.statistic`` and ``.pvalue`` attributes.
    dists_labels, dists_personas
        Row-indexable 2-D containers; the row count is taken from
        ``dists_labels.shape[0]``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns 'statistic', 'pvalue' and
        'direction' ('pos' when the statistic is strictly positive, otherwise
        'neg').
    """
    def _row_stats(row):
        outcome = func(dists_labels[row], dists_personas[row])
        return [outcome.statistic, outcome.pvalue]

    def _direction(value):
        # Anything that is not strictly positive is labelled 'neg'.
        return 'pos' if value > 0 else 'neg'

    pd_coeffs = pd.DataFrame(
        [_row_stats(row) for row in range(dists_labels.shape[0])],
        columns=['statistic', 'pvalue'],
    )
    pd_coeffs['direction'] = [_direction(value) for value in pd_coeffs['statistic']]
    return pd_coeffs

In [None]:
# One coefficient per row of the similarity matrices, for each of three
# correlation measures (Pearson, Spearman, Kendall's tau).
pd_pearson = calc_coeff(stats.pearsonr, dists_labels, dists_personas)
pd_spearman = calc_coeff(stats.spearmanr, dists_labels, dists_personas)
pd_kendall = calc_coeff(stats.kendalltau, dists_labels, dists_personas)

In [None]:
# Saving is disabled. The next cell reads these three files from outdatadir,
# so these lines must be run once whenever the files are missing.
# pd_pearson.to_pickle(os.path.join(outdatadir,'correlations_pearson.pkl'))
# pd_spearman.to_pickle(os.path.join(outdatadir,'correlations_spearman.pkl'))
# pd_kendall.to_pickle(os.path.join(outdatadir,'correlations_kendall.pkl'))

In [None]:
# Reload previously saved correlation tables. This rebinds (and so discards)
# any frames computed in this session, and it requires the pickle files to
# exist already because the to_pickle calls in the previous cell are commented out.
pd_pearson = pd.read_pickle(os.path.join(outdatadir,'correlations_pearson.pkl'))
pd_spearman = pd.read_pickle(os.path.join(outdatadir,'correlations_spearman.pkl'))
pd_kendall = pd.read_pickle(os.path.join(outdatadir,'correlations_kendall.pkl'))