In [178]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

In [179]:
# Location of the prediction project, relative to the notebook's working directory.
workdir = '../personality-prediction-from-text'
# Folder expected to contain the prediction_<i>.xlsx workbooks read further down.
# NOTE(review): the variable is called results_t1 but the folder name ends in `_3` — confirm the naming.
results_t1 = os.path.join(workdir, 'data_output_dataset_trait_activating_answers_3')

In [180]:
# Every results directory to analyse; entry n (1-based) is written to the `sents_<n>` output folder.
# `results_t1` is the only results path defined in this notebook (the previous
# reference to `results_t3` raised NameError on a fresh kernel).
results_dirs = [results_t1]

In [181]:
# Sanity check: print the path of every expected prediction workbook
# (prediction_0.xlsx .. prediction_9.xlsx per results directory) that is missing.
expected_files = (
    os.path.join(r_dir, f'prediction_{i}.xlsx')
    for r_dir in results_dirs
    for i in range(10)
)
for x_path in expected_files:
    if os.path.exists(x_path):
        continue
    print(x_path)

In [182]:
from tqdm import tqdm

# all_df maps each results directory to a single DataFrame that stacks all of
# the prediction workbooks found in that directory.
all_df = {}
for r_dir in tqdm(results_dirs):
    # Up to ten shards per directory; shards that do not exist are skipped silently.
    shard_paths = [os.path.join(r_dir, f'prediction_{i}.xlsx') for i in range(10)]
    shards = [pd.read_excel(p) for p in shard_paths if os.path.exists(p)]

    merged = pd.concat(shards)
    # Derive the "ES" columns from the NEU predictions.
    # NOTE(review): the score is flipped with 5 - x while the class column is copied
    # unchanged — confirm both match the intended score range / label polarity.
    merged['pred_sES'] = merged['pred_sNEU'].apply(lambda score: 5 - score)
    merged['pred_cES'] = merged['pred_cNEU']
    all_df[r_dir] = merged

100%|██████████| 3/3 [02:20<00:00, 46.87s/it]


In [185]:
# Start from a clean slate: delete any output folder left over from a previous run
# (path is relative to the notebook's working directory).
!rm -rf output_trait_activating_answers

In [186]:
# Root folder for every artefact written by this notebook.
output_folder = 'output_trait_activating_answers'
os.makedirs(output_folder, exist_ok=True)

# One sub-folder per results directory, keyed by its 1-based position in results_dirs.
output_sent_folder = {
    nsent: os.path.join(output_folder, f'sents_{nsent}')
    for nsent in range(1, len(results_dirs) + 1)
}
for sent_dir in output_sent_folder.values():
    os.makedirs(sent_dir, exist_ok=True)

# Average Personality for each Model

In [187]:
traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
traits_ok = ['OPN', 'CON', 'EXT', 'AGR', 'ES']
categories = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']
model_names = ['gpt2','gpt2_medium','gpt2_large','op_125m','op_350m','op_13b', 'xlnet', 'xlnet_large']

# all_xvalues[n][k] -> {'mean': [...], 'std': [...]} where n is the 1-based index of
# the results directory, k the index into model_names, and each list holds one value
# per (category, trait) pair: the statistic of pred_s<trait> over the rows of that
# model whose 'category' equals the matching category.
all_xvalues = {}

for n, r_dir in enumerate(results_dirs, start=1):
    dir_df = all_df[r_dir]
    per_model = {}

    for k, model in enumerate(model_names):
        model_df = dir_df[dir_df['model_name'] == model]

        # Score arrays in the same order as categories / traits_ok.
        scores_by_trait = [
            model_df[model_df['category'] == cat]['pred_s' + trait].values
            for cat, trait in zip(categories, traits_ok)
        ]
        per_model[k] = {
            'mean': [np.mean(scores) for scores in scores_by_trait],
            'std': [np.std(scores) for scores in scores_by_trait],
        }

    all_xvalues[n] = per_model

In [188]:
# Tick labels for the grouped bar chart. Use the full trait abbreviations:
# taking only the first letter labels both 'EXT' and 'ES' as "E".
names = list(traits_ok)

In [189]:
# Grouped bar chart: one subplot per model, one group of bars per trait and one bar
# per results directory (mean score, with the standard deviation as error bar).
# The number of bars per group follows all_xvalues instead of being fixed at three,
# so the cell also works when fewer or more results directories were loaded.
sent_counts = sorted(all_xvalues)

# squeeze=False keeps `axs` indexable even when there is a single model.
fig, axs = plt.subplots(len(model_names), 1, sharex=False, sharey=False,
                        figsize=(15, 30), frameon=False, squeeze=False)
axs = axs[:, 0]

colors = ['g', 'b', 'k']
# 0.27 is kept for up to three bars per group; narrower bars avoid overlap beyond that.
width = min(0.27, 0.81 / max(len(sent_counts), 1))
ind = np.arange(len(names))

for k, model in enumerate(model_names):
    rects = []
    for pos, n in enumerate(sent_counts):
        vals = np.array(all_xvalues[n][k]['mean'])
        err = np.array(all_xvalues[n][k]['std'])
        # Cycle through the palette if there are more directories than colours.
        rect = axs[k].bar(ind + width * pos, vals, width, yerr=err,
                          color=colors[pos % len(colors)])
        rects.append(rect)

    axs[k].set_title(model)
    axs[k].set_ylabel('Scores')
    # Centre each tick under its group of bars (equals ind + width for three bars).
    axs[k].set_xticks(ind + width * (len(sent_counts) - 1) / 2)
    axs[k].set_xticklabels(names)
    axs[k].legend(
        [rect[0] for rect in rects],
        [f'{n} sent' if n == 1 else f'{n} sents' for n in sent_counts],
    )

plt.savefig(os.path.join(output_folder, 'personality_by_nsents.jpg'))

# Higher and Lower Scores Examples

In [190]:
traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
traits_ok = ['OPN', 'CON', 'EXT', 'AGR', 'ES']
categories = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']
model_names = ['gpt2','gpt2_medium','gpt2_large','op_125m','op_350m','op_13b', 'xlnet', 'xlnet_large']

# all_examples[n][k][trait] -> {'higher': DataFrame, 'lower': DataFrame}: the three
# highest- and three lowest-scoring rows ('text' and score columns only) for results
# directory n (1-based), model index k and the given trait.
all_examples = {}

for n, r_dir in enumerate(results_dirs, start=1):
    dir_df = all_df[r_dir]
    all_examples[n] = {}

    for k, model in enumerate(model_names):
        model_df = dir_df[dir_df['model_name'] == model]

        picked = {}
        for cat, trait in zip(categories, traits_ok):
            score_col = 'pred_s' + trait
            # Rank the rows of this category from highest to lowest score.
            ranked = model_df[model_df['category'] == cat].sort_values(by=score_col, ascending=False)
            ranked = ranked[['text', score_col]]

            picked[trait] = {'higher': ranked.iloc[:3], 'lower': ranked.iloc[-3:]}

        all_examples[n][k] = picked

In [191]:
# Write one workbook per (results directory, higher/lower, trait) listing the example
# texts of every model together with their score.
# Fixes over the previous version:
#   * the lookup used the undefined name `nsents` instead of the loop variable;
#   * 'higher' was hard-coded, so the *_lower.xlsx files repeated the higher examples;
#   * the directory indices are taken from all_examples instead of a fixed 1..3.
for nsent in sorted(all_examples):
    for case_name in ['higher', 'lower']:
        for trait in traits_ok:
            path = os.path.join(output_sent_folder[nsent], f"trait_{trait}_{case_name}.xlsx")
            rows = [
                {'score': row['pred_s' + trait], 'text': row['text'], 'model': m}
                for k, m in enumerate(model_names)
                for _, row in all_examples[nsent][k][trait][case_name].iterrows()
            ]
            pd.DataFrame(rows).to_excel(path)

# Histograms

In [192]:
import matplotlib.pyplot as plt
traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
traits_ok = ['OPN', 'CON', 'EXT', 'AGR', 'ES']
categories = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']

# For every results directory and model, draw one figure with a stacked histogram per
# category (distribution of pred_s<trait> over that category's rows) and save it as
# model_<name>.jpg in the directory's output folder.
for n, r_dir in enumerate(results_dirs, start=1):
    dir_df = all_df[r_dir]

    for model in model_names:
        model_df = dir_df[dir_df['model_name'] == model]

        fig = plt.figure(figsize=(10,20))

        for row, (cat, trait) in enumerate(zip(categories, traits_ok), start=1):
            ax = fig.add_subplot(len(categories), 1, row)
            scores = model_df[model_df['category'] == cat]['pred_s' + trait].values
            ax.hist(scores)
            ax.set_title(cat)
            ax.set_xlim([1,5])

        fig.savefig(os.path.join(output_sent_folder[n], f'model_{model}.jpg'))

Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR
Emotional stability ES
Openness OPN
Conscientiousness CON
Extraversion EXT
Agreeableness AGR


# Bar chart for each category

In [193]:
import matplotlib.pyplot as plt
import numpy as np

traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
traits_ok = ['OPN', 'CON', 'EXT', 'AGR', 'ES']
categories = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']

# For every results directory and model: one subplot per category, showing for each
# trait the average of the pred_c<trait> column over that category's rows
# (0 when the category has no rows). Saved as model_<name>_clf_vs_groups.jpg.
ind = np.arange(len(traits_ok))
for n, r_dir in enumerate(results_dirs, start=1):
    dir_df = all_df[r_dir]

    for model in model_names:
        model_df = dir_df[dir_df['model_name'] == model]

        fig, axs = plt.subplots(len(categories), 1, sharex=False, sharey=False, figsize=(15,20), frameon=False)
        fig.subplots_adjust(left=0.25)

        for ax, cat in zip(axs, categories):
            cat_df = model_df[model_df['category'] == cat]

            scores = []
            for trait in traits_ok:
                flags = cat_df['pred_c' + trait]
                scores.append(flags.sum() / len(flags) if len(flags) else 0)

            ax.bar(ind, scores)
            ax.set_title(cat)
            ax.set_xticks(ind)
            ax.set_xticklabels(traits_ok)
            ax.set_ylim([0,1])

        plt.savefig(os.path.join(output_sent_folder[n], f'model_{model}_clf_vs_groups.jpg'))

[0.971, 0.3576, 0.2402, 0.4164, 0.0722]
[0.9744, 0.375, 0.1482, 0.7192, 0.0344]
[0.9644, 0.2494, 0.1232, 0.61, 0.074]
[0.9696, 0.2628, 0.1228, 0.6154, 0.1054]
[0.9682, 0.3406, 0.2558, 0.559, 0.1132]
[0.9762, 0.3454, 0.2228, 0.418, 0.08]
[0.9678, 0.3616, 0.1454, 0.7084, 0.0386]
[0.965, 0.2368, 0.1386, 0.6228, 0.0926]
[0.9672, 0.267, 0.113, 0.6086, 0.1048]
[0.9712, 0.342, 0.2632, 0.545, 0.1128]
[0.9758, 0.349, 0.2398, 0.3998, 0.0768]
[0.9734, 0.378, 0.1358, 0.7178, 0.035]
[0.9696, 0.2374, 0.1326, 0.6184, 0.0744]
[0.967, 0.2856, 0.1138, 0.661, 0.0964]
[0.972, 0.3214, 0.2606, 0.5524, 0.1104]
[0.9656, 0.2248, 0.2594, 0.309, 0.143]
[0.9774, 0.2094, 0.211, 0.759, 0.027]
[0.9636, 0.0962, 0.089, 0.4846, 0.0998]
[0.9602, 0.1428, 0.07, 0.4864, 0.108]
[0.941, 0.3516, 0.3318, 0.3414, 0.175]
[0.9778, 0.3204, 0.274, 0.3718, 0.086]
[0.9722, 0.3426, 0.1772, 0.701, 0.0366]
[0.967, 0.2288, 0.1288, 0.601, 0.0882]
[0.9686, 0.2264, 0.1176, 0.611, 0.1112]
[0.965, 0.3126, 0.2916, 0.559, 0.127]
[0.9668, 0.2922

# Classification vs Starting words groups

In [194]:
import json

# Starting texts used to prompt the models; indexed by category name further down.
json_path = r"../data_input/trait_activating_questions.json"
# Decode explicitly as UTF-8 so the result does not depend on the machine's locale.
with open(json_path, encoding="utf-8") as file:
    data = json.load(file)

In [195]:
import matplotlib.pyplot as plt
import numpy as np

traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
traits_ok = ['OPN', 'CON', 'EXT', 'AGR', 'ES']
categories = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']

# For every results directory and model: one horizontal bar chart per category showing,
# for each starting text of that category, the average pred_c<trait> over the generated
# texts that begin with it. Saved as model_<name>_text.jpg.
for r_index, r_dir in enumerate(results_dirs):

    df = all_df[r_dir]

    for k, g in enumerate(model_names):

        gdf = df[df['model_name'] == g]

        fig, axs = plt.subplots(5, 1, sharex=False, sharey=False, figsize=(15,20), frameon=False)
        fig.subplots_adjust(left=0.25)
        for i, (cat, trait) in enumerate(zip(categories, traits_ok)):

            starting_texts = data[cat]
            cdf = gdf[gdf['category'] == cat]
            # Cast once per category so missing / non-string cells cannot break the prefix match.
            texts = cdf['text'].astype(str)

            scores = []
            for stext in starting_texts:
                hits = cdf[texts.str.startswith(stext)]['pred_c' + trait]
                # No generated text for this prompt: plot 0 instead of dividing by zero,
                # matching the empty-group handling of the per-category bar chart.
                scores.append(hits.sum() / len(hits) if len(hits) else 0)

            axs[i].barh(starting_texts, scores)

            axs[i].set_title(cat)

            # Keep the prompts in their original top-to-bottom order.
            axs[i].invert_yaxis()

            axs[i].set_xlim([0,1])

        plt.savefig(os.path.join(output_sent_folder[r_index+1], f'model_{g}_text.jpg'))

# Saving all results

In [196]:
!tar -czvf output_trait_activating_answers.tar.gz /petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers

tar: Removing leading `/' from member names
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/personality_by_nsents.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_3/
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_3/model_gpt2_text.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_3/model_op_350m_text.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_3/model_gpt2_clf_vs_groups.jpg
/petrobr/parceirosbr/bigoilict/u

/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/model_op_350m.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/trait_AGR_lower.xlsx
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/model_gpt2_medium_text.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/model_xlnet.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/model_gpt2_large_text.jpg
/petrobr/parceirosbr/bigoilict/users/cristian.villalobos/personality-prediction-from-text/notebooks/output_trait_activating_answers/sents_2/trait_OPN_lower.xlsx
/petrobr/parceirosbr/bigoilict/