# Setup
This template analyzes the effectiveness of shaping **one** synthetic personality trait of a model at a time (i.e., without shaping instructions for any other traits). Use `multidimensional_trait_shaping_analysis.ipynb` to see how well a given model can simulate control of multiple traits at once.

1. Specify your model's full results pickle file, JSON `admin_session`, and identifier (model pointer), below.
2. If you'd like to save the test scores for further analysis, specify a `SAVE_SCORES_FILENAME`.
3. Run this notebook from the `personality_in_llms/analysis` directory.

In [None]:
# Path to the directory containing the `psyborgs` package.
# This default should work if you've cloned the repo and run the notebook
# from its `analysis` directory.
PATH = "../" 

# Path to the R psychometric utilities script (no need to change).
PSYCHOMETRIC_UTILS_PATH = PATH + "psyborgs/psychometric_utils.R"

# Path of the pickled results to be analyzed.
PKL_PATH = "../results/" + "your_results_here.pkl"

# Path of the JSON admin_session file used to score the results.
ADMIN_SESSION_PATH = "../admin_sessions/" + \
    "ablation01_ind_big5_9lvls_50desc_admin_session_rating.json"

# Save the scored session to disk? Leave as False to skip saving, or set it
# to an output path (string): the value is used both as the on/off switch
# and as the filename passed to `to_pickle`.
SAVE_SCORES_FILENAME = False

## Load Dependencies

In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append(PATH)

from psyborgs import score_calculation, survey_bench_lib

import matplotlib.pyplot as plt

# dependencies for descriptive statistics
import itertools
from typing import Union, List

# dependencies for R code
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter

# dependencies for correlation analysis
from scipy.stats import spearmanr, pearsonr


In [None]:
# Column names grouped under the name SPID.
# NOTE(review): presumably the columns that jointly identify one simulated
# participant; not referenced elsewhere in the visible notebook -- confirm.
SPID = ['item_preamble_id',
        'item_postamble_id',
        'response_scale_id',
        'response_choice_postamble_id',
        'model_id']

# Scale IDs per instrument. Only IPIP_SCALE_IDS is used by the cells visible
# in this notebook; it selects the IPIP-NEO-300 score columns.
BFI_SCALE_IDS = ["BFI-EXT", "BFI-AGR", "BFI-CON", "BFI-NEU", "BFI-OPE"]
IPIP_SCALE_IDS = ["IPIP300-EXT", "IPIP300-AGR", "IPIP300-CON", "IPIP300-NEU", "IPIP300-OPE"]
VALIDATION_SCALE_IDS = ["PA", "NA", "CSE", "CPI", "PHYS", "VRBL", "ANGR", "HSTL", "ACHV", "CONF", "SCRT"]

## Unpickle Raw Results

In [None]:
# Load the raw model responses from the pickle at PKL_PATH.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a source you trust.
df_raw_response_scores = pd.read_pickle(PKL_PATH)

In [None]:
# Preview the first five rows of the raw responses.
df_raw_response_scores.head(5)

In [None]:
# Spot check: select the rows for a single item (`ipip1`) under one specific
# item preamble and item postamble, then display them.
test_df = df_raw_response_scores.query(
    "item_postamble_id == 'plk-ipip-0' & item_preamble_id == 'ext0-agr2-con0-neu0-ope0-d36-ev2' & item_id == 'ipip1'"
)

test_df

## Load Admin Session


In [None]:
# Load the admin_session object from the JSON file at ADMIN_SESSION_PATH.
admin_session = survey_bench_lib.load_admin_session(
    ADMIN_SESSION_PATH)

# Score Session

In [None]:
# adapt df to match a df with scores for possible continuations:
# every row gets a constant `score` of 1, and `response_value` is the model
# output cast to int (the cast raises if an output is not integer-like).
df_raw_response_scores['score'] = 1
df_raw_response_scores['response_value'] = df_raw_response_scores['model_output'].astype('int')

In [None]:
# Score the session from the admin_session definition and the adapted raw
# responses. Later cells read `item_preamble_id` and one column per scale ID
# (e.g. IPIP_SCALE_IDS) from the result.
scored_session_df = score_calculation.score_session(
    admin_session, df_raw_response_scores)

# preview the first five scored rows
scored_session_df.head(5)

In [None]:
# optional: save scores to disk
# (skipped while SAVE_SCORES_FILENAME is falsy; otherwise its value is the
# output path of the pickle)
if SAVE_SCORES_FILENAME:
    scored_session_df.to_pickle(SAVE_SCORES_FILENAME)

# Descriptives

In [None]:
# Show floats with two decimal places in DataFrame output from here on.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
def get_domain_fragments(big5_id, levels=range(1,10)):
    """Build the preamble ID fragments for a single Big Five domain.

    Each fragment is the domain prefix immediately followed by one level,
    e.g. ``"ext"`` with levels 1 and 2 yields ``["ext1", "ext2"]``.
    """
    fragments = []
    for level in levels:
        fragments.append("{}{}".format(big5_id, level))
    return fragments


def get_big5_lvl_fragments(levels=range(1,10)):
    """Build the preamble ID fragments for every Big Five domain.

    Fragments are grouped by domain in the order ext, agr, con, neu, ope,
    and within a domain follow the order of `levels`.
    """
    domain_prefixes = ("ext", "agr", "con", "neu", "ope")
    return [
        fragment
        for prefix in domain_prefixes
        for fragment in get_domain_fragments(prefix, levels)
    ]


def subset_one_preamble(df, id_fragment):
    """Return the IPIP scale columns of rows whose preamble ID matches.

    A row matches when its `item_preamble_id` contains `id_fragment`
    (pandas `str.contains`, which treats the fragment as a regular
    expression by default).
    """
    is_match = df["item_preamble_id"].str.contains(id_fragment)
    matching_rows = df[is_match]
    return matching_rows[IPIP_SCALE_IDS]


def subset_by_preambles(df, id_fragments):
    """Stack the per-fragment subsets of `df` into one keyed DataFrame.

    The result has an extra outer index level holding the fragment that
    selected each row, so rows can later be grouped by fragment.
    """
    per_fragment = [
        subset_one_preamble(df, fragment) for fragment in id_fragments
    ]
    return pd.concat(per_fragment, keys=id_fragments)


def describe_by_preambles(
    df, id_fragments,
    by: Union[str, List[str]]=['median', 'min', 'max', 'std']):
    """Summarize IPIP scale scores separately for each preamble fragment.

    Rows are first organized by the fragment that matched their preamble ID,
    then grouped on that fragment (outer index level) and aggregated with
    the summary statistic(s) named in `by`.
    """
    return (
        subset_by_preambles(df, id_fragments)
        .groupby(level=0)
        .agg(by)
    )

### IPIP-NEO-300

In [None]:
# Overall descriptive statistics of the IPIP-NEO-300 domain scores,
# rounded to two decimals.
scored_session_df[IPIP_SCALE_IDS].describe().round(2)

In [None]:
# One histogram per IPIP-NEO-300 domain score, on a shared y-axis.
# `DataFrame.hist` creates its own figure (sized by `figsize`), so take that
# figure afterwards instead of opening an empty one up front -- the latter
# renders a blank figure and never applies the tight layout to the plots.
scored_session_df[IPIP_SCALE_IDS] \
    .hist(range=[1,5], alpha=1, figsize=(10, 7.5), sharey=True)
fig = plt.gcf()
fig.tight_layout()

plt.show()

## Descriptives by Prompted Personality (In Item Preamble)

### Descriptives of Extremely Low vs. Extremely High Prompts for Each Domain

In [None]:
# Compare the lowest (1) and highest (9) prompted level of each domain.
# Levels run from 1 to 9 in this notebook, so 9 -- not 7 -- is the
# "extremely high" end.
big5_domain_lvls = get_big5_lvl_fragments(levels=[1, 9])
describe_by_preambles(scored_session_df, big5_domain_lvls)

# Quick Validity Check
We don't have criterion measures in this run to test for criterion validity, but we can look at the inter-scale correlations of the IPIP-NEO-300.

In [None]:
def calculate_pvalues(df):
    """Return the matrix of pairwise Pearson correlation p-values.

    For each pair of columns, rows with a missing value in either column are
    dropped before `scipy.stats.pearsonr` is run. p-values are rounded to
    four decimal places.

    Args:
        df: DataFrame whose columns are the numeric variables to compare.

    Returns:
        Square DataFrame of p-values whose index and columns are both
        `df.columns`, in the original column order.
    """
    columns = df.columns
    pvalues = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for r in columns:
        for c in columns:
            tmp = df[df[r].notnull() & df[c].notnull()]
            # Write with a single `.loc` call: the chained form
            # `pvalues[r][c] = ...` may assign to a temporary object and
            # leave `pvalues` itself unchanged.
            pvalues.loc[c, r] = round(pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues

## IPIP-NEO-300 Intercorrelations
EXT should correlate moderately and negatively with NEU.

In [None]:
# Correlation matrix of the IPIP-NEO-300 domain scores
# (`DataFrame.corr` default method).
scored_session_df[IPIP_SCALE_IDS].corr()

In [None]:
# p Values
# (Pearson p-values for the same scale pairs as the correlation matrix)
calculate_pvalues(scored_session_df[IPIP_SCALE_IDS])

## IPIP-NEO-300 Intercorrelations Across Preamble Prompts

In [None]:
# Intercorrelations among rows whose preamble ID contains 'con1'.
(
    scored_session_df
    .query("item_preamble_id.str.contains('con1')")
    [IPIP_SCALE_IDS]
    .corr()
)

In [None]:
# Intercorrelations among rows whose preamble ID contains 'con9'.
(
    scored_session_df
    .query("item_preamble_id.str.contains('con9')")
    [IPIP_SCALE_IDS]
    .corr()
)

# R Analysis

## Reliability Functionalized

In [None]:
def launch_r_instance(psychometric_utils_path: str) -> None:
    """Start the embedded R session and publish its helpers as globals.

    Sources the R script at `psychometric_utils_path`, imports the R package
    `tidyjson`, and binds three module-level names for later use:
    `r` (the R interface), `tidyjson_r` (the imported package) and
    `subscale_reliability_r` (the R function `subscale_reliability` looked
    up in R's global environment after sourcing the script).
    """
    global r, tidyjson_r, subscale_reliability_r

    # R interface + project R utilities
    r = robjects.r
    r['source'](psychometric_utils_path)

    # R package used to read the admin_session JSON
    tidyjson_r = importr('tidyjson')

    # R function made available by the sourced script
    subscale_reliability_r = robjects.globalenv['subscale_reliability']


def load_r_scored_session(
    scored_session_df: pd.DataFrame) -> "robjects.vectors.DataFrame":
    """Convert the scored-session pandas DataFrame into its R counterpart.

    The conversion runs under rpy2's default converter extended with the
    pandas converter, so the returned object is the rpy2/R representation of
    the data frame rather than a pandas DataFrame.

    Args:
        scored_session_df: scored session as a pandas DataFrame.

    Returns:
        The converted R object, ready to be passed to R functions.
    """
    with localconverter(robjects.default_converter + pandas2ri.converter):
        scored_session_df_r = robjects.conversion.py2rpy(scored_session_df)

    return scored_session_df_r

def compute_reliability_indices_per_scale(admin_session, admin_session_r, scored_session_df_r, **kwargs):
    """Estimate alpha, omega and G6 for every scale of every measure.

    Walks `admin_session.measures` and, for each scale, asks the R helper
    `subscale_reliability_r` for Cronbach's Alpha, McDonald's Omega and
    Guttman's Lambda 6. An index that cannot be estimated is reported on
    stdout and recorded as NaN instead of aborting the run. Extra keyword
    arguments are accepted but not used.

    Returns:
        DataFrame with one row per (measure, scale) and the columns
        `measure_id`, `scale_id`, `alpha`, `omega`, `g6`.
    """
    # names understood by the R helper, in output-column order
    index_names = ("alpha", "omega", "G6")

    rows = []
    for measure_id, measure in admin_session.measures.items():
        for scale_id in measure.scales:
            estimates = []
            for index_name in index_names:
                try:
                    estimate = subscale_reliability_r(
                        admin_session_r, scored_session_df_r,
                        measure_id, scale_id, index_name)[0]
                except Exception as e:
                    print(f"An error occurred while calculating {index_name} for measure {measure_id} and scale {scale_id}: {e}")
                    estimate = np.nan
                estimates.append(estimate)

            rows.append([measure_id, scale_id, *estimates])

    return pd.DataFrame(
        rows, columns=['measure_id', 'scale_id', 'alpha', 'omega', 'g6'])

def run_reliability_analysis_in_r(psychometric_utils_path: str,
                                  scored_session_df: pd.DataFrame,
                                  admin_session_json_path: str) -> pd.DataFrame:
    """Run the full reliability analysis and return per-scale estimates.

    Launches R, reads the admin_session JSON into R, converts the scored
    session for R, loads the admin_session on the Python side as well, and
    finally computes the reliability indices for every scale.
    """
    launch_r_instance(psychometric_utils_path)

    # R-side inputs
    r_admin_session = tidyjson_r.read_json(admin_session_json_path)
    r_scored_session = load_r_scored_session(scored_session_df)

    # Python-side admin_session, used to enumerate measures and scales
    py_admin_session = survey_bench_lib.load_admin_session(
        admin_session_json_path)

    return compute_reliability_indices_per_scale(
        py_admin_session, r_admin_session, r_scored_session)

## Compute Reliability Estimates

In the correlation analysis further below, stronger correlations between the ordinal intended trait levels and the observed psychometric test scores indicate more successful shaping.

In [None]:
# Compute the reliability estimates (alpha, omega, G6) for every scale in
# the admin_session and display the resulting table.
run_reliability_analysis_in_r(
    psychometric_utils_path=PSYCHOMETRIC_UTILS_PATH,
    scored_session_df=scored_session_df,
    admin_session_json_path=ADMIN_SESSION_PATH
)

## Compute Correlations

In [None]:
# Names of the intended-level columns, one per Big Five domain, in the same
# domain order as IPIP_SCALE_IDS.
LVL_IDS = ["lvl-EXT", "lvl-AGR", "lvl-CON", "lvl-NEU", "lvl-OPE"]

def main(scores_df):
    """Correlate intended trait levels with observed test scores.

    The first five columns of `scores_df` are treated as intended-level
    variables and are paired, by position, with the columns from the sixth
    onward (the observed scores). Put the level columns first: for each
    pair, rows where the level is not greater than 0 (i.e., where that
    trait was not shaped) are dropped before correlating.

    Args:
        scores_df: DataFrame with five level columns followed by the
            matching score columns in the same order.

    Returns:
        DataFrame with one row per (level, score) pair and the columns
        `Variable1`, `Variable2`, `spearman`, `spearman_p`, `pearson`,
        `pearson_p`.
    """
    correlation_data = []

    level_columns = scores_df.columns[:5]
    score_columns = scores_df.columns[5:]

    for variable1, variable2 in zip(level_columns, score_columns):
        # keep only observations where this trait was actually shaped
        subset = scores_df[scores_df[variable1] > 0]

        spearman, spearman_p = spearmanr(subset[variable1], subset[variable2])
        pearson, pearson_p = pearsonr(subset[variable1], subset[variable2])

        correlation_data.append({
            "Variable1": variable1,
            "Variable2": variable2,
            "spearman": spearman,
            "spearman_p": spearman_p,
            "pearson": pearson,
            "pearson_p": pearson_p,
        })

    return pd.DataFrame(correlation_data)

def add_ordinal_levels(df):
    """Add integer intended-level columns parsed from `item_preamble_id`.

    Each preamble ID is split on '-' into seven parts: one part per Big Five
    domain (a three-character domain prefix followed by the level digits),
    then a description ID and an instruction ID. The domain parts are
    reduced to their integer level.

    Args:
        df: scored session containing `item_preamble_id` and the
            IPIP_SCALE_IDS columns.

    Returns:
        A new DataFrame (the input is not modified) with
        `item_preamble_id`, the IPIP scale columns, the integer LVL_IDS
        columns, `description_id` and `instruction_id`.
    """
    # work on an explicit copy so the new columns are never written into a
    # slice of the caller's DataFrame
    scores_df = df[["item_preamble_id"] + IPIP_SCALE_IDS].copy()

    id_part_columns = LVL_IDS + ["description_id", "instruction_id"]
    scores_df[id_part_columns] = scores_df["item_preamble_id"].str.split('-', expand=True)

    # replace the values of each lvl- column with only the digits they
    # contain; plain column assignment lets the column take the int dtype
    for lvl_column in LVL_IDS:
        scores_df[lvl_column] = scores_df[lvl_column].str[3:].astype(int)

    return scores_df

In [None]:
# Scored session reduced to the preamble ID and IPIP scores, plus the
# intended-level columns parsed from the preamble ID.
df_subset = add_ordinal_levels(scored_session_df)

In [None]:
# Inspect one row to check the parsed level columns.
df_subset.head(1)

In [None]:
# Correlate each intended level with its IPIP score. Column order matters:
# the five level columns must come first, followed by the score columns.
main(df_subset[LVL_IDS + IPIP_SCALE_IDS])

In [None]:
# subset Spearman correlations only
# (`display` is the notebook's rich-output helper)
display(main(df_subset[LVL_IDS + IPIP_SCALE_IDS])[["spearman"]])