### Imports, constants and functions

In [None]:
import os, sys
import pandas as pd
import numpy as np
import json
import re
from scipy import stats
from typing import List
from sentiment3d import Sentiment3D
from utils import load_wan_ratings, get_reliable_words, sentiment_from_logits, df_corr, get_corr, get_stats, separate_utterances
from plotting import plotly_setting, plot_heatmap, plot_combined_distributions, plot_timeseries

# Apply the project's Plotly configuration (defined in plotting.plotly_setting).
plotly_setting()

# Raise pandas display limits so long tables and long text cells are not truncated.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 1000)

# Root directory for data, figures, and tables.
BASEDIR = './'
# os.path.join avoids the doubled separator ('.//figures') that plain string
# interpolation produced when BASEDIR already ends with a slash.
FIGDIR = os.path.join(BASEDIR, 'figures')
os.makedirs(FIGDIR, exist_ok=True)
print(f'BASEDIR for data, figures, and tables: {BASEDIR}')

In [None]:
# Names of the three sentiment dimensions, and the matching "<dim>_std" names.
SENTCOLS = ["valence", "arousal", "confidence"]
STDCOLS = [f"{c}_std" for c in SENTCOLS]

# Raw column name -> display label. Dict order also defines the column order
# of the frames returned by map_cols().
# NOTE: the literal previously listed "valence"/"arousal"/"confidence" twice
# ("... VAC" first, then the bare labels). Python keeps the last value for a
# repeated key, so the "... VAC" labels never took effect; only the effective
# entries are kept here.
COLMAP = {
    "valence_nrc": "Valence NRC",
    "arousal_nrc": "Arousal NRC",
    "confidence_nrc": "Confidence NRC",
    "valence_warriner": "Valence Warr",
    "arousal_warriner": "Arousal Warr",
    "confidence_warriner": "Confidence Warr",
    "valence_anew": "Valence ANEW",
    "arousal_anew": "Arousal ANEW",
    "confidence_anew": "Confidence ANEW",
    "valence": "Valence",
    "arousal": "Arousal",
    "confidence": "Confidence",
}


def map_cols(df):
    """Return a copy of *df* restricted to known columns, with display labels.

    Every occurrence of "dominance" in a column name is first replaced by
    "confidence". Columns whose (replaced) name is a key of COLMAP are then
    selected, in COLMAP order, and renamed to the corresponding label. All
    other columns are dropped.

    The input frame is left untouched: the rename is applied to a shallow
    copy, so the caller's column labels are not rewritten as a side effect.

    Args:
        df: DataFrame with string column names.

    Returns:
        A new DataFrame containing only the mapped columns.
    """
    # Shallow copy: shares the data but owns its axes, so assigning new
    # column labels below does not leak back into the caller's frame.
    out = df.copy(deep=False)
    out.columns = [c.replace("dominance", "confidence") for c in out.columns]
    keep = [c for c in COLMAP if c in out.columns]
    return out[keep].rename(columns=COLMAP)

## Load human rating data

In [None]:
# Load the human word ratings via the project helper utils.load_wan_ratings.
# The cells below read a `source` column and pivot on `word`/`source`, so the
# frame is expected to be in long format with those columns.
hdf = load_wan_ratings()
# Preview the first two rows.
hdf.head(2)

In [None]:
# get_reliable_words (project helper) returns a ratings frame plus a second
# value kept in `extras`.
# NOTE(review): `extras` is not used anywhere in this notebook — confirm what it holds.
reldf, extras = get_reliable_words()
# Preview the first two rows.
reldf.head(2)

## Create subsets of rating data

* wdf is the full set of NRC/Warriner ratings in wide format
* rwdf is the set of reliable words used for training
* swdf is the smaller set of ratings where NRC, Warriner and ANEW overlap

In [None]:
# Wide table of every rating source except ANEW: one row per word, one column
# per (measure, source) pair, flattened to "<measure>_<source>" names.
wdf = hdf.loc[hdf.source != "anew"].pivot(index="word", columns="source").copy()
wdf.columns = ["_".join(pair) for pair in wdf.columns]
# Drop the std columns, then keep only words with a value in every remaining column.
wdf = wdf.loc[:, [name for name in wdf.columns if "std" not in name]].dropna()
# Select the known columns and switch to display labels.
wdf = map_cols(wdf)

In [None]:
# Reliable-word table: drop the std columns and switch to display labels.
rwdf = map_cols(
    reldf.loc[:, [name for name in reldf.columns if "std" not in name]].copy()
)

# Wide table over all sources (ANEW included); the dropna keeps only words
# that have a value in every remaining column.
swdf = hdf.pivot(index="word", columns="source").copy()
swdf.columns = ["_".join(pair) for pair in swdf.columns]
swdf = swdf.loc[:, [name for name in swdf.columns if "std" not in name]].dropna()
swdf = map_cols(swdf)

## Load sentiment model

In [None]:
# Load the JSON spec that is later passed as `model` to sentiment_from_logits.
# JSON text is UTF-8, so pin the encoding instead of relying on the
# platform/locale default used by open().
with open("anchor_spec.json", encoding="utf-8") as fp:
    model = json.load(fp)

In [None]:
# Display the loaded spec as the cell output for inspection.
model

# Run model on human data

## Heatmap for all words

In [None]:
# Labels handed to get_stats.
# NOTE(review): presumably the human-rating sources compared against VAC —
# confirm against utils.get_stats.
cols = ["NRC", "Warr"]

# Model output for every word in the NRC/Warriner table, labelled "<Dim> VAC".
sentdf, anchors = sentiment_from_logits(model, wdf.index)
sentdf.columns = [c.capitalize() + " VAC" for c in sentdf.columns]
tmpdf = wdf.join(sentdf)

# Order columns dimension by dimension, with sources in a fixed order.
tmpdf = tmpdf[
    [
        f"{dim} {src}"
        for dim in ("Valence", "Arousal", "Confidence")
        for src in ("NRC", "Warr", "VAC")
    ]
]
# A bare `map_cols(tmpdf)` call used to sit here. Its return value was
# discarded and the columns already carry display labels, so it had no effect
# and was removed.

# NOTE: the `n` unpacked here is overwritten a few lines below.
stat_rdf, stat_pdf, n, all_stats = get_stats(tmpdf, cols)

r, p, ndf = df_corr(tmpdf)
# Smallest value in ndf, used in the output file name.
# NOTE(review): presumably the minimum pairwise sample size — confirm in utils.df_corr.
n = np.nanmin(ndf)
fig = plot_heatmap(r, font_sz=16, size=(700, 700))
fig.write_image(f"{FIGDIR}/vac_heatmap_wn_{n}.svg")
fig

## Heatmap for 1023 WAN words

In [None]:
# Model output for the words in the all-sources table, labelled "<Dim> VAC".
sentdf, anchors = sentiment_from_logits(model, swdf.index)
sentdf.columns = [c.capitalize() + " VAC" for c in sentdf.columns]
tmpdf = swdf.join(sentdf)

# Order columns dimension by dimension, with sources in a fixed order.
tmpdf = tmpdf[
    [
        f"{dim} {src}"
        for dim in ("Valence", "Arousal", "Confidence")
        for src in ("NRC", "Warr", "ANEW", "VAC")
    ]
]
# A bare `map_cols(tmpdf)` call used to sit here. Its return value was
# discarded and the columns already carry display labels, so it had no effect
# and was removed.

r, p, ndf = df_corr(tmpdf)
# Smallest value in ndf, used in the output file name.
# NOTE(review): presumably the minimum pairwise sample size — confirm in utils.df_corr.
n = np.nanmin(ndf)

fig = plot_heatmap(r, font_sz=16, size=(800, 850), blocksize=4)
fig.write_image(f"{FIGDIR}/vac_heatmap_wan_{n}.svg")
fig

## Carl Rogers and Gloria therapy session

In [None]:
# Read the tab-separated session file; the first column becomes the index.
carl_gloria_df = pd.read_csv(
    "data/carl_and_gloria.csv",
    sep="\t",
    index_col=0,
)

In [None]:
# Preview the first rows of the raw session table.
carl_gloria_df.head()

In [None]:
# Build the utterance table with the project helper utils.separate_utterances.
# Later cells read its `speaker` and `utterance` columns.
utterance_df = separate_utterances(carl_gloria_df)
# Preview the first rows.
utterance_df.head()

In [None]:
# Instantiate the project's Sentiment3D model with its default arguments.
s3 = Sentiment3D()
# Score all utterances in one call, passing them as a plain Python list.
# NOTE(review): despite the name, the result is later fed to pd.json_normalize
# and merged row by row, so it is presumably one record per utterance — confirm.
sentiment_dict = s3(utterance_df["utterance"].to_list())

In [None]:
# Flatten the model output into a table, one column per (nested) key.
utt_res = pd.json_normalize(sentiment_dict, max_level=2)
# Attach the scores to the utterances by index; validate="1:1" makes pandas
# raise if either side has duplicate index values.
# NOTE(review): this assumes utterance_df has the same 0..n-1 index that
# json_normalize produces — confirm against separate_utterances.
utterance_df = pd.merge(
    utterance_df,
    utt_res,
    left_index=True,
    right_index=True,
    validate="1:1",
)

In [None]:
# Preview the utterance table after the merge with the sentiment scores.
utterance_df.head()

In [None]:
# Split the per-utterance scores by speaker, renumbering rows from 0.
therapist_df = utterance_df.loc[utterance_df['speaker'] == 'Therapist'].reset_index(drop=True)
patient_df = utterance_df.loc[utterance_df['speaker'] == 'Patient'].reset_index(drop=True)

# Plot both speakers' distributions for each sentiment dimension and save as SVG.
fig = plot_combined_distributions(
    sample1=therapist_df,
    sample2=patient_df,
    xlabels=['valence', 'arousal', 'confidence'],
    group_labels=['Therapist', 'Patient'],
)
fig.write_image(f"{FIGDIR}/carl_roger_combined_distributions.svg")
fig.show()

In [None]:
# Collapse each run of consecutive utterances by the same speaker into one row.
# The group key increases by one every time the speaker differs from the
# previous row, so it labels runs rather than speakers. Within a run the text
# is joined with spaces and the three sentiment scores are averaged.
utterance_gr = (
    utterance_df.groupby(
        utterance_df["speaker"].ne(utterance_df["speaker"].shift()).cumsum()
    )
    .agg(
        {
            "speaker": "first",
            "utterance": " ".join,
            **dict.fromkeys(["valence", "arousal", "confidence"], "mean"),
        }
    )
    .reset_index(drop=True)
)

In [None]:
# Split the grouped rows by speaker; each frame is renumbered from 0 so its
# index counts that speaker's rows in order.
therapist_df = utterance_gr.loc[utterance_gr["speaker"] == "Therapist"].reset_index(drop=True)
patient_df = utterance_gr.loc[utterance_gr["speaker"] == "Patient"].reset_index(drop=True)

In [None]:
# Span of the exponentially weighted moving average used for smoothing.
span_n = 3

# One [x, y] pair per sentiment dimension: x is the row index, y is the
# EWM-smoothed score for that dimension.
samples1 = [
    [therapist_df.index, therapist_df[dim].ewm(span=span_n).mean()]
    for dim in ("valence", "arousal", "confidence")
]
samples2 = [
    [patient_df.index, patient_df[dim].ewm(span=span_n).mean()]
    for dim in ("valence", "arousal", "confidence")
]

In [None]:
# Plot the smoothed series for both speakers and save the figure as SVG.
fig = plot_timeseries(
    samples1,
    samples2,
    xlabels=["valence", "arousal", "confidence"],
    group_labels=["Therapist", "Patient"],
)
fig.write_image(f"{FIGDIR}/carl_roger_combined_timeseries.svg")
fig