In [None]:
import re
import os
import pandas as pd
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objs as go
from sentiment2d import Sentiment2D

# Create new `pandas` methods which use `tqdm` progress
# NOTE(review): none of the cells visible here call those progress-enabled
# methods -- confirm this registration is still needed.
tqdm.pandas()

def turn_to_utterance(df, speaker):
    '''
    Split one speaker's talk turns into sentence-level utterances.

    Each turn's text is cut into fragments that end either at a terminal
    punctuation mark (".", "!" or "?") or at the end of the line. Consecutive
    fragments are then concatenated until one ends in terminal punctuation, so a
    sentence left unfinished at the end of one turn is continued by the same
    speaker's next turn. An ellipsis ("...") is not treated as terminal
    punctuation.

    :param df: Data Frame with one talk turn per row and columns [speaker, line]
    :param speaker: value of the `speaker` column whose turns should be parsed
    :return: Data Frame with columns [utterance, sequence, speaker], where
             `sequence` is the index label (in `df`) of the turn that supplied
             the final fragment of the utterance
    '''
    END_OF_UTTERANCE_PUNCTUATION = ".!?"
    ELLIPSIS = "..."
    # Stand-in for an ellipsis while splitting, so its dots are not mistaken for
    # sentence ends. NOTE: a literal "XXX" in the text is turned into "..." too.
    PLACEHOLDER = "XXX"
    end_of_line = "$"
    regexp = re.compile(
        f'([^{END_OF_UTTERANCE_PUNCTUATION}]+({end_of_line}|[{END_OF_UTTERANCE_PUNCTUATION}]))')

    def mask_ellipsis(line):
        # Missing cells (NaN/None) and empty strings are treated as a blank line
        # instead of raising on `.replace`.
        if not isinstance(line, str) or not line:
            return ' '
        return line.replace(ELLIPSIS, PLACEHOLDER)

    # keep only the target speaker's turns; the index label is the turn sequence
    tdf = df[df['speaker'].isin([speaker])]

    frags = []
    for sequence, line in zip(tdf.index, tdf['line']):
        text = mask_ellipsis(line).replace('"', ' ')
        # findall returns one tuple per match; element 0 is the whole fragment
        frags.extend((m[0], sequence) for m in regexp.findall(text))

    # join any true fragments into full utterances
    utterances = []
    utt = ''
    last_sequence = None
    for f, s in frags:
        utt += ' ' + f
        last_sequence = s
        if f[-1] in END_OF_UTTERANCE_PUNCTUATION:
            utterances.append((utt.replace(PLACEHOLDER, ELLIPSIS), s, speaker))
            utt = ''

    # Keep trailing text that never reached terminal punctuation (e.g. a final
    # turn ending in "...") rather than silently dropping it.
    if utt.strip():
        utterances.append((utt.replace(PLACEHOLDER, ELLIPSIS), last_sequence, speaker))

    return pd.DataFrame(utterances, columns=['utterance', 'sequence', 'speaker'])


def session_sentiment_figure(df, annotations=None, labels=('Valence', 'Arousal'), 
                             title='', legend_x=0.48, legend_y=0.90):
    '''
    Scatter plot of per-utterance 2D sentiment, one marker per utterance.

    This figure is inspired by Russell's original work 
    --- A Circumplex Model of Affect ---

    Markers are coloured by speaker and sized by each utterance's share of the
    total word count. A bold "x" per speaker marks that speaker's word-count
    weighted mean position.

    :param df: Data Frame with one utterance per row and columns [speaker, utterance]
               plus the two columns named in `labels`
    :param annotations: optional dict mapping a text label to its (x, y) position
    :param labels: names of the columns plotted on the x and y axes
    :param title: figure title
    :param legend_x: horizontal legend position passed to the layout
    :param legend_y: vertical legend position passed to the layout
    :return: the plotly figure
    '''
    FONT_COLOR = '#666666'
    FILL_COLOR = 'rgb(.7,.7,.7)'
    COLORS = ['rgba(62,148,204,127)', 'rgba(213,134,58,127)', 'rgba(126,154,102,127)']
    AXIS_RANGE = [-1.05, 1.05]
    TICKVALS = [-1, -0.5, 0.0, 0.5, 1]
    
    utt_av = df.copy()
    speakers = utt_av.speaker.unique().tolist()
    utt_av['length'] = utt_av['utterance'].apply(lambda u: len(u.split()))
    utt_av['weight'] = utt_av['length'] / utt_av['length'].sum()
    
    # wrap long utterances so the hover box stays narrow (plotly uses <br> for line breaks)
    utt_av['utterance'] = utt_av['utterance'].str.wrap(30).str.replace('\n', '<br>', regex=False)
    
    fig = px.scatter(utt_av,
                     x=labels[0], y=labels[1],
                     title=title,
                     hover_data=['utterance'],
                     size='weight',
                     color='speaker',
                     color_discrete_sequence=COLORS,
                     opacity=0.5)
    
    for i, spk in enumerate(speakers):
        idx = utt_av.speaker == spk
        spk_weight = utt_av.loc[idx, 'weight']
        spk_total = spk_weight.sum()
        if not spk_total > 0:
            # no words for this speaker (or undefined weights): no meaningful mean
            continue
        # 'weight' is normalised over ALL speakers, so divide by this speaker's
        # own total to get a true weighted mean rather than a shrunken sum.
        x_mn = (utt_av.loc[idx, labels[0]] * spk_weight).sum() / spk_total
        y_mn = (utt_av.loc[idx, labels[1]] * spk_weight).sum() / spk_total
        fig.add_annotation(x=x_mn, y=y_mn, text='<b>x</b>', showarrow=False, xanchor='center',
                           font=dict(family="Arial Black", size=30, color=COLORS[i % len(COLORS)]))

    if annotations:
        for w, coord in annotations.items():
            # anchor labels near the left/right edges so their text grows inward
            align = 'left' if coord[0] < -0.5 else 'right' if coord[0] > 0.5 else 'center'
            fig.add_annotation(x=coord[0], y=coord[1], text=w, showarrow=False, xanchor=align,
                               font=dict(size=15, color='rgba(0,0,0,.6)'))
        
    fig.update_yaxes(range=AXIS_RANGE,
                     showgrid=False,
                     zerolinecolor='rgba(.2,.2,.2,.2)',
                     tickfont=dict(size=18, color=FONT_COLOR),
                     title_font=dict(size=24, color=FONT_COLOR),
                     tickvals=TICKVALS)
    
    # Tie the x scale to the y axis for a 1:1 aspect ratio; anchoring an axis to
    # itself (the previous 'x') does not constrain anything.
    fig.update_xaxes(scaleanchor='y', 
                     scaleratio=1,
                     range=AXIS_RANGE,
                     showgrid=False,
                     zerolinecolor='rgba(.2,.2,.2,.2)',
                     tickfont=dict(size=18, color=FONT_COLOR),
                     title_font=dict(size=24, color=FONT_COLOR),
                     tickvals=TICKVALS)
    
    # faint backdrop covering the plotted range
    fig.add_shape(type='rect', xref='x', yref='y', fillcolor=FILL_COLOR,
                  x0=AXIS_RANGE[0], y0=AXIS_RANGE[0], x1=AXIS_RANGE[1], y1=AXIS_RANGE[1], 
                  line_color=FILL_COLOR, opacity=0.1)
    
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', showlegend=True, height=700, width=700, font=dict(size=16), 
                      legend=dict(yanchor='top', y=legend_y, xanchor='center', x=legend_x, 
                                  bgcolor='rgba(1,1,1,0)', title='', font=dict(size=15)))
    
    return fig

# Set TOKENIZERS_PARALLELISM for this process.
# NOTE(review): presumably read by the tokenizer library behind Sentiment2D -- confirm.
os.environ.update(TOKENIZERS_PARALLELISM="true")

## Parse the raw transcript into utterances

Loads the raw therapy transcript into a dataframe where each row is a talk turn, then splits each speaker's turns into sentence-level utterances. The transcript is adapted from Carl Rogers' publicly available [therapy session transcripts](https://anamartinspsicoterapiaacp.files.wordpress.com/2016/04/brodley-transcripts-of-carl-rogers-therapy-sessions.pdf).


In [None]:
# load the turn-taking dataframe
turn_df = pd.read_csv('./carl_and_gloria.csv')

# parse each speaker's turns into utterances
utterance_df = pd.concat([turn_to_utterance(turn_df, spk) for spk in ('Therapist', 'Patient')])

# Interleave the speakers back into transcript order. Utterances that come from
# the same turn share one 'sequence' value, so a stable sort is needed to keep
# them in the order they were spoken (the default sort is not stable).
utterance_df = utterance_df.sort_values('sequence', kind='mergesort').reset_index(drop=True)
utterance_df.head(10)

## Run the sentiment model on utterances
NOTE: The first time you run this, the large language model will need to be downloaded. It is then cached locally, so subsequent runs should be faster. However, depending on your system (e.g., no GPU acceleration), computing sentiment may still take a few minutes.

In [None]:
# add sentiment scores (may take a while!)
s2d = Sentiment2D()
valence, arousal = [], []
# score one utterance at a time; tqdm (already imported) reports progress and an ETA
for utterance in tqdm(utterance_df['utterance'], desc='Scoring utterances'):
    v, a = s2d(utterance)
    valence.append(v)
    arousal.append(a)
    
# If running on a GPU, the following may be more efficient:
#valence, arousal = s2d(utterance_df.utterance.to_list())

# new frame with the scores attached; utterance_df itself is left untouched
df = utterance_df.assign(Valence=valence, Arousal=arousal)

df.head()

## Display the 2D sentiment scores

In [None]:
# Score a short "I am feeling <word>" sentence for each anchor word; the
# resulting 2D sentiment coordinates are used to annotate the plot.
anchor_words = ['excited', 'satisfied', 'relaxed', 'bored', 'depressed', 
                'annoyed', 'grounded', 'furious', 'sleepy', 'disgusted', 
                'upset', 'content', 'aroused', 'numb']
anchor_sentences = [f'I am feeling {word}' for word in anchor_words]
v, a = s2d(anchor_sentences)

# map each anchor word to its (valence, arousal) pair
annotations = dict(zip(anchor_words, zip(v, a)))

In [None]:
# Build the session figure, overriding the default legend position
fig = session_sentiment_figure(df, annotations=annotations, legend_x=0.3, legend_y=1.01)

# Save a static PNG next to the notebook
fig.write_image('./sentiment.png', scale=1.5)
# To save an interactive version of the plot (note: this will be a large file!):
#fig.write_html("./sentiment.html")
fig.show()