In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import os.path
from os.path import join
import numpy as np
import imodelsx
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import data
# Load the per-site tables through the project-local `data` module.
# Later cells look up a single site's table with `files_dict[site]`.
files_dict = data.load_files_dict_single_site()

### Filter data for single-site analysis

In [None]:
# Choose which site's table to analyse (swap the comment to switch sites).
# site = 'Atlanta'
site = 'Columbus'
df = files_dict[site]

# Locate the first column whose name is 'theme' (case-insensitive).
lowercase_cols = np.array([str.lower(name) for name in df.columns.values])
theme_index = np.flatnonzero(lowercase_cols == 'theme')[0]

# Columns from position 4 up to (not including) the theme column are treated
# as the response columns.
# NOTE(review): presumably the first four columns are question metadata
# (e.g. 'Domain', 'Subcategory') -- confirm against the data files.
col_vals = df.columns[4:theme_index]

# Split the table into questions, responses, and theme annotations.
qs = df['Subcategory']
themes_df = df[df.columns[theme_index:]]
responses_df = df[col_vals]

### Run sentiment

In [41]:
# Prompt template sent to the LLM; `{question}` and `{response}` are filled in
# for every (question, response) pair.
sentiment_prompt = '''### You are given a question and a response. Rate the sentiment/supportiveness of the response on a scale of 1 to 5, where 1 is very negative and 5 is very positive. ###

Question: {question}

Response: {response}

Rating (1-5):'''

llm = imodelsx.llm.get_llm('gpt-4', repeat_delay=None)

# Maps question index -> list of raw LLM answers (np.nan where the response
# is missing), in the same order as the responses for that question.
sentiments = defaultdict(list)
num_questions = len(qs)
for question_num in tqdm(range(num_questions), position=0):
    # `theme_dict` is returned by the helper but not used in this cell.
    question, responses, theme_dict = data.get_data_for_question_single_site(
        question_num=question_num, qs=qs, responses_df=responses_df, themes_df=themes_df)

    for response in tqdm(responses.values, position=1):
        if pd.isna(response):
            # Keep a placeholder so positions still line up with the responses.
            rating = np.nan
        else:
            rating = llm(sentiment_prompt.format(
                question=question, response=response))
        sentiments[question_num].append(rating)

100%|██████████| 11/11 [00:00<00:00, 5687.54it/s]


cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!


100%|██████████| 11/11 [00:00<00:00, 8403.89it/s]


cached!
cached!
cached!
cached!


100%|██████████| 11/11 [00:00<00:00, 131445.42it/s]


cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
not cached


100%|██████████| 11/11 [00:00<00:00, 14.94it/s]
  9%|▊         | 4/46 [00:00<00:07,  5.29it/s]

cached!
cached!
not cached




not cached
not cached


100%|██████████| 11/11 [00:04<00:00,  2.70it/s]
 11%|█         | 5/46 [00:04<00:49,  1.21s/it]

cached!
cached!
cached!




not cached




not cached




not cached




cached!
not cached




not cached




not cached




not cached




cached!
not cached


100%|██████████| 11/11 [00:07<00:00,  1.50it/s]
 13%|█▎        | 6/46 [00:12<01:55,  2.88s/it]

not cached




not cached




not cached




not cached




cached!
not cached




not cached


### Plot sentiment

In [None]:
# Flatten {question index: [ratings, ...]} into one row per (question, rating).
sent_long = pd.DataFrame(
    [(question_idx, rating)
     for question_idx, ratings in sentiments.items()
     for rating in ratings],
    columns=['Question', 'Value'])

# Convert the raw ratings to numbers and round them. astype(float) keeps NaN
# (missing response) as NaN, so no mask is needed. Assigning the whole column
# avoids chained assignment (`frame['Value'][mask] = ...`), which triggers
# SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write.
sent_long['Value'] = sent_long['Value'].astype(float).round()

value_maps = {
    1: 'Very Negative',
    2: 'Negative',
    3: 'Neutral',
    4: 'Positive',
    5: 'Very Positive',
}
# Ratings that are NaN (or fall outside 1-5) map to None and are then
# labelled 'No response'.
sent_long['Value'] = sent_long['Value'].map(value_maps.get)
sent_long['Value'] = sent_long['Value'].fillna('No response')

# Count answers per (question, sentiment level). 'No response' sits in the
# middle so the stacked bars read negative -> positive from left to right.
levels = ['Very Negative', 'Negative', 'Neutral',
          'No response', 'Positive', 'Very Positive']
sent_df = (
    sent_long.groupby(['Question', 'Value']).size().unstack(fill_value=0)
    # fill_value=0: a level that never occurs becomes a zero column, not NaN
    .reindex(levels, axis=1, fill_value=0)
    .sort_values(by=levels, ascending=False)
)

# set colors: diverging palette for the four polar levels, greys for the
# neutral / no-response middle
colors = sns.diverging_palette(20, 220, n=6).as_hex()
colors = colors[:2] + ['#ddd', '#eee'] + colors[-2:]
fig, ax = plt.subplots(figsize=(5, 10))
sent_df.plot(kind='barh', stacked=True, color=colors, ax=ax)

# add yticklabels: one tick per plotted question (not a hard-coded count),
# labelled with the domain of that question's row in `df`
ax.set_yticks(range(len(sent_df)))
ax.set_yticklabels(df['Domain'].values[sent_df.index.values])

# move legend to top
ax.legend(bbox_to_anchor=(0.5, 1.1), loc='center', ncol=3, title='Sentiment')
ax.set_xlabel('Answer count')
ax.set_title(site)
fig.savefig(f'../figs/sentiment_example_{site}.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Show the Domain and Subcategory columns for every row, with row and
# column-width truncation disabled for this one display call.
display_opts = ('display.max_rows', None, 'display.max_colwidth', None)
with pd.option_context(*display_opts):
    display(df.loc[:, ['Domain', 'Subcategory']])