In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import os.path
from os.path import join
import numpy as np
import imodelsx
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import data
import sys
# Load the single-site data via the project-local `data` module. The result is
# indexed by site name further down (e.g. files_dict['Atlanta']).
files_dict = data.load_files_dict_single_site()

### Filter data for single-site analysis

In [None]:
# Site whose table is analyzed in the rest of the notebook; the commented
# line below is the alternative site key.
k = 'Atlanta'
# k = 'Columbus'
df = files_dict[k]

# Position of the first column whose lower-cased name is exactly 'theme'.
lowered_names = np.array([str.lower(name) for name in df.columns.values])
theme_index = np.where(lowered_names == 'theme')[0][0]

# Response columns are those from position 4 up to (not including) the
# theme column.
col_vals = df.columns[4:theme_index]

# Split the table into the pieces used downstream: question text, the
# response columns, and everything from the theme column onward.
qs = df['Subcategory']
responses_df = df[col_vals]
themes_df = df[df.columns[theme_index:]]

In [None]:
def numbered_list(responses):
    """Format an iterable of strings as a 1-based numbered list.

    Each item is stripped of surrounding whitespace and rendered as
    ``"<n>. <item>"``; the items are joined with newlines. An empty
    iterable yields an empty string.
    """
    lines = []
    for position, text in enumerate(responses, start=1):
        lines.append(f'{position}. {text.strip()}')
    return '\n'.join(lines)


# Prompt template with two str.format placeholders: {question} and
# {response_list} (filled with the output of numbered_list). The template ends
# with "Theme 1:", presumably so the model's completion continues the list
# starting at the first theme.
# NOTE(review): editing this text changes the prompt sent to the model, so
# previously cached answers may no longer apply — confirm before changing.
themes_prompt = '''### You are given a question and a set of responses below.

**Question**: {question}

**Responses**:
{response_list}

### Group all responses into 2 or more non-overlapping themes.
### Return a comma-separated list, where each element is a theme, followed by the numbers of the responses that fall into that theme in brackets.
### **Example answer**: Theme 1: Negative responses [1, 2, 5], Theme 2: Positive responses [3, 4]

**Answer**: Theme 1:'''

# Callable used below as llm(prompt) to obtain the model's answer.
# NOTE(review): the meaning of repeat_delay is defined by imodelsx — confirm.
llm = imodelsx.llm.get_llm('gpt-4', repeat_delay=3)

**Run single example**

In [None]:
# question, responses, theme_dict = data.get_data_for_question_single_site(
#     question_num=2, qs=qs, responses_df=responses_df, themes_df=themes_df)

# resps = responses[pd.notna(responses)]
# prompt = themes_prompt.format(
#     question=question,
#     response_list=numbered_list(resps)
# )
# print(prompt)
# llm(prompt)

### Screen valid questions
A question is valid when it has more than three unique responses (compared case-insensitively).

In [17]:
def count_unique(resps):
    """Count distinct responses, ignoring case and punctuation.

    Parameters
    ----------
    resps : pd.Series of str
        Non-missing responses for a single question.

    Returns
    -------
    int
        Number of distinct responses after lower-casing and removing every
        character that is neither a word character nor whitespace.
    """
    resps_match = resps.apply(str.lower)
    # regex=True must be explicit: without it, newer pandas treats the pattern
    # as a literal string and no punctuation is removed. The raw string avoids
    # the invalid '\w' / '\s' escape sequences.
    resps_match = resps_match.str.replace(r'[^\w\s]', '', regex=True)
    return len(set(resps_match))


# Screen valid questions.
# A question is kept only when it has MORE than this many unique responses
# (as counted by count_unique).
MIN_UNIQUE_RESPONSES = 3

valid_question_nums = []
for question_num in tqdm(range(len(qs)), position=0):

    question, responses, theme_dict = data.get_data_for_question_single_site(
        question_num=question_num, qs=qs, responses_df=responses_df, themes_df=themes_df)
    # drop missing entries before counting
    resps = responses[pd.notna(responses)]

    if count_unique(resps) > MIN_UNIQUE_RESPONSES:
        valid_question_nums.append(question_num)

print('num valid qs', len(valid_question_nums), 'of', len(qs))

  resps_match = resps_match.str.replace('[^\w\s]', '')
100%|██████████| 46/46 [00:00<00:00, 2163.08it/s]

num valid qs 32 of 46





### Generate themes for each valid question

In [18]:
# Per-question accumulators: the raw LLM answer, and the responses that were
# shown to the model (re-labelled from 1).
themes_generated, resps_list = {}, {}

for question_num in tqdm(valid_question_nums, position=0):
    question, responses, theme_dict = data.get_data_for_question_single_site(
        question_num=question_num,
        qs=qs,
        responses_df=responses_df,
        themes_df=themes_df,
    )
    # Keep only the entries that are not missing.
    resps = responses[pd.notna(responses)]

    # Fill in the template and query the model.
    prompt = themes_prompt.format(
        response_list=numbered_list(resps), question=question)
    ans = llm(prompt)
    themes_generated[question_num] = ans

    # Re-label 1..n so the index matches the numbering used in the prompt.
    resps.index = np.arange(1, len(resps) + 1)
    resps_list[question_num] = resps

100%|██████████| 32/32 [00:00<00:00, 2480.32it/s]

cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!





In [19]:
def dprint(*args, f):
    """Print ``args`` to ``f`` exactly as the built-in ``print`` would.

    ``f`` is keyword-only and is passed straight through as ``print``'s
    ``file`` argument.
    """
    print(*args, file=f)


# Write one markdown section per valid question: the question text, its
# numbered responses, and the themes parsed from the LLM answer.
# The encoding is explicit so free-text responses with non-ASCII characters
# are written the same way on every platform.
with open('../figs/themes_generated.md', 'w', encoding='utf-8') as f:
    for question_num in valid_question_nums:
        dprint('### Question:', qs[question_num], f=f)
        dprint('\nResponses', f=f)
        for i, resp in enumerate(resps_list[question_num], start=1):
            dprint(f'{i}. {resp}', f=f)
        dprint('\nThemes', f=f)
        # Split the answer on the literal 'Theme' and trim the leftover
        # numbering/punctuation ("1:", ", ") from each fragment.
        fragments = (s.strip(' ,:1234567890')
                     for s in themes_generated[question_num].split('Theme'))
        # Drop empty fragments: if the answer itself starts with "Theme 1:",
        # the split yields a leading '' that would otherwise be printed as a
        # blank theme and shift the numbering of the real ones.
        themes = [s for s in fragments if s]
        for i, theme in enumerate(themes):
            dprint(f'- Theme {i + 1}:', theme, f=f)
        dprint('', f=f)

In [20]:
# Display the stored responses per question number (each Series is indexed from 1).
resps_list

{0: 1        Twin sons with epilepsy
 2                       Son once
 3                            Son
 4                  Son has had 2
 5      Daughter genetic syndrome
 6       Self and son with fevers
 7                 Self and child
 8     Daughter, febrile seizures
 9                       Daughter
 10                      Daughter
 Name: 0, dtype: object,
 1: 1                           Sons
 2                            Son
 3                            Son
 4                  Son has had 2
 5                       Daughter
 6       Self and son with fevers
 7                 Self and child
 8     Daughter, febrile seizures
 9                       Daughter
 10                      Daughter
 Name: 1, dtype: object,
 2: 1                                    Stressful, hard
 2                     Fell and had seizure once only
 3     Myoclonic seizures and meds don't work anymore
 4                                              Scary
 5                                  Constant 