In [None]:
# Notebook parameters. The values below are defaults; presumably they are overridden
# per run by whatever tool executes this notebook — TODO confirm.
year = 2024  # passed to the data loader and reported in the debug table
month = 11  # used together with `year`
program = "default"  # forwarded to the scaffolding helpers (data loading, participant lookup)
study_type = "study"  # not referenced by the visible cells of this notebook
mode_of_interest = None  # not referenced by the visible cells of this notebook
include_test_users = True  # forwarded to the scaffolding helpers alongside `program`
labels = {}  # not referenced by the visible cells (build_dictionaries uses a local of the same name)
use_imperial = True  # passed to scaffolding.get_units to pick the unit labels
survey_info = {}  # survey configuration; the cells below read its 'trip-labels' and 'surveys' keys

In [None]:
import collections
import urllib.request
import numpy as np
import pandas as pd
import xml.dom.minidom as minidom

import emcommon.survey.conditional_surveys as conditional_surveys

from plots import *
import scaffolding
import re

# NOTE(review): `sns` (and `plt`, used further down) are not imported explicitly in this
# notebook; presumably they arrive through `from plots import *` — confirm.
sns.set_style("whitegrid")
# NOTE(review): seaborn documents `set()` as an alias of `set_theme()`, whose default style is
# "darkgrid", so this call likely overrides the "whitegrid" style chosen above — confirm intent.
sns.set()
%matplotlib inline

# get metric vs imperial vars
# Only the tuple shape is visible here; the meaning of each element is defined by scaffolding.get_units.
label_units, short_label, label_units_lower, distance_col, weight_unit = scaffolding.get_units(use_imperial)

In [None]:
# Guard cell: this notebook only applies to survey (ENKETO) configurations.
# nbclient executes up to this cell and stops on the exception raised below.
if survey_info.get('trip-labels') != 'ENKETO':
    # Install the scaffolding handler in place of IPython's traceback printer
    # (presumably so only the message below is shown) before aborting.
    get_ipython()._showtraceback = scaffolding.no_traceback_handler
    raise Exception("The plots in this notebook are only relevant to deployments with trip-level surveys")

In [None]:
#input: list of survey names from the config
#output: dict mapping each survey name to the path of its xml form definition
#will run n surveys times
def get_sheet_links(survey_list):
    """Map each survey name to the location of its XML form definition.

    The location is derived from the survey's ``formPath`` in the notebook-level
    ``survey_info`` configuration by swapping the file extension for ``.xml``.

    Args:
        survey_list: iterable of survey names; each must be a key of
            ``survey_info['surveys']``.

    Returns:
        dict of ``{survey name: xml path}``.

    Raises:
        KeyError: if a survey name or its ``formPath`` is missing from the config.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import posixpath

    sheet_list = {}
    for name in survey_list:
        form_path = survey_info['surveys'][name]['formPath']
        #THIS ASSUMES THE FILENAME IS THE SAME AS THE FORM PATH BUT WITH xml FILE TYPE
        # splitext only looks for the extension in the last path component, so a
        # path without an extension gets '.xml' appended instead of having
        # everything after an earlier dot (e.g. in the host name) replaced.
        base_path, _extension = posixpath.splitext(form_path)
        sheet_list[name] = base_path + '.xml'

    return sheet_list

def _node_text(node):
    """Return the ``data`` of ``node``'s first child, or None when it has none."""
    return getattr(node.firstChild, "data", None)


#input: dict of survey name -> url of the survey's xml form definition
#output: two dictionaries to translate the ?s/ans (?s dict is nested per survey)
#will run n surveys times
def build_dictionaries(sheet_list):
    """Parse each survey's XML form into question and answer-option lookups.

    Args:
        sheet_list: dict of ``{survey name: url}`` as produced by get_sheet_links.

    Returns:
        ``(opt_dict, quest_dict)`` where
        * ``opt_dict`` maps an option's value (or name) to its display label and
          is shared by all surveys; the first definition of a key wins. Likert
          options are labelled ``"<label> (<value>)"``, or just the value when
          the label is ``"-"``.
        * ``quest_dict`` maps survey name to ``{question id: question label}``,
          the id being the last segment of the parent element's ``ref``.

    Labels or option values without leading text (for example an empty
    ``<label/>``) are skipped rather than aborting the whole parse.
    """
    opt_dict = {}
    quest_dict = {}

    for survey_name, url in sheet_list.items():
        # Close the response as soon as the document is parsed; minidom keeps
        # the whole tree in memory, so nothing is read from it afterwards.
        with urllib.request.urlopen(url) as result:
            doc = minidom.parse(result)

        #nested dictionaries to keep surveys grouped
        survey_questions = {}

        for label in doc.getElementsByTagName("label"):
            label_text = _node_text(label)
            if label_text is None:
                continue  # nothing to translate to
            parent = label.parentNode

            ref = parent.getAttribute("ref")
            #label appearance = not a question?
            #nodeName input -- to word cloud as a later addition?
            if ref and parent.getAttribute("appearance") != "label" and parent.nodeName != "input":
                survey_questions[str(ref.split('/')[-1])] = label_text

            if parent.nodeName != 'item':
                continue

            is_likert = parent.parentNode.getAttribute("appearance") == "likert"
            # Options are keyed by <value>; non-likert items fall back to <name>.
            key_nodes = parent.getElementsByTagName("value")
            if not is_likert and not key_nodes:
                key_nodes = parent.getElementsByTagName("name")
            if not key_nodes:
                continue

            key = _node_text(key_nodes[0])
            if key is None or key in opt_dict:
                continue  # unusable key, or an earlier definition already won

            if not is_likert:
                opt_dict[key] = label_text
            elif label_text == "-":
                #use labels if not a dash and numbers if it is a dash?
                opt_dict[key] = key
            else:
                opt_dict[key] = label_text + ' (' + key + ')'

        quest_dict[survey_name] = survey_questions

    return opt_dict, quest_dict


def get_response_to_normalize(row):
    """Return the value stored under the first key of the row's ``jsonDocResponse``.

    ``row.user_input['trip_user_input']['data']['jsonDocResponse']`` is read and
    the entry under its first key is handed back.
    """
    json_doc = row.user_input['trip_user_input']['data']['jsonDocResponse']
    first_key = [*json_doc.keys()][0]
    return json_doc[first_key]

#input: dataframe of trips that carry a survey response in user_input
#output: dataframe with one column per question and one row per response
#get_response_to_normalize is applied once per row (this could get big!)
def create_trip_survey_dataframe(df_trips_w_surveys):
    """Flatten the survey responses of the given trips into a wide dataframe.

    Each row's response (see get_response_to_normalize) is flattened with
    ``pd.json_normalize``; ``survey_name`` and ``user_id`` are copied over from
    the trips, and every column is renamed to the last dot-separated segment of
    its flattened name.
    """
    trips = df_trips_w_surveys.reset_index()

    #normalize the survey responses
    raw_responses = trips.apply(get_response_to_normalize, axis=1)
    flattened = pd.json_normalize(raw_responses)
    flattened['survey_name'] = trips['survey_name']
    flattened['user_id'] = trips['user_id']

    #keep only the leaf part of nested column names
    leaf_names = {column: column.split('.')[-1] for column in flattened.columns}
    return flattened.rename(columns=leaf_names)

def _translate_token(token, opt_dict):
    """Translate one answer token, raising KeyError when it has no translation."""
    try:
        number = float(token)
    except ValueError:
        number = None
    #workaround for the case where we had "5.0" and need "5"
    # Only whole numbers are normalised; "5.7" must not be read as option "5".
    if number is not None and number.is_integer():
        try:
            return opt_dict[str(int(number))]
        except KeyError:
            pass
    return opt_dict[token]


#input: one raw answer (several selected options are separated by spaces)
#output: readable label, with multiple selections joined by newlines
#runs once per answered row, with one lookup per selected option
def traslate_options(label, opt_dict):
    """Translate a raw survey answer into its display label(s).

    Args:
        label: raw answer; it is converted with ``str`` and split on single
            spaces, each piece being one selected option.
        opt_dict: mapping of option key to display label.

    Returns:
        The translated labels joined by ``"\\n"``. If any piece cannot be
        translated, ``label`` itself is returned unchanged (probably a row
        without an answer).
    """
    try:
        translated = [_translate_token(token, opt_dict) for token in str(label).split(" ")]
    except Exception:
        # Best-effort fallback; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return label #probably a row without an answer
    return "\n".join(translated)

#build the debug table that accompanies the charts
def generate_debug_df(program, include_test_users, full_df, labeled_df):
    """Assemble a one-column ("value") dataframe of debugging counts.

    The rows are the notebook-level ``year`` and ``month``, the number of
    participant uuids scaffolding returns for ``program``, the
    ``scaffolding.unique_users`` result for ``full_df`` and for ``labeled_df``,
    and the length of ``labeled_df``.
    """
    debug_values = {"year": year, "month": month}
    debug_values["Registered_participants"] = len(scaffolding.get_participant_uuids(program, include_test_users))
    debug_values["Participants_with_at_least_one_trip"] = scaffolding.unique_users(full_df)
    debug_values["Participant_with_at_least_one_labeled_trip"] = scaffolding.unique_users(labeled_df)
    debug_values["Trips_with_at_least_one_label"] = len(labeled_df)

    return pd.DataFrame.from_dict(debug_values, orient='index', columns=["value"])

#workaround missing colors
def get_survey_colors(labels, existing_map):
    """Build a color for every chart label, averaging colors of combined labels.

    Args:
        labels: chart labels; a label made of several options has them
            separated by ``"\\n"``.
        existing_map: mapping of single option label to a color sequence. It
            must contain an ``'Other'`` entry.

    Returns:
        dict of ``{label: [channel, ...]}`` holding the channel-wise mean of the
        colors of the label's parts, plus ``'Other'`` copied from
        ``existing_map``. Only the first three channels are kept because the
        running sum starts from a 3-tuple.

    Raises:
        KeyError: if ``existing_map`` has no ``'Other'`` entry.
    """
    fallback = (0.1, 0.2, 0.5)  # used for any part without a usable color

    color_map = {}
    for label in labels:
        parts = label.split("\n")
        total = (0, 0, 0)
        for part in parts:
            try:
                total = tuple(acc + channel for acc, channel in zip(total, existing_map[part]))
            except Exception:
                # Missing or malformed color: stay best-effort, but do not
                # swallow KeyboardInterrupt / SystemExit like a bare except.
                print("missing color")
                total = tuple(acc + channel for acc, channel in zip(total, fallback))

        # split() always yields at least one part, so the division is safe
        color_map[label] = [channel / len(parts) for channel in total]

    color_map['Other'] = existing_map['Other']

    return color_map

In [None]:
#list of all surveys that are not a "UserProfileSurvey"
# Filtering (rather than list.remove) keeps this cell working for a config that
# defines no 'UserProfileSurvey'; remove() would raise ValueError in that case.
survey_list = [name for name in survey_info['surveys'].keys() if name != 'UserProfileSurvey']
# Maps each remaining survey name to the path of its xml form definition.
sheet_list = get_sheet_links(survey_list)

print('survey sheets: ', sheet_list)

In [None]:
#load data - all data and data with labels
all_confirmed_trips, survey_trips, file_suffix = await scaffolding.load_viz_notebook_survey_data(year, month, program, include_test_users)

try:
    #survey counts df
    survey_trips = survey_trips.reset_index()
    survey_trips['survey_name'] = survey_trips.user_input.apply(lambda sr: sr['trip_user_input']['data']['name'])

    #gather the cols needed for charts and text
    survey_trips = survey_trips[['survey_name', 'user_id', 'user_input']]
    survey_trips.groupby('survey_name').count()
    
except:
    survey_trips = pd.DataFrame()

#format survey trips into responses dataframe
if len(survey_trips) > 0:
    df_responses = create_trip_survey_dataframe(survey_trips)
else:
    df_responses = pd.DataFrame(columns=survey_trips.columns)

In [None]:
#create translation dictionaries
# build_dictionaries opens and parses the xml form of every entry in sheet_list.
opt_dict, quest_dict = build_dictionaries(sheet_list)
# Print both dictionaries in full for inspection.
print("Questions dictionary:\n",quest_dict)
print("Options dictionary:\n", opt_dict)

# Colors derived from the answer options; the plotting cell hands this map to
# get_survey_colors as its `existing_map`.
color_map = scaffolding.mapping_color_surveys(opt_dict)

In [None]:
# #create the total dfs
# total_dfs = {}
# #for conditional surveys!
# wrapped_config = {'survey_info': survey_info}
# if 'buttons' in survey_info.keys():
#     all_confirmed_trips = all_confirmed_trips[all_confirmed_trips['ble_sensed_summary'].notna()]
#     all_confirmed_trips["confirmedMode_baseMode"] = all_confirmed_trips.ble_sensed_summary.apply(lambda md: max(md["distance"], key=md["distance"].get))
#     all_confirmed_trips['survey_name_prompted'] = all_confirmed_trips.apply(lambda row: conditional_surveys.survey_prompted_for_trip(row.to_dict(), wrapped_config), axis=1)
    
#     for survey_name in list(sheet_list.keys()):
#         if survey_name in all_confirmed_trips['survey_name_prompted'].unique():
#             total_dfs[survey_name] = all_confirmed_trips[all_confirmed_trips['survey_name_prompted'] == survey_name]
#         else:
#             #never prompted
#             total_dfs[survey_name] = pd.DataFrame()
# else:
#     survey_name = list(sheet_list.keys())[0] #there is only one if non-conditional
#     total_dfs[survey_name] = all_confirmed_trips

# for key in total_dfs.keys():
#     if len(total_dfs[key]) > 0:
#         print(key, ":", len(total_dfs[key]), "trips", total_dfs[key].user_id.nunique(), "users")
#     else:
#         print(key, ":", len(total_dfs[key]), "trips")

In [None]:
#merge any cols with the same name into 1 col -- should have different values in their survey_name col
#https://stackoverflow.com/questions/24390645/python-pandas-merge-samed-name-columns-in-a-dataframe
# sjoin: convert the non-null entries of x to str and join them with ';'
# (an all-null input therefore yields the empty string).
def sjoin(x): return ';'.join(x[x.notnull()].astype(str))
# Group the columns by name (level=0 along axis=1) and collapse every group row by row with sjoin.
# NOTE(review): pandas deprecated `DataFrame.groupby(axis=1)` in 2.1 — confirm the pandas version
# this notebook runs on before upgrading, and check the result on an empty df_responses.
df_responses = df_responses.groupby(level=0, axis=1).apply(lambda x: x.apply(sjoin, axis=1))

In [None]:
#create one plot per survey, one bar per question
for survey_name in quest_dict.keys():
    print("Charts for:", survey_name)
#     debug_df = generate_debug_df(program, include_test_users, total_dfs[survey_name], total_dfs[survey_name][total_dfs[survey_name]['user_input'] != {} if len(total_dfs[survey_name]) > 0 else pd.DataFrame()])
    debug_df = generate_debug_df(program, include_test_users, all_confirmed_trips, df_responses) #survey filtering still buggy, omitting all features of "num trips survey presented"

    survey_questions = quest_dict[survey_name]
    n_quests = len(survey_questions)
    filename = survey_name + file_suffix

    #temp quality text -- just based on the responses
    plot_title_no_quality = survey_name

    if n_quests == 0:
        # plt.subplots(nrows=0) raises ValueError, which used to abort the whole
        # cell (and every remaining survey). With no chartable question, emit
        # the same "missing" artifacts as a survey without usable data.
        generate_missing_plot(plot_title_no_quality, debug_df, filename)
        alt_text = store_alt_text_missing(debug_df, filename, plot_title_no_quality)
        alt_html = store_alt_html_missing(debug_df, filename, plot_title_no_quality)
        continue

    # squeeze=False always returns a 2-D array of Axes; without it a survey with
    # a single question gets a bare Axes object and `ax[i]` below fails.
    fig, ax = plt.subplots(nrows=n_quests, ncols=1, figsize=(15,2*n_quests), sharex=True, squeeze=False)
    ax = ax[:, 0]
    # One [alt text, html] slot per question, filled in by the plotting helper.
    text_results = [["Unmodified Alt Text", "Unmodified HTML"] for _ in range(n_quests)]

    try:
        qual_text = scaffolding.get_quality_text_numerator(df_responses[df_responses['survey_name'] == survey_name], include_test_users) if 'survey_name' in df_responses.columns else "No Responses"
        plot_title = plot_title_no_quality+'\n'+qual_text

        for i, (quest_name, quest_text) in enumerate(survey_questions.items()):
            # Filter first and copy afterwards so the assignments below act on an
            # independent frame.
            quest_frame = df_responses[df_responses['survey_name'] == survey_name].copy()
            # Assign the result back instead of `quest_frame[col].replace(..., inplace=True)`:
            # the chained in-place form is not guaranteed to modify quest_frame
            # (it is a no-op under pandas Copy-on-Write).
            quest_frame[quest_name] = quest_frame[quest_name].replace('', np.nan)
            quest_frame = quest_frame.dropna(subset=[quest_name]).copy()

            quest_frame[quest_name] = quest_frame[quest_name].apply(lambda x: traslate_options(x, opt_dict))

            # Count responses per answer; the 'start' column carries the counts.
            plot_df = quest_frame.groupby(['survey_name', quest_name]).count().reset_index()
            plot_df = plot_df[plot_df['survey_name'] == survey_name]
            plot_df = plot_df.set_index(quest_name)[['start']]
            plot_df = plot_df.reindex(opt_dict.values()).dropna() #ordered, as for likert

            #split question for axis label
            axis_list = quest_text.split()
            l = len(axis_list)//3
            axis_label = f'{" ".join(axis_list[0:l])}\n{" ".join(axis_list[l:2*l])}\n{" ".join(axis_list[2*l:])}'

            plot_and_text_stacked_bar_chart(plot_df, lambda df: df, axis_label, ax[i], text_results[i], get_survey_colors(list(plot_df.index.values), color_map), debug_df)

        set_title_and_save(fig, text_results, plot_title, filename)

    except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:
        # Expected when the responses lack a needed column: produce the
        # "missing" plot and its alt text/html instead of a chart.
        plt.clf()
        generate_missing_plot(plot_title_no_quality, debug_df, filename)
        alt_text = store_alt_text_missing(debug_df, filename, plot_title_no_quality)
        alt_html = store_alt_html_missing(debug_df, filename, plot_title_no_quality)
    except Exception as e:
        # Anything else is rendered as an error plot so later surveys still run.
        fig, ax = plt.subplots()
        plot_and_text_error(e, ax, filename)