In [None]:
# Notebook parameters.
# NOTE(review): these look like defaults that an external runner overrides per deployment — confirm.
# Reporting period: passed to scaffolding.get_time_query and scaffolding.get_file_suffix further down.
year = 2024
month = 4
# Program name: used when filtering trips and when building the output file suffix.
program = "default"
# Not referenced by any code visible in this notebook.
study_type = "study"
# Only referenced from commented-out quality-text code in the plotting cell.
mode_of_interest = 'e-bike'
# Forwarded to scaffolding.filter_composite_trips when the trips are loaded.
include_test_users = False
# Not referenced by any code visible in this notebook.
dynamic_labels = {}
# Forwarded to scaffolding.get_units to pick the unit labels.
use_imperial = False
# Survey configuration; the cells below read its 'trip-labels', 'surveys' and (optionally) 'buttons' keys.
survey_info =  {}

In [None]:
from collections import defaultdict
import urllib.request
import numpy as np
import pandas as pd
from xml.dom import minidom

from emcommon.survey import conditional_surveys

from plots import *
import scaffolding
import re

# NOTE(review): `sns` is not imported explicitly here; presumably it arrives via `from plots import *`.
# NOTE(review): if `sns` is seaborn, sns.set() applies the default theme (style "darkgrid") and so
# presumably replaces the "whitegrid" style chosen on the line before it — confirm which one is intended.
sns.set_style("whitegrid")
sns.set()
%matplotlib inline

# get metric vs imperial vars
# (five unit-related values returned by scaffolding.get_units for the use_imperial parameter)
label_units, short_label, label_units_lower, distance_col, weight_unit = scaffolding.get_units(use_imperial)

In [None]:
# Do not run this notebook at all unless it is for a survey configuration; nbclient will run up through this cell
# .get() is used so that a configuration without a 'trip-labels' entry (including the default,
# empty survey_info) stops here with the explanatory message instead of an unrelated KeyError.
if survey_info.get('trip-labels') != 'ENKETO':
    ipython = get_ipython()
    # Install the scaffolding handler before raising; presumably it suppresses the traceback
    # so that only the message below is shown — verify against scaffolding.
    ipython._showtraceback = scaffolding.no_traceback_handler
    raise Exception("The plots in this notebook are only relevant to deployments with trip-level surveys")

In [None]:
def get_sheet_links(survey_list):
    """Map each survey name to the location of its XML form definition.

    For every name in ``survey_list`` the ``formPath`` configured under
    ``survey_info['surveys'][name]`` is read, and whatever follows its last
    '.' is replaced by ``xml``. This assumes the XML file has the same name as
    the configured form path, only with the xml file type.

    input: list of survey names from the config
    output: dict of survey name -> path of the XML form
    Runs once per survey.
    """
    def as_xml_path(form_path):
        # Keep everything before the final '.', then append the xml extension.
        stem_parts = form_path.split('.')[:-1]
        return '.'.join(stem_parts + ['xml'])

    return {
        name: as_xml_path(survey_info['surveys'][name]['formPath'])
        for name in survey_list
    }

def _first_child_text(node):
    """Return the ``data`` of ``node``'s first child, or None when it has no text child."""
    return getattr(node.firstChild, "data", None)


#input: dict of survey name -> url of the survey's XML form
#output: two dictionaries to translate the ?s/ans (?s dict is nested per survey)
#will run n surveys times
def build_dictionaries(sheet_list):
    """Build the answer and question translation dictionaries from the survey XML forms.

    Parameters:
        sheet_list: dict mapping survey name -> URL of that survey's XML form.

    Returns:
        (opt_dict, quest_dict) where
        - opt_dict maps an answer value (or name) to its display text; it is shared
          by all surveys and the first definition seen for a key wins. Likert items
          map to their own value.
        - quest_dict maps survey name -> {question name -> question text}, the
          question name being the last '/'-separated segment of the ``ref`` attribute.

    Labels or values without text content are skipped instead of aborting the run.
    Errors from fetching or parsing a form are propagated.
    """
    opt_dict = {}
    quest_dict = {}

    for survey_name, url in sheet_list.items():
        # Close the response as soon as the document is parsed.
        # NOTE(review): minidom is not hardened against malicious XML; the URLs are
        # assumed to come from a trusted deployment config — confirm.
        with urllib.request.urlopen(url) as result:
            doc = minidom.parse(result)

        #nested dictionaries to keep surveys grouped
        survey_questions = {}

        for label in doc.getElementsByTagName("label"):
            parent = label.parentNode
            label_text = _first_child_text(label)

            # A label whose parent carries a ref is a question, unless the parent is
            # only a "label" row or a free-text input.
            ref = parent.getAttribute("ref")
            if ref and parent.getAttribute("appearance") != "label" and parent.nodeName != "input":
                if label_text is not None:
                    survey_questions[str(ref.split('/')[-1])] = label_text

            if parent.nodeName != 'item':
                continue

            values = parent.getElementsByTagName("value")
            names = parent.getElementsByTagName("name")
            if parent.parentNode.getAttribute("appearance") == "likert":
                # Likert answers are displayed as their raw value.
                key = _first_child_text(values[0]) if values else None
                display = key
            elif values:
                key = _first_child_text(values[0])
                display = label_text
            elif names:
                key = _first_child_text(names[0])
                display = label_text
            else:
                continue

            if key is None or display is None:
                continue
            # First definition wins, as several surveys may share option values.
            opt_dict.setdefault(key, display)

        quest_dict[survey_name] = survey_questions

    return opt_dict, quest_dict


def create_dataframe(df_trips_w_surveys):
    """Flatten the survey responses attached to trips into one wide dataframe.

    input: dataframe containing all trips that have non-blank user_input
           (columns used: user_input, user_id, survey_name)
    output: dataframe with questions in the columns and answers in the rows,
            plus the user_id and survey_name of each response; an empty
            dataframe when there are no trips

    Nested answers are flattened and each column keeps only the last
    '.'-separated part of its flattened name. Every response contributes one
    row, so the loop runs once per survey response (this could get big!).
    """
    trips = df_trips_w_surveys.reset_index()
    if len(trips) == 0:
        return pd.DataFrame()

    response_rows = []
    for user_input, user_id, survey_name in zip(trips['user_input'], trips['user_id'], trips['survey_name']):
        doc_response = user_input['trip_user_input']['data']['jsonDocResponse']
        # The answers sit under the first key of the response document.
        form_key = list(doc_response.keys())[0]
        flat = pd.json_normalize(doc_response[form_key])
        flat['user_id'] = user_id
        flat['survey_name'] = survey_name
        response_rows.append(flat)

    responses = pd.concat(response_rows)
    short_names = {column: column.split('.')[-1] for column in responses.columns}
    return responses.rename(columns=short_names)

def _translate_token(token, opt_dict):
    """Translate one answer token; raises KeyError when it has no translation."""
    try:
        number = float(token)
        # workaround for the case where we had "5.0" and need "5"; only whole
        # numbers are normalised so that e.g. "5.7" is not reported as option "5"
        if number.is_integer():
            return opt_dict[str(int(number))]
    except (ValueError, OverflowError, KeyError):
        pass  # not numeric, or the normalised key is unknown: use the raw token
    return opt_dict[token]


#input: one raw answer; a multi-select answer holds several options separated by spaces
#output: translated to readable text, with multiples handled (one option per line)
#runs once per response row when applied to a column
def traslate_options(label, opt_dict):
    """Translate a raw survey answer into its display text.

    Parameters:
        label: the raw answer; converted with ``str`` and split on single spaces.
        opt_dict: dict mapping answer values to display text.

    Returns:
        The translated options joined by newlines. If any option cannot be
        translated, ``label`` is returned unchanged (probably a row without an answer).
    """
    try:
        tokens = str(label).split(" ")
        return "\n".join(_translate_token(token, opt_dict) for token in tokens)
    except Exception:
        return label #probably a row without an answer

#create a debug dataframe
def generate_debug_df(program, include_test_users, full_df, labeled_df):
    """Summarise participation counts in a one-column ("value") dataframe.

    The rows are the notebook-level year and month, the number of registered
    participants reported by scaffolding.get_participant_uuids, the
    scaffolding.unique_users result for ``full_df`` and for ``labeled_df``,
    and the length of ``labeled_df``.
    """
    stats = {}
    stats["year"] = year
    stats["month"] = month
    stats["Registered_participants"] = len(scaffolding.get_participant_uuids(program, include_test_users))
    stats["Participants_with_at_least_one_trip"] = scaffolding.unique_users(full_df)
    stats["Participant_with_at_least_one_labeled_trip"] = scaffolding.unique_users(labeled_df)
    stats["Trips_with_at_least_one_label"] = len(labeled_df)

    # One row per statistic, keyed by its name.
    return pd.DataFrame.from_dict(stats, orient='index', columns=["value"])

In [None]:
#list of all surveys that are not a "UserProfileSurvey"
# Filtering (rather than list.remove) keeps this cell working for configurations
# that do not define a 'UserProfileSurvey' at all.
survey_list = [name for name in survey_info['surveys'] if name != 'UserProfileSurvey']
sheet_list = get_sheet_links(survey_list)

print('survey sheets: ', sheet_list)

In [None]:
# Load the confirmed trips for the reporting period, then keep only the ones that
# pass the program / test-user filter.
tq = scaffolding.get_time_query(year, month)
all_confirmed_trips = scaffolding.filter_composite_trips(
    scaffolding.load_all_confirmed_trips(tq), program, include_test_users
)

if len(all_confirmed_trips) == 0:
    survey_trips = pd.DataFrame()
else:
    # Drop trips whose user_input is blank, then renumber the rows.
    has_user_input = all_confirmed_trips['user_input'] != {}
    survey_trips = all_confirmed_trips[has_user_input].reset_index()

    # The survey that produced each response is stored inside the response itself.
    survey_trips['survey_name'] = survey_trips['user_input'].apply(
        lambda user_input: user_input['trip_user_input']['data']['name']
    )

    # Only these columns are needed for the charts and text.
    survey_trips = survey_trips[['survey_name', 'user_id', 'user_input']]

In [None]:
#create translation dictionaries
# opt_dict: answer value -> display text, shared by all surveys
# quest_dict: survey name -> {question name -> question text}
opt_dict, quest_dict = build_dictionaries(sheet_list)
print("Questions dictionary:\n",quest_dict)
print("Options dictionary:\n", opt_dict)

#color dictionary
from collections import OrderedDict
def mapping_color_surveys(dic_options):
    """Give every distinct value of ``dic_options`` a colour from the tab10 palette.

    Values are taken in first-seen order, duplicates counted once. The palette
    has ten entries, so it wraps around: the 11th distinct value reuses the
    colour of the first one.

    Returns a dict of value -> colour.
    """
    distinct_values = OrderedDict.fromkeys(dic_options.values())
    return {
        value: plt.cm.tab10.colors[position % 10]
        for position, value in enumerate(distinct_values)
    }

# One colour per distinct option display text; handed to the stacked bar chart helper in the plotting cell.
color_map = mapping_color_surveys(opt_dict)

print("\n", color_map)

#format survey trips into responses dataframe
df_responses = create_dataframe(survey_trips)
# Suffix appended to each question name to form the filename of its chart.
file_suffix = scaffolding.get_file_suffix(year, month, program)

In [None]:
#create the total dfs
# total_dfs: survey name -> the trips for which that survey was (or would have been) prompted
total_dfs = {}
#for conditional surveys!
wrapped_config = {'survey_info': survey_info}
if 'buttons' in survey_info:
    # The trips loaded above are held in all_confirmed_trips; there is no
    # all_composite_trips variable in this notebook.
    if len(all_confirmed_trips) > 0:
        all_confirmed_trips['survey_name_prompted'] = all_confirmed_trips.apply(
            lambda row: conditional_surveys.survey_prompted_for_trip(row.to_dict(), wrapped_config),
            axis=1,
        )
        prompted_names = set(all_confirmed_trips['survey_name_prompted'].unique())
    else:
        # No trips at all: every survey counts as never prompted.
        prompted_names = set()

    for survey_name in sheet_list:
        if survey_name in prompted_names:
            total_dfs[survey_name] = all_confirmed_trips[all_confirmed_trips['survey_name_prompted'] == survey_name]
        else:
            #never prompted
            total_dfs[survey_name] = pd.DataFrame()
else:
    survey_name = list(sheet_list.keys())[0] #there is only one if non-conditional
    total_dfs[survey_name] = all_confirmed_trips

for key, survey_trips_total in total_dfs.items():
    # Empty placeholder frames have no user_id column, so report 0 users for them.
    user_count = survey_trips_total['user_id'].nunique() if 'user_id' in survey_trips_total else 0
    print(key, ":", len(survey_trips_total), "trips", user_count, "users")

In [None]:
#merge any cols with the same name into 1 col -- should have different values in their survey_name col
#https://stackoverflow.com/questions/24390645/python-pandas-merge-samed-name-columns-in-a-dataframe
def sjoin(x):
    """Join the non-null entries of ``x``, converted to strings, with ';'."""
    present = x[x.notnull()]
    return ';'.join(present.astype(str))

def _merge_same_named_columns(same_named):
    """Collapse one group of identically named columns into a single column, row by row."""
    return same_named.apply(sjoin, axis=1)

# NOTE(review): grouping along axis=1 is deprecated in recent pandas releases — confirm the pinned version.
df_responses = df_responses.groupby(level=0, axis=1).apply(_merge_same_named_columns)

In [None]:
#create one plot per question in the survey
for survey_name in quest_dict.keys():
    print("Charts for:", survey_name)
    # TODO: build this with generate_debug_df(...) from total_dfs[survey_name]; empty placeholder for now
    debug_df = pd.DataFrame()

    for col in quest_dict[survey_name].keys():

        print(col)

        filename = col + file_suffix
        plot_title_no_quality = survey_name + "\n" + quest_dict[survey_name][col]

        fig = None
        try:
            # TODO: replace the placeholder with scaffolding.get_quality_text(...)
            qual_text = "debug qual text later"
            plot_title = plot_title_no_quality+'\n'+qual_text

            # Translate the raw answers of this question into their display text.
            quest_frame = df_responses.copy()
            quest_frame[col] = quest_frame[col].apply(lambda x: traslate_options(x, opt_dict))

            # Count the responses per answer for this survey only. The 'start' column is the
            # one whose count is plotted — presumably present in every response; TODO confirm.
            plot_df = quest_frame.groupby(['survey_name', col]).count().reset_index()
            plot_df = plot_df[plot_df['survey_name'] == survey_name]
            plot_df = plot_df.set_index(col)[['start']]

            # We will have text results corresponding to the axes for simplicity and consistency
            text_results = ["Unmodified Alt Text", "Unmodified HTML"]

            # The figure is created only once the data is ready, so a data problem leaves no stray figure.
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,2*1), sharex=True)
            plot_and_text_stacked_bar_chart(plot_df, "Responses", ax, text_results, color_map, debug_df)
            set_title_and_save(fig, text_results, plot_title, filename)

        except Exception as e:
            # Best effort: report why the chart failed, then fall back to the "missing plot" output.
            print("Could not generate chart for", col, ":", repr(e))
            if fig is not None:
                plt.close(fig)  # discard the abandoned figure; the fallback below does not receive it
            generate_missing_plot(plot_title_no_quality, debug_df, filename)
            alt_text = store_alt_text_missing(debug_df, filename, plot_title_no_quality)