# Multiple answer cleanup experiment
Attempt at dealing with fields that contain comma-separated lists of answers from questions with multiple-select answer options that also allow respondents to enter their own 'other' answer.

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex

## Practice data set
Simulating how responses to questions with multiple-select answer options are stored by Google Forms.

In [2]:
# Three simulated respondents (rows) answering three multi-select questions (columns).
# Every cell is a single comma-separated string; the 'nonsense' entries stand in for
# free-form text typed into the 'other' option.
practice_responses = {
    'a': ['foo, nonsense1', 'nonsense2, bar', 'foo, bar'],
    'b': ['nonsense3, hello', 'world, nonsense4', 'hello, world'],
    'c': ['nonesuch, forthwith', 'nonsense5, forthwith', 'nonesuch, forthwith'],
}
df = pd.DataFrame(practice_responses)
df

Unnamed: 0,a,b,c
0,"foo, nonsense1","nonsense3, hello","nonesuch, forthwith"
1,"nonsense2, bar","world, nonsense4","nonsense5, forthwith"
2,"foo, bar","hello, world","nonesuch, forthwith"


## Answer options to each question
A dictionary that maps each multiple-select question to the list of answer options the respondent could choose from in the Google Form.

In [3]:
# question code -> the predefined answer options offered for that question;
# each of these columns holds comma-separated multiple answers
multiple_answer_options = dict(
    a=['foo', 'bar'],
    b=['hello', 'world'],
    c=['nonesuch', 'forthwith'],
)

# each question also had an 'other' field for custom answers; those are handled further down.

## Remove commas from any given answer option so we can later split by comma cleanly

In [4]:
# loop through all questions that allow more than one answer
for question, answer_list in multiple_answer_options.items():
    # only answer options that themselves contain a comma need cleaning up
    for answer in answer_list:
        if ',' not in answer:
            continue
        # Rewrite the option inside the stored responses so that a later split on ','
        # does not cut it in half.  regex=False makes the answer text match literally;
        # without it, older pandas versions interpret the text as a regular expression
        # and options containing characters such as '(', '?' or '.' are mis-matched.
        df[question] = df[question].str.replace(answer, answer.replace(',', ''), regex=False)

    # keep the question/answer dictionary in sync with the cleaned responses
    # (replacing the value of an existing key is safe while iterating over the dict)
    multiple_answer_options[question] = [answer.replace(',', '') for answer in answer_list]

# show the cleaned up answer options
multiple_answer_options

{'a': ['foo', 'bar'], 'b': ['hello', 'world'], 'c': ['nonesuch', 'forthwith']}

## Tally answers
Indicate which users selected which answer options, and put any freeform text answers that users entered by clicking the 'other' option in Google Forms into '_other' columns.

In [5]:
def categorize_answers(response, question, answer_list):
    """
    Turn one respondent's answers to a multi-select question into a Series of dummy values.

    Every answer that matches one of the predefined options becomes an entry labelled
    '<question>_<answer>' with the value '1'.  Text that matches no option is treated as
    free-form 'other' input.  All such fragments are joined with ', ' into a single
    '<question>_other' entry, so the resulting index never holds duplicate labels (this
    happens, for example, when the free-form text itself contained commas and was split
    on them).  Blank fragments are ignored.

    @param response A list of the user's responses to this question
    @param question The title of the question (these are codes)
    @param answer_list A list of the answer options that were presented to the user in the Google Form
    @return A Series of strings indexed by '<question>_<answer>' labels; when the response is
            missing (None / NaN), or question / answer_list is a float, the response is
            returned unchanged
    """

    # skip any nan or blank values: there is nothing to categorize
    is_missing = (
        response is None
        or str(response) == 'nan'
        or isinstance(response, float)
        or isinstance(question, float)
        or isinstance(answer_list, float)
    )
    if is_missing:
        return response

    other_label = question + '_' + 'other'
    tallied = {}  # label -> value, kept in order of first appearance
    other_fragments = []  # free-form text pieces that matched no answer option

    for answer_option in response:
        answer_option = answer_option.strip()  # remove any leading/trailing whitespace
        if not answer_option:
            # e.g. produced by a trailing comma; not a real answer
            continue
        if answer_option in answer_list:
            tallied[question + '_' + answer_option] = 1
        else:
            # reserve the position of the 'other' label at its first occurrence
            tallied.setdefault(other_label, None)
            other_fragments.append(answer_option)

    if other_fragments:
        tallied[other_label] = ', '.join(other_fragments)

    # put it all into a pandas Series of strings (the selected options become '1')
    return pd.Series(list(tallied.values()), index=list(tallied), dtype=object).astype(str)

# slice up answers by comma and give each its own column
frames = []  # one dataframe of dummy values per question

# loop through each question/answer item
for question, answer_list in multiple_answer_options.items():
    # split each response on commas (swallowing the whitespace that follows them), then
    # categorize every respondent's list of answers; the Series returned for each row
    # are expanded into a dataframe with one column per '<question>_<answer>' label
    d = df[question].str.split(r',\s*').apply(categorize_answers, question=question, answer_list=answer_list)
    frames.append(d)

# stack the per-question dataframes on top of each other; the respondent index therefore
# repeats once per question.  (DataFrame.append was removed in pandas 2.0, so pd.concat
# is used to build the new dataframe.)
df2 = pd.concat(frames) if frames else pd.DataFrame()

# show the new dataframe
df2

Unnamed: 0,a_bar,a_foo,a_other,b_hello,b_other,b_world,c_forthwith,c_nonesuch,c_other
0,,1.0,nonsense1,,,,,,
1,1.0,,nonsense2,,,,,,
2,1.0,1.0,,,,,,,
0,,,,1.0,nonsense3,,,,
1,,,,,nonsense4,1.0,,,
2,,,,1.0,,1.0,,,
0,,,,,,,1.0,1.0,
1,,,,,,,1.0,,nonsense5
2,,,,,,,1.0,1.0,


## Convert values to booleans, where possible
Most columns contain NaNs or 1s.  NaNs are converted to 0s and the column is then cast to integer, giving 0/1 indicator values.  The 'other' columns contain text and are left alone.

In [6]:
# loop through each column
for col in df2.columns:
    try:
        # missing selections become 0, then the whole column is cast to int
        df2[col] = df2[col].fillna(0).astype(int)
    except (ValueError, TypeError):
        # this column contains free-form text (an 'other' column) that cannot be cast
        # to int; leave it untouched.  Only conversion errors are caught so that
        # unrelated problems are not silently hidden.
        continue

In [7]:
# inspect a single converted indicator column
df2['a_foo']

0    1
1    0
2    1
0    0
1    0
2    0
0    0
1    0
2    0
Name: a_foo, dtype: int64

## Try to group by 'id'

In [11]:
# add up all values in each column to get a full set of values for each id
# numeric_only=True deliberately leaves out the 'other' columns, since they hold text;
# recent pandas versions no longer drop non-numeric columns on their own
df3 = df2.groupby(df2.index).sum(numeric_only=True)

# show the new grouped data... note that it is missing the 'other' fields... more on that later
df3

Unnamed: 0,a_bar,a_foo,b_hello,b_world,c_forthwith,c_nonesuch
0,0,1,1,0,1,1
1,1,0,0,1,1,0
2,1,1,1,1,1,1


## Combine the selected answer options with the 'other' responses users gave

In [9]:
# loop through all questions
for question in multiple_answer_options:
    other_column = question + '_other'
    if other_column not in df2.columns:
        # nobody entered a free-form answer for this question, so there is nothing to add
        continue
    # get a dataframe of just the non-missing 'other' answers to this question
    other_df = df2.loc[df2[other_column].notna(), [other_column]]
    # join those responses onto the grouped dataframe, matching on the index
    df3 = df3.join(other_df)

# show the data
df3

Unnamed: 0,a_bar,a_foo,b_hello,b_world,c_forthwith,c_nonesuch,a_other,b_other,c_other
0,0,1,1,0,1,1,nonsense1,nonsense3,
1,1,0,0,1,1,0,nonsense2,nonsense4,nonsense5
2,1,1,1,1,1,1,,,


## Order the columns alphabetically
So it looks a bit more orderly

In [10]:
# column labels in alphabetical order
column_names = df3.columns.sort_values()
# re-select the columns in that order
df3 = df3.loc[:, column_names]
df3

Unnamed: 0,a_bar,a_foo,a_other,b_hello,b_other,b_world,c_forthwith,c_nonesuch,c_other
0,0,1,nonsense1,1,nonsense3,0,1,1,
1,1,0,nonsense2,0,nonsense4,1,1,0,nonsense5
2,1,1,,1,,1,1,1,
