# Multiple answer cleanup experiment
Attempt at dealing with fields that contain comma-separated lists of answers from questions with multiple-select answer options that also allow respondents to enter their own 'other' answer.

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex

## Practice data set
Simulating how responses to multiple-select answer options are stored by Google Forms: each cell holds a comma-separated list of the answers one respondent gave to one question.

In [2]:
# Practice data, written one record per simulated respondent.
# Each cell is a comma-separated string of that respondent's answers to the
# question named by the column; some entries are free-text answers that are
# not among the predefined options.
df = pd.DataFrame([
    {'a': 'foo, nonsense1', 'b': 'nonsense3, hello', 'c': 'nonesuch, forthwith'},
    {'a': 'nonsense2, bar', 'b': 'world, nonsense4', 'c': 'nonsense5, forthwith'},
])
df

Unnamed: 0,a,b,c
0,"foo, nonsense1","nonsense3, hello","nonesuch, forthwith"
1,"nonsense2, bar","world, nonsense4","nonsense5, forthwith"


## Practice answer options to each question
A dictionary containing lists of all the answer options available to each question in the Google Form.

In [3]:
# Maps each multiple-select question to the list of predefined answer options
# offered for it. The keys are the names of the columns that hold the
# comma-separated answers.
multiple_answer_options = {
    'a': ['foo', 'bar'],
    'b': ['hello', 'world'],
    'c': ['nonesuch', 'forthwith'],
    # NOTE(review): presumably a reminder that some questions also accept a
    # free-text 'other' answer, which is not listed among the options — confirm
}


## Remove commas from any given answer option so we can later split by comma cleanly

In [4]:
# Strip commas out of every predefined answer option, both in the recorded
# responses and in the option lists, so that the responses can later be split
# on commas without cutting a single answer in two.
for question, answer_list in multiple_answer_options.items():
    # loop through every answer in the answer list for this question
    for answer in answer_list:
        # only options that actually contain a comma need rewriting
        if ',' in answer:
            # regex=False makes this a literal substring replacement; without
            # it, pandas versions before 2.0 treat the option text as a
            # regular expression, so options containing characters such as
            # '(', '?' or '.' would be mis-replaced or raise an error
            df[question] = df[question].str.replace(answer, answer.replace(',', ''), regex=False)

    # keep the dictionary in step with the cleaned responses
    # (re-assigning an existing key is safe while iterating over items())
    multiple_answer_options[question] = [answer.replace(',', '') for answer in answer_list]

# show the cleaned up answer options
multiple_answer_options

{'a': ['foo', 'bar'], 'b': ['hello', 'world'], 'c': ['nonesuch', 'forthwith']}

## Count answers
Flags which answers from the given answer list each respondent gave to each question, one column per answer, and groups free-form text answers that were not in the answer list into an 'other' column. The per-answer totals are computed in the next section.

In [5]:
def categorize_answers(response, question, answer_list):
    """Turn one respondent's answers to one question into a labelled Series.

    Parameters
    ----------
    response : iterable of str, or NaN / None
        The answers the respondent gave, already split on commas. A missing
        response (NaN or None) is accepted.
    question : str
        Name of the question; used as the prefix of every label.
    answer_list : list of str
        The predefined answer options for this question.

    Returns
    -------
    pandas.Series
        String-valued Series with unique labels, in order of first appearance:
        '<question>_<answer>' holding '1' for every predefined option that was
        selected, and at most one '<question>_other' entry holding the
        free-text answers (joined with ', ' when there are several).
        An empty Series is returned when there is nothing to categorize, so
        that every call yields the same type.
    """
    # Nothing to categorize: a missing response arrives as NaN (a float) or
    # None. The float checks on question/answer_list keep the original guard
    # against NaN arguments.
    if (
        response is None
        or str(response) == 'nan'
        or isinstance(response, float)
        or isinstance(question, float)
        or isinstance(answer_list, float)
    ):
        return pd.Series(dtype=object)

    other_label = question + '_' + 'other'
    labels = []         # unique labels, in order of first appearance
    other_answers = []  # free-text answers that are not predefined options

    for answer_option in response:
        answer_option = str(answer_option).strip()  # remove any leading/trailing whitespace
        if not answer_option:
            # blank token (e.g. from a trailing comma): not an answer at all
            continue
        if answer_option in answer_list:
            label = question + '_' + answer_option
        else:
            label = other_label
            other_answers.append(answer_option)
        # Labels must stay unique: a Series with repeated labels cannot be
        # expanded into DataFrame columns by Series.apply.
        if label not in labels:
            labels.append(label)

    values = [
        ', '.join(other_answers) if (label == other_label and other_answers) else 1
        for label in labels
    ]

    # Everything is stored as a string, so a selected option shows up as '1'.
    return pd.Series(values, index=labels, dtype=object).astype(str)

# Slice up the answers by comma and give each answer its own column.
# One frame per question is collected and the frames are stacked with a single
# pd.concat call: DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending inside a loop re-copies the data on every pass.
question_frames = []
for question, answer_list in multiple_answer_options.items():
    # process each response from the user to this question
    # (raw string, so the \s reaches the regex engine without an
    # invalid-escape-sequence warning from Python)
    d = df[question].str.split(r',\s*').apply(categorize_answers, question=question, answer_list=answer_list)
    question_frames.append(d)

# pd.concat raises on an empty list, so fall back to an empty frame
df2 = pd.concat(question_frames, sort=False) if question_frames else pd.DataFrame()

# Missing cells are deliberately left as NaN rather than filled with zeros:
# filling them would break counting the number of responses in each column.

# show the results
df2

Unnamed: 0,a_foo,a_other,a_bar,b_other,b_hello,b_world,c_nonesuch,c_forthwith,c_other
0,1.0,nonsense1,,,,,,,
1,,nonsense2,1.0,,,,,,
0,,,,nonsense3,1.0,,,,
1,,,,nonsense4,,1.0,,,
0,,,,,,,1.0,1.0,
1,,,,,,,,1.0,nonsense5


## Sum the number of respondents who gave each response

In [6]:
# Per-column tally of the cells that hold a value. count() counts non-null
# cells, so the free-text entries in the '_other' columns are included, not
# only the cells that hold a 1.
sum_column = df2.count()
print(sum_column)

a_foo          1
a_other        2
a_bar          1
b_other        2
b_hello        1
b_world        1
c_nonesuch     1
c_forthwith    2
c_other        1
dtype: int64


## Display the responses to each question, including the 'other' free text responses

In [10]:
# For every question, show who gave each predefined answer, followed by the
# free-text answers collected in the question's 'other' column.
for question, answer_options in multiple_answer_options.items():
    # predefined options first, then the 'other' bucket, all handled the same way
    for answer in list(answer_options) + ['other']:
        column = question + '_' + answer
        display(Markdown('### {} {} responses:'.format(question, answer)))
        if column in df2.columns:
            # keep only the respondents who actually have a value in this column
            display(df2[column].dropna())
        else:
            # An option nobody selected (or a question with no free-text
            # answers) never gets a column; show an empty result instead of
            # failing with a KeyError.
            display(pd.Series(name=column, dtype=object))

### a foo responses:

0    1
Name: a_foo, dtype: object

### a bar responses:

1    1
Name: a_bar, dtype: object

### a other responses:

0    nonsense1
1    nonsense2
Name: a_other, dtype: object

### b hello responses:

0    1
Name: b_hello, dtype: object

### b world responses:

1    1
Name: b_world, dtype: object

### b other responses:

0    nonsense3
1    nonsense4
Name: b_other, dtype: object

### c nonesuch responses:

0    1
Name: c_nonesuch, dtype: object

### c forthwith responses:

0    1
1    1
Name: c_forthwith, dtype: object

### c other responses:

1    nonsense5
Name: c_other, dtype: object