## Import data from CSV data file
The CSV file has been exported from Google Sheets survey responses.

In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('responses.csv', index_col=None, parse_dates=['Timestamp'])

# make the timestamp into a proper datetime64 dtype
#df['Timestamp'] = pd.to_datetime(df['Timestamp'])


## Clean up column names
The original column names are the questions.... let's simplifies them.  And while we're at it, set appropriate columns as categorical data to speed up analysis.

In [2]:
# replace long column titles with shorter versions
df.columns = [ 'date', 'neighborhood', 'demographic', 'schools', 'child_bus_freq', 'child_bike_freq', 'child_driven_freq', 'child_drive_freq', 'child_walk_freq', 'child_drive_reason', 'child_no_walk_reason', 'child_no_bike_reason', 'walk_freq', 'bike_freq', 'bikes_on_sidewalk', 'self_jog_frequency', 'commutes', 'child_self_school', 'child_self_bus_freq', 'child_self_bike_freq', 'child_self_driven_freq', 'child_self_drive_freq', 'child_self_walk_freq', 'child_self_commutes', 'child_self_has_children', 'commuter_distance', 'commuter_type', 'commuter_walk_to_station_freq', 'commuter_bike_to_station_freq', 'commuter_drive_to_station_freq', 'commuter_carpool_to_station_freq', 'commuter_driven_to_station_freq', 'commuter_bus_to_station_freq', 'no_walk_reason', 'no_bike_reason', 'drive_reason', 'feelings', 'problem_areas', 'drivers_are_safe', 'bicyclists_are_safe', 'suggested_improvements', 'additional_comments', 'owns_business', 'business_type', 'business_space', 'business_pedestrian_synergy', 'business_bicyclists_synergy', 'business_bike_rack_interest', 'business_promotion_interest', 'business_additional_comments', 'final_comments', 'contact_interest', 'contact_info', 'wants_pdf' ]

# a list of the columns that contain categorical data
categorical_columns = [ 'neighborhood', 'demographic', 'schools', 'child_bus_freq', 'child_bike_freq', 'child_driven_freq', 'child_drive_freq', 'child_walk_freq', 'child_drive_reason', 'child_no_walk_reason', 'child_no_bike_reason', 'walk_freq', 'bike_freq', 'bikes_on_sidewalk', 'self_jog_frequency', 'commutes', 'child_self_school', 'child_self_bus_freq', 'child_self_bike_freq', 'child_self_driven_freq', 'child_self_drive_freq', 'child_self_walk_freq', 'child_self_commutes', 'child_self_has_children', 'commuter_distance', 'commuter_type', 'commuter_walk_to_station_freq', 'commuter_bike_to_station_freq', 'commuter_drive_to_station_freq', 'commuter_carpool_to_station_freq', 'commuter_driven_to_station_freq', 'commuter_bus_to_station_freq', 'no_walk_reason', 'no_bike_reason', 'drive_reason', 'drivers_are_safe', 'bicyclists_are_safe', 'suggested_improvements', 'owns_business', 'business_type', 'business_space', 'business_pedestrian_synergy', 'business_bicyclists_synergy', 'business_bike_rack_interest', 'business_promotion_interest', 'contact_interest', 'wants_pdf' ]
for col in categorical_columns :
    df[col] = df[col].astype('category')
    continue


## Remove commas from multiple answer question options
This helps analyze questions that allow for more than one answer

In [3]:

# a dictionary of columns that contain comma-separated multiple answer options
multiple_answer_options = {
    'schools': ['PVC', 'CET', 'CHHS', 'Homeschooled'],
    'child_drive_reason': ['No - we do not drive, or prefer not to drive', 'Our own personal preference', 'Lack of available busing where we live', 'The bus schedule does not match our schedule', 'Safety concerns with buses', 'Safety concerns with walking', 'Safety concerns with bicycling', "My child's health condition"],
    'child_no_walk_reason': ['No - they walk a lot', 'My child does not like to walk', 'We live too far to walk', "We don't have time to walk", 'Fear of dangerous driving', 'Lack of adequate sidewalks', 'Lack of adequate crosswalks at busy intersections', 'Lack of crossing guards at busy intersections', "My child's health condition", 'Visually unappealing route'], 
    'child_no_bike_reason': ['No - they bicycle a lot', 'My child does not like to bicycle', 'My child is too young to bicycle', 'We live too far to bicycle', 'Our own personal preference', 'Fear of dangerous driving', 'Lack of adequate bike lanes', "My child's health condition", 'Visually unappealing route', 'Hills']
    # other option available for some
}

# loop through all questions that allow more than one answer
for question, answer_list in multiple_answer_options.items():
    # loop through every answer in the answer list for this question
    for answer in answer_list:
        # remove the commas, from each answer, if any
        if ',' in answer:
            # update the answers with a cleaned version
            df[question] = df[question].str.replace(answer, answer.replace(',', '') )
            
    # remove commas from the list of answer in the dictionary
    multiple_answer_options[question] = [answer.replace(',' , '') for answer in answer_list]
    #display(multiple_answer_options[question])

multiple_answer_options

{'schools': ['PVC', 'CET', 'CHHS', 'Homeschooled'],
 'child_drive_reason': ['No - we do not drive or prefer not to drive',
  'Our own personal preference',
  'Lack of available busing where we live',
  'The bus schedule does not match our schedule',
  'Safety concerns with buses',
  'Safety concerns with walking',
  'Safety concerns with bicycling',
  "My child's health condition"],
 'child_no_walk_reason': ['No - they walk a lot',
  'My child does not like to walk',
  'We live too far to walk',
  "We don't have time to walk",
  'Fear of dangerous driving',
  'Lack of adequate sidewalks',
  'Lack of adequate crosswalks at busy intersections',
  'Lack of crossing guards at busy intersections',
  "My child's health condition",
  'Visually unappealing route'],
 'child_no_bike_reason': ['No - they bicycle a lot',
  'My child does not like to bicycle',
  'My child is too young to bicycle',
  'We live too far to bicycle',
  'Our own personal preference',
  'Fear of dangerous driving',
  'Lac

## Split up multiple answer questions into multiple columns
This will allow us to independently count how many respondents included each answer option in their multiple answers.

In [4]:
def categorize_answers(response, question, answer_list):
    # convert to a series with the answer as a key
    indices = [] # will contain the indices for the series of answers
    values = [] # will contain the values for the series of answers
    
    # skip any nan or blank values
    if str(response) != 'nan' and type(answer_list) != float and type(question) != float and type(response) != float:
        for answer_option in response:
            answer_option = answer_option.strip() # remove any leading/trailing whitespace
            if answer_option in answer_list:
                indices.append(answer_option)
                values.append(1)
            else:
                indices.append('other')
                values.append(answer_option)
    
        response = pd.Series(values,index=indices).fillna(0).astype(bool)
        
    return response

# slice up answers by comma and give each its own column
for question, answer_list in multiple_answer_options.items():
    # process each response from the user to this question
    df[question] = df[question].str.split(',\s*').apply(categorize_answers, question=question, answer_list=answer_list)


TypeError: object of type 'float' has no len()

## Clean up neighborhood names
Neighorhood names were verbose in the actual survey... and some respondents wrote their own locations.  We group those respondents into their nearest neighborhoods and use consistent neighborhood names here.

In [None]:
df['neighborhood'] = df['neighborhood'].str.strip() # remove white space
df['neighborhood'] = df['neighborhood'].replace({
    
    # simplifying neighborhood names
    'Albany Post Road / Prickly Pear Hill / Scenic Dr area': 'Albany Post Road', 
    'Cortlandt outside of Croton': 'Cortlandt',
    'Old Post Road N area': 'Old Post Road North',
    'Quaker Ridge / Quaker Bridge area': 'Quaker Ridge',
    'Sunset Park area': 'Sunset Park',
    'Teatown area': 'Teatown',
    'Upper Village (the area nearest the Black Cow coffee shop)': 'Upper Village',

    # lumping CET/library area into Harmon
    'CET': 'Harmon', 
    'By CET': 'Harmon',
    'By the library/cemetery/CET': 'Harmon',
    'End of Cleveland near path': 'Harmon',
    'Cleveland near CET/Library': 'Harmon',
    'Cleveland/Park': 'Harmon',
    'Duck Pond': 'Harmon',
    'Harmon Park': 'Harmon',
    'Irving Ave': 'Harmon',
    'Ridge Rd.': 'Harmon',
    'Truesdale Drive': 'Harmon',
    'along the croton river': 'Harmon',
    'Behind high school': 'Harmon',
    'Wells/Beekman Area': 'Harmon',
    
    # lumping Old Post Road South and Sunset Park together
    'Old Post Road S': 'Sunset Park', 
    
    # lumping North Riverside area into Croton Landing
    'North Riverside': 'Croton Landing',
    'Palmer on High St': 'Croton Landing',
    'lower village': 'Croton Landing',
    'Brook St': 'Croton Landing',
    
    # lumping Old Post Road North and Croton Landing together
    'Croton Landing': 'Old Post Road North', 
    'wolf road': 'Albany Post Road',
    
    # lumping nearby streets into Upper Village
    'Bari Manor': 'Upper Village',
    'Harrison st': 'Upper Village',
    'Wells Ave': 'Upper Village',
    
    # lumping Batten Rd and Crompond Rd areas with Mount Airy
    'Batten Road': 'Mount Airy',
    'The trails': 'Mount Airy',
    '129 near dam': 'Mount Airy', 
    
    #lumping Teatown into Quaker Ridge, since there were few Teatown respondents
    'Teatown': 'Quaker Ridge' 
})

df['neighborhood'].value_counts()

## Clean up demographics
Some respondents indicated their own demographic titles... we're standardizing these.

In [None]:

df['demographic'] = df['demographic'].str.strip() # remove white space
df['demographic'] = df['demographic'].replace({
    
    # creating new category for adults with small children
    'Adult with a 22 year old and a 3 year old living with me': 'Adult with small child',
    'Adult with an infant': 'Adult with small child',
    'Adult with new baby': 'Adult with small child',
    'Adult with toddler': 'Adult with small child',
    'Adult with toddler living with me': 'Adult with small child',
    'Adult with toddlers living with me': 'Adult with small child',
    'Adult with young children': 'Adult with small child',
    'Adult with young children living with me': 'Adult with small child',
    'Adult with 3year old': 'Adult with small child',
    'Adult with children not yet in CET (pre-K)': 'Adult with small child',
    
    # lump adults with infants & toddlers into adults with no school age children
    'Adult with small child': 'Adult with no school age children living with me',
    'Adult with spouse and adult children living with us.': 'Adult with no school age children living with me',
    
    # lumping adults with college kids into the adults with no school age children category
    'adult with post college child living with me': 'Adult with no school age children living with me',
    'Adult with College Children': 'Adult with no school age children living with me',
    'Adult with College Children': 'Adult with no school age children living with me',

        # lump adults with school children in addition to others
    'Adult with school age children and a senior living with me.': 'Adult with school kid'

})    

# simplify response text
df['demographic'] = df['demographic'].replace({
    'Adult with school age children living with me': 'Adult with school kid',
    'Adult with no school age children living with me': 'Adult without school kid',
    'College student': 'College kid',
    'High School student': 'High School kid',
    'Middle School student': 'Middle School kid',
})

df['demographic'].value_counts()

## Split up columns with comma-separated values into separate columns

In [None]:
# get a dataframe with each school in its own column, and 0 or 1 as the values indicating which row indicated that school
#df2 = df[pd.notnull(df['schools'])]
#df3 = df2['schools'].str.get_dummies(sep=', ')
#for column in df3:
#    column = column.strip() #remove whitespace
#df3

# split a column with comma-separated values into separate columns
def breakout_comma_separated_values(old_column_name):
    # expand comma-separated values in the column into their own dataframe with multiple columns for each value
    df[old_column_name] = df[old_column_name].str.split(', ') #split by comma into a list
    series = df[old_column_name].apply(pd.Series) # convert each list into a series

    # rename each variable with a prefixed column name in a new dataframe
    new_df = series.rename(columns = lambda x : old_column_name + '_' + str(x))
    
    # return the new dataframe
    return new_df


# try it out
fields = [ 'schools', 'child_drive_reason', 'child_no_walk_reason', 'child_no_bike_reason', 'no_walk_reason', 'no_bike_reason', 'drive_reason' ]
#fields = ['schools']

for column_name in fields:
    
    # split up the column into multiple columns
    df2 = breakout_comma_separated_values(column_name)
    
    # join the new dataframe to the original dataframe
    df = pd.concat([df[:], df2[:]], axis=1)

    # delete the original column from the original dataframe
    del df[column_name]

    display(df2)
    
column_names = [c for c in df]
display(column_names)


## Save cleaned up data to CSV file
So it can be analyzed in subsequent programs.

In [None]:
df.to_csv('responses_scrubbed.csv', index=False)