# Step 4:  Survey Data Normalization

### E-Bike Project under Center for Community Energy (CCE)

##### Contributors
- Zhihan Li (Claire)
- Ethan Hu

### Introduction

In this crucial step, our aim is to standardize our survey response data based on the proportion of respondents from each residency population. We consider this process vital as it allows us to assign equal weight to responses from every community. It's important to acknowledge that the limitation of this research lies in the restricted number of survey responses. We aspire for our efforts to serve as a foundation for future researchers undertaking similar experiments with larger response pools, thereby enhancing the accuracy of our research findings.

### Setup

In [1]:
import numpy as np
import pandas as pd
import copy

### Read CSV Files

In [2]:
# Load the labeled survey responses into a dataframe and display it
survey_csv = 'labeled_cce_survey_data.csv'
survey_df = pd.read_csv(survey_csv)
survey_df

Unnamed: 0,bike_rider,zipcode,ebike_rider,by_public,by_car,by_bike,by_ebike,by_walking,no_commute,miles,try_bike,ebike_class,private_locker,group_locker,locker,ebike_rules,safty,interested,label
0,no,92115,no,0,0,0.0,0,0,1,,maybe,no,very,very,,don't know,not very,no,3.0
1,no,92129,no,0,1,0.0,0,0,0,,no,a bit,,,,not very,somewhat,no,0.0
2,no,92115,yes,1,0,0.0,0,0,0,1.0,maybe,a bit,very,somewhat,,don't know,very,yes,3.0
3,no,92562,no,0,0,0.0,0,0,1,,,no,,,,don't know,not concerned,maybe,
4,yes,92109,no,0,0,0.0,0,0,1,,,no,,,,don't know,not concerned,no,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,no,91945,no,0,1,,0,0,0,20.0,,a bit,,,somewhat,not very,,yes,3.0
142,yes,92108,no,0,1,,0,0,0,10.0,,no,,,somewhat,not very,,yes,
143,yes,92117,no,0,1,,0,0,0,15.0,,no,,,very,not very,,maybe,1.0
144,yes,92102,no,0,1,,0,0,0,12.0,,no,,,somewhat,somewhat,,maybe,3.0


In [3]:
# Load the zipcode table carrying the cluster labels, then total every column per label.
# NOTE: the sum is applied to all columns alike, so only additive columns
# (e.g. Population) give a meaningful per-label total; summed zipcodes, medians
# and percentages in this table should not be interpreted.
cluster_csv = 'labeled SD zipcode 6 clusters.csv'
cluster_df = pd.read_csv(cluster_csv)
sum_cluster_df = cluster_df.groupby(by='label').sum()
sum_cluster_df

Unnamed: 0_level_0,Zipcode,Population,Land Area (Sq. Miles),Population Density (People per Square Mile),Median Age,Population By Age % (Under 18 Years),Population By Age % (18 to 34),Population By Age % (35 to 64),Population By Age % (65 and Over),Male %,...,Carpooled %,Public Transit %,Motorcycle %,Bicycle %,Walked %,Other %.1,Worked at Home %,Health Insurance Coverage %,Married %,Median House Value $
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1749357,597291.0,535.62,48550.53,822.2,426.37,318.81,797.57,357.23,931.88,...,114.41,19.08,12.15,10.16,31.71,18.21,384.33,1817.72,1116.3,19561501.0
1,2300971,1080171.0,490.93,92181.71,933.8,566.31,605.33,965.05,363.34,1248.05,...,193.01,36.35,12.91,6.78,32.49,28.49,289.31,2339.89,1309.17,15481000.0
2,1288625,107883.0,1738.57,1214.26,672.4,274.47,242.88,555.79,326.87,739.88,...,73.32,2.23,1.65,0.0,27.83,11.98,178.1,1293.16,786.48,6833200.0
3,2209018,1219481.0,403.93,146298.97,823.6,594.2,632.63,888.73,1569.43,1185.92,...,254.19,101.63,9.97,6.88,50.5,24.79,198.63,2125.9,1130.34,11667000.0
4,1197907,410933.0,103.01,81906.77,467.2,173.09,465.38,482.19,179.35,680.47,...,71.72,38.49,7.22,21.78,66.62,17.35,215.47,1230.53,544.73,11241600.0
5,92004,2779.0,914.88,3.04,62.8,10.69,9.86,32.82,46.64,42.75,...,0.1,0.0,0.0,0.0,0.2,0.0,35.98,94.6,69.04,320600.0


In [4]:
# Pull the total Population of each labeled community (labels 0 through 5).
# sum_cluster_df is indexed by the community label, so .loc looks the row up by label.
pop0, pop1, pop2, pop3, pop4, pop5 = (
    sum_cluster_df.loc[community_label, 'Population'] for community_label in range(6)
)

print(f'Population 0: {pop0}\nPopulation 1: {pop1}\nPopulation 2: {pop2}\nPopulation 3: {pop3}\nPopulation 4: {pop4}\nPopulation 5: {pop5}')

Population 0: 597291.0
Population 1: 1080171.0
Population 2: 107883.0
Population 3: 1219481.0
Population 4: 410933.0
Population 5: 2779.0


### Transform Answer Proportions into a Dictionary

In [5]:
# Create a method to transform the count of answers for each question into question:{name:proportion}
def _answer_shares(column):
    """Return ``{answer: share}`` for one categorical survey column.

    Shares are taken over the non-missing answers only and rounded to two
    decimals. A column with no answers at all yields an empty dict.
    """
    return column.value_counts(normalize=True).round(2).to_dict()


def _commute_share(column, num_response):
    """Return the share of respondents flagged in one 0/1 commute column.

    Missing flags are skipped by the pandas sum, so a single NaN no longer
    wipes out the whole column. The denominator is the total number of
    respondents, and an empty group gives 0.0 instead of dividing by zero.
    """
    if num_response == 0:
        return 0.0
    return float(round(column.sum() / num_response, 2))


def answer_proportion(labeled_df):
    """Summarise the survey answers of one group of respondents.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Survey rows of a single community. It must contain the columns
        'bike_rider', 'ebike_rider', 'by_public', 'by_car', 'by_bike',
        'by_ebike', 'by_walking', 'no_commute', 'miles', 'try_bike',
        'ebike_class', 'ebike_rules', 'safty' and 'interested'.

    Returns
    -------
    tuple
        ``(dict_lst, miles_lst)`` where ``dict_lst`` maps 'num_response' to
        the number of rows and every other question to an
        ``{answer: proportion}`` dict, and ``miles_lst`` is the list of
        non-missing 'miles' values.

    Notes
    -----
    The 'commute_type' shares are each divided by the number of respondents,
    so they are independent of one another and need not add up to 1.
    """
    num_response = len(labeled_df)

    # Keys keep the historical 'num_' prefix that downstream cells rely on.
    commute_columns = ('by_public', 'by_car', 'by_bike', 'by_ebike', 'by_walking', 'no_commute')
    commute_type = {
        f'num_{column}': _commute_share(labeled_df[column], num_response)
        for column in commute_columns
    }

    miles_lst = labeled_df['miles'].dropna().tolist()

    dict_lst = {'num_response': num_response,
                'bike_rider': _answer_shares(labeled_df['bike_rider']),
                'ebike_rider': _answer_shares(labeled_df['ebike_rider']),
                'commute_type': commute_type,
                'try_bike': _answer_shares(labeled_df['try_bike']),
                'ebike_class': _answer_shares(labeled_df['ebike_class']),
                'ebike_rules': _answer_shares(labeled_df['ebike_rules']),
                'safty': _answer_shares(labeled_df['safty']),
                'interested': _answer_shares(labeled_df['interested'])}

    return dict_lst, miles_lst

In [6]:
# List the distinct community labels that occur among the survey responses
pd.unique(survey_df['label'])

array([ 3.,  0., nan,  4.,  1.,  2.])

In [7]:
# Survey rows labeled 0 and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_0 = survey_df.loc[survey_df['label'].eq(0)]
ans_prop_0 = answer_proportion(label_0)
ans_prop_0[0]

{'num_response': 17,
 'bike_rider': {'no': 0.59, 'yes': 0.41},
 'ebike_rider': {'no': 0.94, 'yes': 0.06},
 'commute_type': {'num_by_public': 0.18,
  'num_by_car': 0.59,
  'num_by_bike': 0,
  'num_by_ebike': 0.06,
  'num_by_walking': 0.18,
  'num_no_commute': 0.12},
 'try_bike': {'no': 0.5, 'maybe': 0.29, 'yes': 0.21},
 'ebike_class': {'no': 0.71, 'a bit': 0.18, 'yes': 0.12},
 'ebike_rules': {"don't know": 0.38,
  'not very': 0.25,
  'very': 0.25,
  'somewhat': 0.12},
 'safty': {'somewhat': 0.62,
  'not very': 0.23,
  'very': 0.08,
  'not concerned': 0.08},
 'interested': {'no': 0.53, 'yes': 0.27, 'maybe': 0.2}}

In [8]:
# Survey rows labeled 1 and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_1 = survey_df.loc[survey_df['label'].eq(1)]
ans_prop_1 = answer_proportion(label_1)
ans_prop_1[0]

{'num_response': 12,
 'bike_rider': {'yes': 0.67, 'no': 0.33},
 'ebike_rider': {'no': 0.75, 'yes': 0.25},
 'commute_type': {'num_by_public': 0.25,
  'num_by_car': 0.67,
  'num_by_bike': 0,
  'num_by_ebike': 0.17,
  'num_by_walking': 0.0,
  'num_no_commute': 0.25},
 'try_bike': {'maybe': 0.56, 'yes': 0.33, 'no': 0.11},
 'ebike_class': {'no': 0.75, 'yes': 0.17, 'a bit': 0.08},
 'ebike_rules': {'not very': 0.36,
  'very': 0.27,
  "don't know": 0.27,
  'somewhat': 0.09},
 'safty': {'somewhat': 0.38,
  'very': 0.25,
  'not concerned': 0.25,
  'not very': 0.12},
 'interested': {'yes': 0.55, 'maybe': 0.36, 'no': 0.09}}

In [9]:
# Survey rows labeled 2 and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_2 = survey_df.loc[survey_df['label'].eq(2)]
ans_prop_2 = answer_proportion(label_2)
ans_prop_2[0]

{'num_response': 1,
 'bike_rider': {'no': 1.0},
 'ebike_rider': {'yes': 1.0},
 'commute_type': {'num_by_public': 0.0,
  'num_by_car': 1.0,
  'num_by_bike': 0,
  'num_by_ebike': 0.0,
  'num_by_walking': 0.0,
  'num_no_commute': 0.0},
 'try_bike': {},
 'ebike_class': {'no': 1.0},
 'ebike_rules': {"don't know": 1.0},
 'safty': {},
 'interested': {'no': 1.0}}

In [10]:
# Survey rows labeled 3 and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_3 = survey_df.loc[survey_df['label'].eq(3)]
ans_prop_3 = answer_proportion(label_3)
ans_prop_3[0]

{'num_response': 55,
 'bike_rider': {'yes': 0.56, 'no': 0.44},
 'ebike_rider': {'no': 0.73, 'yes': 0.27},
 'commute_type': {'num_by_public': 0.15,
  'num_by_car': 0.65,
  'num_by_bike': 0,
  'num_by_ebike': 0.04,
  'num_by_walking': 0.04,
  'num_no_commute': 0.16},
 'try_bike': {'yes': 0.4, 'no': 0.31, 'maybe': 0.29},
 'ebike_class': {'no': 0.74, 'a bit': 0.13, 'yes': 0.13},
 'ebike_rules': {"don't know": 0.39,
  'not very': 0.2,
  'somewhat': 0.2,
  'very': 0.2},
 'safty': {'somewhat': 0.39,
  'very': 0.22,
  'not concerned': 0.22,
  'not very': 0.14,
  'no opinion': 0.03},
 'interested': {'yes': 0.6, 'no': 0.25, 'maybe': 0.15}}

In [11]:
# Survey rows labeled 4 and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_4 = survey_df.loc[survey_df['label'].eq(4)]
ans_prop_4 = answer_proportion(label_4)
ans_prop_4[0]

{'num_response': 32,
 'bike_rider': {'yes': 0.53, 'no': 0.47},
 'ebike_rider': {'no': 0.87, 'yes': 0.13},
 'commute_type': {'num_by_public': 0.31,
  'num_by_car': 0.59,
  'num_by_bike': 0,
  'num_by_ebike': 0.06,
  'num_by_walking': 0.16,
  'num_no_commute': 0.12},
 'try_bike': {'maybe': 0.71, 'yes': 0.29},
 'ebike_class': {'no': 0.73, 'a bit': 0.17, 'yes': 0.1},
 'ebike_rules': {"don't know": 0.37,
  'very': 0.23,
  'somewhat': 0.23,
  'not very': 0.17},
 'safty': {'somewhat': 0.47,
  'very': 0.26,
  'not concerned': 0.11,
  "don't know": 0.05,
  'not very': 0.05,
  'no opinion': 0.05},
 'interested': {'yes': 0.37, 'no': 0.33, 'maybe': 0.3}}

In [12]:
# Survey rows whose label is missing (no community assigned) and their answer proportions.
# answer_proportion returns (proportions dict, miles list); show the dict.
label_nan = survey_df.loc[survey_df['label'].isna()]
ans_prop_nan = answer_proportion(label_nan)
ans_prop_nan[0]

{'num_response': 29,
 'bike_rider': {'yes': 0.59, 'no': 0.41},
 'ebike_rider': {'no': 0.83, 'yes': 0.17},
 'commute_type': {'num_by_public': 0.21,
  'num_by_car': 0.31,
  'num_by_bike': 0,
  'num_by_ebike': 0.1,
  'num_by_walking': 0.41,
  'num_no_commute': 0.17},
 'try_bike': {'yes': 0.55, 'maybe': 0.36, 'no': 0.09},
 'ebike_class': {'no': 0.7, 'yes': 0.19, 'a bit': 0.11},
 'ebike_rules': {'not very': 0.33,
  'somewhat': 0.26,
  'very': 0.22,
  "don't know": 0.19},
 'safty': {'somewhat': 0.29,
  'not very': 0.25,
  'not concerned': 0.21,
  'very': 0.12,
  'no opinion': 0.12},
 'interested': {'yes': 0.46, 'maybe': 0.31, 'no': 0.23}}

### Data Normalization
Next, we aggregate survey responses from all the communities we've gathered data from. This yields an adjusted population size for each community. During this phase, we focus solely on communities labeled 0, 1, 3, and 4, as we've received only a single response from the community labeled 2, and none from the community labeled 5.

In [13]:
def scale_by_population(ans_prop_x, pop_x):
    """Scale one community's answer proportions by its population.

    Parameters
    ----------
    ans_prop_x : tuple
        The ``(proportions dict, miles list)`` pair returned by
        ``answer_proportion``. Only the dict at index 0 is read.
    pop_x : number
        Population of the community the proportions belong to.

    Returns
    -------
    dict
        A new ``{question: {answer: proportion * pop_x}}`` dict. The
        'num_response' entry is left out because it is a count, not a set of
        proportions; it may also be absent from the input. The input is not
        modified.
    """
    proportions = ans_prop_x[0]

    # Build fresh nested dicts instead of deep-copying the whole tuple
    # (including the unused miles list) and mutating the copy.
    return {
        question: {answer: share * pop_x for answer, share in answers.items()}
        for question, answers in proportions.items()
        if question != 'num_response'
    }

In [14]:
# Answer proportions of community label 0, multiplied by that community's population
scaled_0 = scale_by_population(ans_prop_x=ans_prop_0, pop_x=pop0)
scaled_0

{'bike_rider': {'no': 352401.69, 'yes': 244889.31},
 'ebike_rider': {'no': 561453.5399999999, 'yes': 35837.46},
 'commute_type': {'num_by_public': 107512.37999999999,
  'num_by_car': 352401.69,
  'num_by_bike': 0.0,
  'num_by_ebike': 35837.46,
  'num_by_walking': 107512.37999999999,
  'num_no_commute': 71674.92},
 'try_bike': {'no': 298645.5, 'maybe': 173214.38999999998, 'yes': 125431.11},
 'ebike_class': {'no': 424076.61,
  'a bit': 107512.37999999999,
  'yes': 71674.92},
 'ebike_rules': {"don't know": 226970.58000000002,
  'not very': 149322.75,
  'very': 149322.75,
  'somewhat': 71674.92},
 'safty': {'somewhat': 370320.42,
  'not very': 137376.93,
  'very': 47783.28,
  'not concerned': 47783.28},
 'interested': {'no': 316564.23000000004,
  'yes': 161268.57,
  'maybe': 119458.20000000001}}

In [15]:
# Answer proportions of community label 1, multiplied by that community's population
scaled_1 = scale_by_population(ans_prop_x=ans_prop_1, pop_x=pop1)
scaled_1

{'bike_rider': {'yes': 723714.5700000001, 'no': 356456.43},
 'ebike_rider': {'no': 810128.25, 'yes': 270042.75},
 'commute_type': {'num_by_public': 270042.75,
  'num_by_car': 723714.5700000001,
  'num_by_bike': 0.0,
  'num_by_ebike': 183629.07,
  'num_by_walking': 0.0,
  'num_no_commute': 270042.75},
 'try_bike': {'maybe': 604895.76, 'yes': 356456.43, 'no': 118818.81},
 'ebike_class': {'no': 810128.25,
  'yes': 183629.07,
  'a bit': 86413.68000000001},
 'ebike_rules': {'not very': 388861.56,
  'very': 291646.17000000004,
  "don't know": 291646.17000000004,
  'somewhat': 97215.39},
 'safty': {'somewhat': 410464.98,
  'very': 270042.75,
  'not concerned': 270042.75,
  'not very': 129620.51999999999},
 'interested': {'yes': 594094.05, 'maybe': 388861.56, 'no': 97215.39}}

In [16]:
# Answer proportions of community label 3, multiplied by that community's population
scaled_3 = scale_by_population(ans_prop_x=ans_prop_3, pop_x=pop3)
scaled_3

{'bike_rider': {'yes': 682909.3600000001, 'no': 536571.64},
 'ebike_rider': {'no': 890221.13, 'yes': 329259.87},
 'commute_type': {'num_by_public': 182922.15,
  'num_by_car': 792662.65,
  'num_by_bike': 0.0,
  'num_by_ebike': 48779.24,
  'num_by_walking': 48779.24,
  'num_no_commute': 195116.96},
 'try_bike': {'yes': 487792.4, 'no': 378039.11, 'maybe': 353649.49},
 'ebike_class': {'no': 902415.94, 'a bit': 158532.53, 'yes': 158532.53},
 'ebike_rules': {"don't know": 475597.59,
  'not very': 243896.2,
  'somewhat': 243896.2,
  'very': 243896.2},
 'safty': {'somewhat': 475597.59,
  'very': 268285.82,
  'not concerned': 268285.82,
  'not very': 170727.34000000003,
  'no opinion': 36584.43},
 'interested': {'yes': 731688.6, 'no': 304870.25, 'maybe': 182922.15}}

In [17]:
# Answer proportions of community label 4, multiplied by that community's population
scaled_4 = scale_by_population(ans_prop_x=ans_prop_4, pop_x=pop4)
scaled_4

{'bike_rider': {'yes': 217794.49000000002, 'no': 193138.50999999998},
 'ebike_rider': {'no': 357511.71, 'yes': 53421.29},
 'commute_type': {'num_by_public': 127389.23,
  'num_by_car': 242450.47,
  'num_by_bike': 0.0,
  'num_by_ebike': 24655.98,
  'num_by_walking': 65749.28,
  'num_no_commute': 49311.96},
 'try_bike': {'maybe': 291762.43, 'yes': 119170.56999999999},
 'ebike_class': {'no': 299981.08999999997, 'a bit': 69858.61, 'yes': 41093.3},
 'ebike_rules': {"don't know": 152045.21,
  'very': 94514.59000000001,
  'somewhat': 94514.59000000001,
  'not very': 69858.61},
 'safty': {'somewhat': 193138.50999999998,
  'very': 106842.58,
  'not concerned': 45202.63,
  "don't know": 20546.65,
  'not very': 20546.65,
  'no opinion': 20546.65},
 'interested': {'yes': 152045.21, 'no': 135607.89, 'maybe': 123279.9}}

In [18]:
# Population-scaled answer dictionaries, one per community that is aggregated.
# Only labels 0, 1, 3 and 4 are included; labels 2 and 5 are left out.
scaled_dict_lst = [
    scaled_0,
    scaled_1,
    scaled_3,
    scaled_4,
]

In [19]:
# Add up the population-scaled answers of all communities, question by question.
# Start from a deep copy of the first community so its own dict is not modified.
first_community, *other_communities = scaled_dict_lst
sum_scaled_dict = copy.deepcopy(first_community)

for scaled_community in other_communities:
    for question, answers in scaled_community.items():
        running_totals = sum_scaled_dict[question]
        for answer, people in answers.items():
            # An answer that no earlier community gave starts its own total
            if answer not in running_totals:
                running_totals[answer] = people
            else:
                running_totals[answer] += people

sum_scaled_dict

{'bike_rider': {'no': 1438568.27, 'yes': 1869307.7300000002},
 'ebike_rider': {'no': 2619314.63, 'yes': 688561.3700000001},
 'commute_type': {'num_by_public': 687866.51,
  'num_by_car': 2111229.3800000004,
  'num_by_bike': 0.0,
  'num_by_ebike': 292901.75,
  'num_by_walking': 222040.9,
  'num_no_commute': 586146.59},
 'try_bike': {'no': 795503.4199999999, 'maybe': 1423522.07, 'yes': 1088850.51},
 'ebike_class': {'no': 2436601.8899999997,
  'a bit': 422317.19999999995,
  'yes': 454929.82},
 'ebike_rules': {"don't know": 1146259.55,
  'not very': 851939.12,
  'very': 779379.7100000001,
  'somewhat': 507301.10000000003},
 'safty': {'somewhat': 1449521.5,
  'not very': 458271.44,
  'very': 692954.43,
  'not concerned': 631314.4800000001,
  'no opinion': 57131.08,
  "don't know": 20546.65},
 'interested': {'no': 854257.7600000001,
  'yes': 1639096.4300000002,
  'maybe': 814521.81}}

In [20]:
def convert_to_proportions(input_list):
    """Return each value's share of the list total, rounded to two decimals.

    An empty list gives an empty list. A non-empty list whose values sum to
    zero raises ZeroDivisionError.
    """
    denominator = sum(input_list)
    return [round(value / denominator, 2) for value in input_list]

In [21]:
# Turn the aggregated, population-scaled totals into shares per question:
# each answer's total divided by the sum over all answers to that question,
# rounded to four decimals.
percent_scaled_dict = {}

for question, answers in sum_scaled_dict.items():
    question_total = sum(answers.values())
    percent_scaled_dict[question] = {
        answer: round(people / question_total, 4)
        for answer, people in answers.items()
    }

percent_scaled_dict

{'bike_rider': {'no': 0.4349, 'yes': 0.5651},
 'ebike_rider': {'no': 0.7918, 'yes': 0.2082},
 'commute_type': {'num_by_public': 0.1764,
  'num_by_car': 0.5413,
  'num_by_bike': 0.0,
  'num_by_ebike': 0.0751,
  'num_by_walking': 0.0569,
  'num_no_commute': 0.1503},
 'try_bike': {'no': 0.2405, 'maybe': 0.4303, 'yes': 0.3292},
 'ebike_class': {'no': 0.7353, 'a bit': 0.1274, 'yes': 0.1373},
 'ebike_rules': {"don't know": 0.349,
  'not very': 0.2594,
  'very': 0.2373,
  'somewhat': 0.1544},
 'safty': {'somewhat': 0.438,
  'not very': 0.1385,
  'very': 0.2094,
  'not concerned': 0.1907,
  'no opinion': 0.0173,
  "don't know": 0.0062},
 'interested': {'no': 0.2582, 'yes': 0.4955, 'maybe': 0.2462}}

In [22]:
# Build a dataframe from the nested dict: one column per question, one row per
# answer name; cells stay empty (NaN) where an answer does not belong to a question.
percent_scaled_df = pd.DataFrame.from_dict(percent_scaled_dict, orient='columns')
percent_scaled_df

Unnamed: 0,bike_rider,ebike_rider,commute_type,try_bike,ebike_class,ebike_rules,safty,interested
no,0.4349,0.7918,,0.2405,0.7353,,,0.2582
yes,0.5651,0.2082,,0.3292,0.1373,,,0.4955
num_by_public,,,0.1764,,,,,
num_by_car,,,0.5413,,,,,
num_by_bike,,,0.0,,,,,
num_by_ebike,,,0.0751,,,,,
num_by_walking,,,0.0569,,,,,
num_no_commute,,,0.1503,,,,,
maybe,,,,0.4303,,,,0.2462
a bit,,,,,0.1274,,,


In [24]:
# Write the percentage table to an Excel workbook, keeping the answer names (index) as a column
excel_filename = 'scaled survey data in percentage.xlsx'
percent_scaled_df.to_excel(excel_writer=excel_filename, index=True)