In [None]:
# Import Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import geopandas as gpd

# Center For Community Service
#### Project: Data Analysis on E-bike Survey Collected on 04/29/2023

## Part I: Data Cleaning

##### CSV File 1: 4.29.2023_Survey_Results.csv

In [None]:
# Read survey data from the csv file
# (the path is relative, so the CSV must sit in the notebook's working directory)
df = pd.read_csv("4.29.2023_Survey_Results.csv")
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# (number of rows, number of columns) of the raw survey frame
df.shape

In [None]:
# Keep only the answer columns: remove the surveyor, free-text comment,
# timestamp and unnamed columns in a single call.
non_answer_columns = [
    "Who took this survey (initials)",
    "Any additional comments?",
    "Who's survey is this?",
    "Timestamp",
    "Unnamed: 16",
]
quant_df = df.drop(columns=non_answer_columns)

# Swap the long question text for short headers. The assignment is positional,
# so the list order has to match the order of the remaining columns.
quant_df.columns = [
    'bike_rider', 'zipcode', 'ebike_rider', 'commute_purpose', 'miles', 'try_bike',
    'ebike_class', 'private_locker', 'group_locker', 'ebike_rules', 'safty', 'interested',
]

quant_df.head()

In [None]:
# (number of rows, number of columns) after dropping the non-answer columns
quant_df.shape

Here's a description of each feature (column) in the quant_df DataFrame:

Header | Description
-------|------------
bike_rider | Are you a bike rider?
zipcode | What is your zip code?
ebike_rider | Do you or anyone in your family ride an e-bike?
commute_purpose | How do you commute - to work, school, or for other purposes?
miles | How far (ROUND TRIP) do you commute for any purpose each day?
try_bike | Would you commute by e-bike if given the opportunity?
ebike_class | Do you know the differences between Class 1, Class 2, and Class 3 e-bikes?
private_locker | How comfortable would you be storing your bike in a private bike locker while you are commuting?
group_locker | How comfortable would you be storing your bike in a group bike locker while you are commuting?
ebike_rules | How familiar are you and those in your family with the rules of the road for e-bikes?
safty | How concerned are you about e-bikes and e-bike safety in San Diego County?
interested | Are you interested in learning more about e-bikes?


In [None]:
# Convert every text answer to lowercase so that e.g. "Yes" and "yes" are
# counted as the same category.
columns = quant_df.columns
for column in columns:
    # zipcode is deliberately left untouched. Numeric columns are skipped
    # because the .str accessor raises AttributeError on non-text data.
    if column == 'zipcode' or pd.api.types.is_numeric_dtype(quant_df[column]):
        continue
    quant_df[column] = quant_df[column].str.lower()

quant_df.head()

In [None]:
# Work on a text copy of the column. Missing answers turn into the text "nan",
# which contains no digits and therefore ends up as <NA> after extraction.
miles_text = quant_df['miles'].astype(str)

# Answers given as a duration (they contain "min") cannot be read as a distance,
# so blank them out before pulling the number.
given_in_minutes = miles_text.str.contains('min')
miles_text = miles_text.where(~given_in_minutes)

# Keep the first run of digits only and store it as a nullable integer
quant_df['miles'] = (
    miles_text
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .astype(pd.Int64Dtype())
)
quant_df.head()

##### CSV File 2: 6.08.2023_Online_Survey_Results.csv

In [None]:
# Read the online survey export (path relative to the notebook's working directory)
df2 = pd.read_csv("6.08.2023_Online_Survey_Results.csv")
# Show a preview only; rendering the whole frame bloats the notebook output
df2.head()

In [None]:
# Keep question columns only: Q1-Q3, the seven parts of Q4, then Q5-Q11
question_columns = (
    ['Q1', 'Q2', 'Q3']
    + [f'Q4_{part}' for part in range(1, 8)]
    + [f'Q{number}' for number in range(5, 12)]
)
df2 = df2[question_columns]

# Drop the row labelled 1
# NOTE(review): presumably an export metadata row rather than a response - confirm
df2 = df2.drop(index=1)

In [None]:
# Change column names so they line up with the headers used for quant_df.
# The assignment is positional and expects the 17 question columns selected above.
df2.columns = ['bike_rider', 'zipcode', 'ebike_rider', 'commute_purpose_1', 'commute_purpose_2',
               'commute_purpose_3', 'commute_purpose_4', 'commute_purpose_5', 'commute_purpose_6',
               'commute_purpose_7', 'miles', 'try_bike', 'ebike_class', 'locker',
               'ebike_rules', 'safty', 'interested']

# Drop the row labelled 0.
# NOTE(review): presumably the question-text row of the export - confirm.
# errors='ignore' keeps the cell re-runnable: once the row is gone a second
# run would otherwise stop with a KeyError.
df2 = df2.drop(index=0, errors='ignore')

In [None]:
# Work on a text copy so the check below never meets a non-string value
# (also keeps the cell re-runnable once 'miles' has become an integer column).
miles_text = df2['miles'].astype(str)

# Set cells to null if its unit is time. Unlike quant_df, this frame was never
# lowercased, so the match has to ignore case ("30 Min" must not become 30 miles).
given_in_minutes = miles_text.str.contains('min', case=False)

# Keep the first run of digits only and store it as a nullable integer
df2['miles'] = (
    miles_text
    .where(~given_in_minutes)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .astype(pd.Int64Dtype())
)
df2.head()

## Part II: Analysis on Numerical and Categorical Features

#### Survey Question: Are you a bike rider? | Column: bike_rider

In [None]:
# count number of bike rider and non-bike rider
bike_rider_count = quant_df['bike_rider'].value_counts()

# value_counts() is labelled by answer text, so the entries are picked by
# position with .iloc; plain [0] on a text-labelled Series relies on a
# deprecated positional fallback.
# NOTE(review): positions 0/2/4 are treated as "yes"-type answers and 1/3 as
# "no"-type answers. This depends on the frequency order of this data set -
# confirm against the printed value_counts() whenever the data changes.
bike_rider_yes = bike_rider_count.iloc[[0, 2, 4]].sum()
bike_rider_no = bike_rider_count.iloc[[1, 3]].sum()

# create bar plot
x = ['Bike Riders', 'None Bike Riders']
y = [bike_rider_yes, bike_rider_no]
plt.bar(x, y)

# add label to bar plot
plt.xlabel('Riders')
plt.ylabel('Number of People')
plt.title('Number of Bike Riders vs. None Bike Riders')

# display the plot
plt.show()

# show share of bike riders vs. none bike riders, rounded to whole percent
bike_rider_total = bike_rider_yes + bike_rider_no
bike_percent_yes = round(bike_rider_yes / bike_rider_total * 100)
bike_percent_no = round(bike_rider_no / bike_rider_total * 100)
print(f'{bike_percent_yes}% of the participants are bike riders')
print(f'{bike_percent_no}% of the participants are NOT bike riders')

#### Survey Question: What is your zip code? | Column: zipcode

In [None]:
# ENTER CODE HERE
# TODO: the zipcode analysis has not been written yet; the Ellipsis below is a no-op placeholder
...

#### Survey Question: Do you or anyone in your family ride an e-bike? | Column: ebike_rider

In [None]:
# count number of ebike rider vs. non-ebike rider among participants who answered.
# Missing answers are removed first: subtracting the 'no' count from the total
# number of rows would count every blank answer as an e-bike rider.
ebike_answers = quant_df['ebike_rider'].dropna()
ebike_rider_no = int((ebike_answers == 'no').sum())
# any answer other than exactly 'no' is treated as "rides an e-bike"
ebike_rider_yes = len(ebike_answers) - ebike_rider_no

# create bar plot
x = ['E-Bike Riders', 'None E-Bike Riders']
y = [ebike_rider_yes, ebike_rider_no]
plt.bar(x, y)

# add label to bar plot
plt.xlabel('Riders')
plt.ylabel('Number of People')
plt.title('Number of E-Bike Riders vs. None E-Bike Riders')

# display the plot
plt.show()

# show share of e-bike riders vs. none e-bike riders, rounded to whole percent
ebike_rider_total = ebike_rider_yes + ebike_rider_no
ebike_percent_yes = round(ebike_rider_yes / ebike_rider_total * 100)
ebike_percent_no = round(ebike_rider_no / ebike_rider_total * 100)
print(f'{ebike_percent_yes}% of the participants are e-bike riders')
print(f'{ebike_percent_no}% of the participants are NOT e-bike riders')

#### Survey Question: How do you commute - to work, school, or for other purposes? | Column: commute_purpose

In [None]:
# TODO: the commute_purpose analysis has not been written yet; the Ellipsis below is a no-op placeholder
...

#### Survey Question: How far (ROUND TRIP) do you commute for any purpose each day? | Column: miles

In [None]:
# count number of miles
miles_count = quant_df['miles'].value_counts()

# create bar plot
x = miles_count.index
y = miles_count.values
plt.bar(x, y)

# add label to bar plot
plt.xlabel('Miles')
plt.ylabel('Number of People')
plt.title('Number of People by Miles')

# display the plot
plt.show()

In [None]:
# Participants per distance, in order of first appearance (sort=False),
# plus the two summary statistics drawn as reference lines
mile_counts = quant_df['miles'].value_counts(sort=False)
avg_miles = quant_df['miles'].mean()
med_miles = quant_df['miles'].median()

# Bar plot with a light horizontal grid
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(mile_counts.index, mile_counts.values, color='teal')
ax.set(xlabel='Miles', ylabel='Number of People', title='Number of Participants by Miles')
ax.grid(axis='y', alpha=0.5)

# Dashed vertical lines marking the average and the median distance
reference_lines = (
    (avg_miles, 'orange', f'Average Miles: {avg_miles:.2f}'),
    (med_miles, 'green', f'Median Miles: {med_miles}'),
)
for position, line_color, line_label in reference_lines:
    ax.axvline(x=position, color=line_color, linestyle='--', label=line_label)

# Fixed ticks every 10 miles from 0 to 50
ax.set_xticks(range(0, 51, 10))

ax.legend()
plt.show()

#### Survey Question: Would you commute by e-bike if given the opportunity? | Column: try_bike

In [None]:
# Count the number of occurrences of each unique value in the try_bike column
try_bike_counts = quant_df['try_bike'].value_counts()

# Create a bar plot
plt.bar(try_bike_counts.index, try_bike_counts.values)

# Set the title and axis labels. The title repeats the survey question behind
# try_bike; the earlier wording ("Have you ever tried a bike before?") described
# a different question from the one plotted.
plt.title('Would you commute by e-bike if given the opportunity?')
plt.xlabel('Answer')
plt.ylabel('Count')

# Show the plot
plt.show()

#### Survey Question: Do you know the differences between Class 1, Class 2, and Class 3 e-bikes? | Column: ebike_class

In [None]:
# Count the number of occurrences of each unique value in the ebike_class column
ebike_class_count = quant_df['ebike_class'].value_counts()

# Create a bar graph on an explicit Axes. Only the three answers listed in
# `order` are drawn, in that order.
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)
sns.barplot(x=ebike_class_count.index, y=ebike_class_count.values, palette='magma',
            order=['yes', 'a bit', 'no'], ax=ax)

# Set title and labels to the graph (the question covers Class 1, 2 and 3)
ax.set_title('Do you know the differences between Class 1, Class 2, and Class 3 e-bikes?')
ax.set_xlabel('Answer')
ax.set_ylabel('Count')

# Render the figure explicitly so the cell does not end on a Text(...) repr
plt.show()


#### Survey Question: How comfortable would you be storing your bike in a private bike locker while you are commuting? | Column: private_locker

In [None]:
# How many participants chose each comfort level for a private locker
private_locker_counts = quant_df['private_locker'].value_counts()

# Bar graph on a fresh 8x6 figure; seaborn hands back the Axes it drew on
plt.figure(figsize=(8, 6))
locker_ax = sns.barplot(x=private_locker_counts.index, y=private_locker_counts.values)
locker_ax.set(
    xlabel='Private Locker Option',
    ylabel='Number of People',
    title='Number of People by Private Locker Option',
)
plt.show()

#### Survey Question: How comfortable would you be storing your bike in a group bike locker while you are commuting? | Column: group_locker

In [None]:
# Comfort levels to display, in display order
category_order = ['very', 'somewhat', 'not very', 'depends', "don't know"]

# Count the answers without touching quant_df. Converting the column itself to
# a Categorical would silently turn every answer outside category_order into
# NaN for all later cells. reindex() gives one entry per listed level
# (0 when nobody chose it).
group_locker_counts = quant_df['group_locker'].value_counts().reindex(category_order, fill_value=0)

# Bar graph; `order` fixes the bar sequence, as in the other answer-level plots
plt.figure(figsize=(8, 6))
sns.barplot(x=group_locker_counts.index, y=group_locker_counts.values, order=category_order)
plt.xlabel('Group Locker Option')
plt.ylabel('Number of People')
plt.title('Number of People by Group Locker Option')
plt.show()

#### Survey Question: How familiar are you and those in your family with the rules of the road for e-bikes? | Column: ebike_rules

In [None]:
# Familiarity levels to display, in display order
order = ['very', 'somewhat', 'not very', "don't know"]

# How many participants chose each familiarity level
ebike_rules_counts = quant_df['ebike_rules'].value_counts()

# Bar graph on a fresh 8x6 figure; seaborn hands back the Axes it drew on
plt.figure(figsize=(8, 6))
rules_ax = sns.barplot(x=ebike_rules_counts.index, y=ebike_rules_counts.values, order=order)
rules_ax.set(
    xlabel='Ebike Rules Option',
    ylabel='Number of People',
    title='Number of People by Ebike Rules Option',
)
plt.show()

#### Survey Question: How concerned are you about e-bikes and e-bike safety in San Diego County? | Column: safty

In [None]:
# Concern levels to display, in display order
order = ['very', 'somewhat', 'not very', 'not concerned', "don't know"]

# How many participants chose each concern level
safety_counts = quant_df['safty'].value_counts()

# Bar graph on a fresh 8x6 figure; seaborn hands back the Axes it drew on
plt.figure(figsize=(8, 6))
safety_ax = sns.barplot(x=safety_counts.index, y=safety_counts.values, order=order)
safety_ax.set(
    xlabel='Safety Option',
    ylabel='Number of People',
    title='Number of People by Safety Option',
)
plt.show()

#### Survey Question: Are you interested in learning more about e-bikes? | Column: interested

In [None]:
# Fix typo and category in answers.
# NOTE(review): 'already know' / 'knows enough' are folded into 'yes' -
# confirm that this is the intended category for those answers.
answer_fixes = {
    'may be': 'maybe',
    'already know': 'yes',
    'knows enough': 'yes',
}
interested = quant_df['interested'].str.strip()
for raw_text, fixed_text in answer_fixes.items():
    # regex=False: these are literal substrings, and the default of `regex`
    # differs between pandas versions
    interested = interested.str.replace(raw_text, fixed_text, regex=False)
quant_df['interested'] = interested

# Count the number of occurrences of each unique value in the interested column
interested_count = quant_df['interested'].value_counts()

# Bar graph on an explicit Axes; only the three answers in `order` are drawn
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)
sns.barplot(x=interested_count.index, y=interested_count.values, palette='magma',
            order=['yes', 'maybe', 'no'], ax=ax)

# Set title and labels to the graph
ax.set_title('Are you interested in learning more about e-bikes?')
ax.set_xlabel('Answer')
ax.set_ylabel('Count')

# Render the figure explicitly so the cell does not end on a Text(...) repr
plt.show()

## Part III: Correlation Analysis

### bike_rider vs. ebike_rider, percentage shown as text

In [None]:
# Boolean views of the two columns: exactly 'yes' / exactly 'no'
bike_yes = quant_df['bike_rider'].eq('yes')
bike_no = quant_df['bike_rider'].eq('no')
ebike_yes = quant_df['ebike_rider'].eq('yes')
ebike_no = quant_df['ebike_rider'].eq('no')

# Correlation between the two 0/1 "answered yes" indicators
correlation = bike_yes.astype(int).corr(ebike_yes.astype(int))

# Participants per (bike_rider, ebike_rider) answer pair
yes_yes_count = len(quant_df[bike_yes & ebike_yes])
yes_no_count = len(quant_df[bike_yes & ebike_no])
no_yes_count = len(quant_df[bike_no & ebike_yes])
no_no_count = len(quant_df[bike_no & ebike_no])

# Shares are taken over ALL rows of quant_df, so rows with any other answer
# (or none) still count in the denominator. total_count is reused by the
# cells further down.
total_count = len(quant_df)
yes_yes_percent, yes_no_percent, no_yes_percent, no_no_percent = (
    round((pair_count / total_count) * 100, 2)
    for pair_count in (yes_yes_count, yes_no_count, no_yes_count, no_no_count)
)

print(f"Correlation between bike_rider and ebike_rider: {correlation}")
print(f"Percentage of participants who answered 'yes-yes': {yes_yes_percent}%")
print(f"Percentage of participants who answered 'yes-no': {yes_no_percent}%")
print(f"Percentage of participants who answered 'no-yes': {no_yes_percent}%")
print(f"Percentage of participants who answered 'no-no': {no_no_percent}%")

### bike rider vs. miles, bar graph

### ebike_rules vs. safety, percentage shown as text

In [None]:
# Answer levels for the two questions, in display order
ebike_rules_categories = ['very', 'somewhat', 'not very', "don't know"]
safety_categories = ['very', 'somewhat', 'not very', 'not concerned', "don't know"]

# percentages[i][j]: share of all participants (total_count, set in the
# bike_rider vs. ebike_rider cell) who gave ebike_rules answer i together
# with safety answer j, rounded to two decimals
percentages = [
    [
        round(
            (len(quant_df[(quant_df['ebike_rules'] == rules_answer)
                          & (quant_df['safty'] == safety_answer)]) / total_count) * 100,
            2,
        )
        for safety_answer in safety_categories
    ]
    for rules_answer in ebike_rules_categories
]

# Plain-text table, one line per answer pair
print("Percentage of Participants:")
print("--------------------------------------")
print("ebike_rules  |   Safety       |  Percentage")
print("--------------------------------------")
for rules_answer, percent_row in zip(ebike_rules_categories, percentages):
    for safety_answer, percent in zip(safety_categories, percent_row):
        print(f"{rules_answer:<12} | {safety_answer:<15} | {percent:>10}%")

### ebike_rules vs. safety, percentage by grouped bar chart

In [None]:
#grouped bar chart: one group per ebike_rules answer, one bar per safety answer
safety_categories = ['very', 'somewhat', 'not very', 'not concerned', "don't know"]
ebike_rules_categories = ['very', 'somewhat', 'not very', "don't know"]

# percentages[i][j]: share of all participants (total_count, set in the
# bike_rider vs. ebike_rider cell) who gave safety answer i together with
# ebike_rules answer j
percentages = []
for safety in safety_categories:
    row = []
    for ebike_rule in ebike_rules_categories:
        count = quant_df[(quant_df['safty'] == safety) & (quant_df['ebike_rules'] == ebike_rule)].shape[0]
        percentage = (count / total_count) * 100
        row.append(percentage)
    percentages.append(row)

percentages = np.array(percentages)

# Groups sit one unit apart on the x axis. A fixed bar width of 0.2 with five
# safety answers fills the whole unit, so neighbouring groups touch; deriving
# the width from the number of bars keeps a gap between groups.
group_width = 0.8
bar_width = group_width / len(safety_categories)
x = np.arange(len(ebike_rules_categories))

fig, ax = plt.subplots(figsize=(10, 6))
for i in range(len(safety_categories)):
    # shift each safety answer's bars sideways by one bar width
    ax.bar(x + (i * bar_width), percentages[i], bar_width, label=safety_categories[i])

ax.set_xlabel('ebike_rules')
ax.set_ylabel('Percentage')
ax.set_title('Percentage of Safety Responses by ebike_rules')
# put each tick under the middle of its group
ax.set_xticks(x + bar_width * (len(safety_categories) - 1) / 2)
ax.set_xticklabels(ebike_rules_categories)
ax.legend()

plt.show()

### ebike_rules vs. safety, percentage by clustered bar chart

In [None]:
#Clustered Bar Chart: one cluster per ebike_rules answer, one bar per safety answer
ebike_rules_categories = ['very', 'somewhat', 'not very', "don't know"]
safety_categories = ['very', 'somewhat', 'not very', 'not concerned', "don't know"]

# percentages[i, j]: share of all participants (total_count, set in the
# bike_rider vs. ebike_rider cell) who gave ebike_rules answer i together
# with safety answer j
percentages = np.array([
    [
        (quant_df[(quant_df['ebike_rules'] == rules_answer)
                  & (quant_df['safty'] == safety_answer)].shape[0] / total_count) * 100
        for safety_answer in safety_categories
    ]
    for rules_answer in ebike_rules_categories
])
bar_width = 0.15

# x positions per safety answer: each set is the previous one shifted right
# by one bar width
bar_positions = [np.arange(len(ebike_rules_categories))]
for _ in range(len(safety_categories) - 1):
    bar_positions.append([x + bar_width for x in bar_positions[-1]])

fig, ax = plt.subplots(figsize=(10, 6))
for column, (positions, safety_label) in enumerate(zip(bar_positions, safety_categories)):
    ax.bar(positions, percentages[:, column], width=bar_width, label=safety_label)

ax.set_xlabel('eBike Rules')
ax.set_ylabel('Percentage')
ax.set_title('Percentage of eBike Rules by Safety Response')
# the third (middle) bar of each cluster carries the tick
ax.set_xticks(bar_positions[2])
ax.set_xticklabels(ebike_rules_categories)
ax.legend()

plt.tight_layout()
plt.show()

### ebike_rules vs. safety, percentage by heatmap

In [None]:
#heatmap: rows are safety answers, columns are ebike_rules answers
# (path_effects comes from the notebook's import cell)
safety_categories = ['very', 'somewhat', 'not very', 'not concerned', "don't know"]
ebike_rules_categories = ['very', 'somewhat', 'not very', "don't know"]

# percentages[i, j]: share of all participants (total_count, set in the
# bike_rider vs. ebike_rider cell) who gave safety answer i together with
# ebike_rules answer j
percentages = []
for safety in safety_categories:
    row = []
    for ebike_rule in ebike_rules_categories:
        count = quant_df[(quant_df['safty'] == safety) & (quant_df['ebike_rules'] == ebike_rule)].shape[0]
        percentage = (count / total_count) * 100
        row.append(percentage)
    percentages.append(row)

percentages = np.array(percentages)

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(percentages, cmap='hot')
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel('Percentage', rotation=-90, va='bottom')

# one tick per category on each axis
ax.set_xticks(np.arange(len(ebike_rules_categories)))
ax.set_yticks(np.arange(len(safety_categories)))
ax.set_xticklabels(ebike_rules_categories)
ax.set_yticklabels(safety_categories)

plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

# write the value into every cell
for i in range(len(safety_categories)):
    for j in range(len(ebike_rules_categories)):
        percentage = percentages[i, j]
        # black text from 10% upwards, white text below that
        text_color = 'black' if percentage >= 10 else 'white'
        # outline the text in the colour the colormap gives this cell
        box_color = im.cmap(im.norm(percentage))
        text = ax.text(j, i, f'{percentage:.2f}%', ha='center', va='center', color=text_color, weight='bold')
        text.set_path_effects([path_effects.Stroke(linewidth=3, foreground=box_color), path_effects.Normal()])

ax.set_xlabel('eBike Rules')
ax.set_ylabel('Safety Response')
ax.set_title('Percentage of Safety Responses by eBike Rules')

plt.tight_layout()
plt.show()

### private_locker vs. group_locker, percentage by heatmap

In [None]:
#heatmap: rows are private_locker answers, columns are group_locker answers
# (seaborn is already imported as sns in the notebook's import cell)
private_locker_categories = ['very', 'somewhat', 'not very', "don't know"]
group_locker_categories = ['very', 'somewhat', 'not very', 'depends', "don't know"]

# percentages[i, j]: share of all participants (total_count, set in the
# bike_rider vs. ebike_rider cell) who gave private_locker answer i together
# with group_locker answer j
percentages = []
for private_locker in private_locker_categories:
    row = []
    for group_locker in group_locker_categories:
        count = quant_df[(quant_df['private_locker'] == private_locker) & (quant_df['group_locker'] == group_locker)].shape[0]
        percentage = (count / total_count) * 100
        row.append(percentage)
    percentages.append(row)

percentages = np.array(percentages)
fig, ax = plt.subplots(figsize=(8, 6))
# draw on the Axes created above instead of relying on the "current axes"
sns.heatmap(percentages, cmap='YlOrBr', annot=True, fmt=".1f", cbar=True, linewidths=0.5, ax=ax)

# heatmap cells are one unit wide, so +0.5 centres each tick on its cell
ax.set_xticks(np.arange(len(group_locker_categories)) + 0.5)
ax.set_yticks(np.arange(len(private_locker_categories)) + 0.5)
ax.set_xticklabels(group_locker_categories)
ax.set_yticklabels(private_locker_categories)

plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

ax.set_xlabel('Group Locker')
ax.set_ylabel('Private Locker')
ax.set_title('Percentage of Private Locker vs Group Locker')

plt.tight_layout()
plt.show()