In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the preprocessed trip table that the rest of the notebook works from.
trips_csv_path = '../data/FULL_preprocessed_data_RM_weather.csv'
df = pd.read_csv(trips_csv_path)

In [12]:
# Number of rows per confirmed travel mode, most frequent first.
df['Mode_confirm'].value_counts()

E-bike                  20029
Gas Car, drove alone    18412
Gas Car, with others    16684
Walk                    10857
Not a Trip               2420
Regular Bike             1703
Bus                      1363
Other                    1145
Taxi/Uber/Lyft            310
Bikeshare                  84
Train                      67
Scooter share              52
Free Shuttle               50
Skate board                41
Name: Mode_confirm, dtype: int64

In [43]:
# Build the modelling frame from the full trip table. (To study a single mode
# instead, filter first, e.g. df[df.Mode_confirm == 'E-bike'].)
#
# The list keeps household/respondent attributes, trip attributes, weather
# readings, the av_* and cost_* columns, and `chosen`, which the train/test
# split cell uses as the prediction target. Column order is significant: it
# fixes the feature order the model is fitted with.
model_columns = [
    'income_category', 'n_motor_vehicles',
    'n_residence_members', 'n_residents_u18', 'is_student',
    'n_residents_with_license', 'duration', 'distance_miles',
    'age', 'is_overnight_trip', 'n_working_residents', 'is_male',
    'start:sin_HOD',
    'start:cos_HOD',
    'end:sin_HOD', 'end:cos_HOD', 'temperature_2m (°F)',
    'relative_humidity_2m (%)', 'dew_point_2m (°F)', 'rain (inch)',
    'snowfall (inch)', 'cloud_cover (%)', 'wind_speed_10m (mp/h)',
    'wind_gusts_10m (mp/h)', 'section_distance_argmax',
    'section_duration_argmax', 'mph', 'chosen', 'av_no_trip', 'av_s_car',
    'av_p_micro', 'av_walk', 'av_s_micro', 'av_transit', 'av_ridehail',
    'av_car', 'av_unknown', 'cost_p_micro', 'cost_no_trip', 'cost_s_car',
    'cost_transit', 'cost_car', 'cost_s_micro', 'cost_ridehail',
    'cost_walk', 'cost_unknown',
]

# .copy() keeps hyp independent of df, so later edits to hyp cannot touch df.
hyp = df[model_columns].copy()

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Hold out 20% of the rows for evaluation, stratified on `chosen` so both
# splits keep the same class proportions. The fixed random_state makes the
# shuffle (and therefore every score computed below) reproducible across
# kernel restarts; without it each run produces a different split.
X_tr, X_te, Y_tr, Y_te = train_test_split(
    hyp.drop(columns=['chosen']),
    hyp[['chosen']],  # kept as a one-column frame; later cells call .values.ravel()
    test_size=0.2,
    shuffle=True,
    stratify=hyp[['chosen']],
    random_state=42,
)

In [46]:
# Fit a random forest with default hyper-parameters on the training split.
# random_state pins the bootstrap sampling and per-split feature sampling so
# the fitted model, its scores, and its feature importances are repeatable.
# Y_tr is a one-column frame, so it is flattened to the 1-D label array that
# fit() expects.
model = RandomForestClassifier(random_state=42).fit(X_tr, Y_tr.values.ravel())

In [47]:
# Predicted labels for the training and the held-out rows, in that order.
y_tr_pred, y_te_pred = model.predict(X_tr), model.predict(X_te)

In [48]:
# Weighted F1 on the training split (per-class F1 averaged by class support).
f1_score(Y_tr.to_numpy().ravel(), y_tr_pred, average='weighted')

1.0

In [49]:
# Weighted F1 on the held-out split. Compare with the training score: a large
# gap between the two indicates the forest is overfitting.
f1_score(Y_te.to_numpy().ravel(), y_te_pred, average='weighted')

0.7247243014745804

In [50]:
# Pair each fitted feature name with its importance and list them from the
# most to the least important.
ranked_importances = sorted(
    zip(model.feature_names_in_, model.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print(ranked_importances)

[('age', 0.07034490454052576), ('mph', 0.046077458861916464), ('distance_miles', 0.04227922185622416), ('income_category', 0.042045300984451986), ('dew_point_2m (°F)', 0.040326971736931075), ('temperature_2m (°F)', 0.04026013502695752), ('n_residence_members', 0.04011768641381229), ('duration', 0.038452743580322295), ('section_distance_argmax', 0.038170155240570436), ('cost_transit', 0.03781163632826536), ('section_duration_argmax', 0.035462758618913716), ('cost_s_car', 0.034390094889762884), ('n_motor_vehicles', 0.03379247604608051), ('n_residents_with_license', 0.03070869700490467), ('cost_ridehail', 0.03048846642958717), ('relative_humidity_2m (%)', 0.030090075033090597), ('wind_gusts_10m (mp/h)', 0.029081664383436266), ('n_working_residents', 0.028816144221162413), ('wind_speed_10m (mp/h)', 0.02858556819484081), ('cloud_cover (%)', 0.02495871571321774), ('cost_car', 0.023892323875139673), ('end:cos_HOD', 0.023765393583672044), ('start:cos_HOD', 0.023437391395931484), ('start:sin_HO

In [57]:
# Load the survey export, then re-read the same preprocessed trip CSV used at
# the top of the notebook so this section starts from a fresh `df`.
survey_csv_path = '../viz_scripts/Can Do Colorado eBike Program - en.csv'
trips_csv_path = '../data/FULL_preprocessed_data_RM_weather.csv'

survey_data = pd.read_csv(survey_csv_path)
df = pd.read_csv(trips_csv_path)

In [66]:
# Map the verbose survey question headers to short snake_case column names.
# Keys must match the CSV headers exactly (note the trailing space in the
# "primary job last week? " question). Headers missing from the frame are
# silently ignored by rename(), so re-running this cell is harmless.
survey_column_names = {
    "Unique User ID (auto-filled, do not edit)": "user_id",
    "In which year were you born?": "birth_year",
    "What is your gender?": "gender",
    "Do you have a valid driver's license?": "has_drivers_license",
    "Are you a student?": "is_student",
    "What is the highest grade or degree that you have completed?": "highest_education",
    "Do you work for either pay or profit?": "is_paid",
    "Do you have more than one job?": "has_multiple_jobs",
    "Do you work full-time or part-time at your primary job?": "primary_job_type",
    "Which best describes your primary job?": "primary_job_description",
    "How did you usually get to your primary job last week? ": "primary_job_commute_mode",
    "Thinking about your daily commute to work last week, how many minutes did it usually take to get from home to the primary job/work place?": "primary_job_commute_time",
    "At your primary job, do you have the ability to set or change your own start time?": "is_primary_job_flexible",
    "Do you have the option of working from home or an alternate location instead of going into your primary work place?": "primary_job_can_wfh",
    "How many days per week do you usually work from home or an alternate location?": "wfh_days",
    "Do you own or rent your place of residence?": "residence_ownership_type",
    "What is your home type?": "residence_type",
    "Please identify which category represents your total household income, before taxes, for last year.": "income_category",
    "Including yourself, how many people live in your home?": "n_residence_members",
    "How many children under age 18 live in your home?": "n_residents_u18",
    "Including yourself, how many people have a driver's license in your household?": "n_residents_with_license",
    "How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?": "n_motor_vehicles",
    "If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?": "available_modes",
    "Do you have a medical condition that makes it difficult to travel outside of the home?": "has_medical_condition",
    "How long have you had this condition?": "medical_condition_duration",
}

# Reassign instead of mutating in place: the cell then reads as an explicit
# "old frame -> renamed frame" step and avoids the discouraged inplace=True.
survey_data = survey_data.rename(columns=survey_column_names)

In [67]:
# Peek at the survey-side identifiers to check their format before joining.
display(survey_data['user_id'].head())

0    a2d48b05d5454d428c0841432c7467b6
1    f2799dc202bc4249b42a4fda8770d1b6
2    b2bbe715b6a14fd19f751cae8adf6b4e
3    6373dfb8cb9b47e88e8f76adcfadde20
4    93c6e0f156a44e07b920ded664419dc6
Name: user_id, dtype: object

In [68]:
# Peek at the trip-side identifiers; they should use the same format as the
# survey's user_id values.
display(df['user_id'].head())

0    6373dfb8cb9b47e88e8f76adcfadde20
1    6373dfb8cb9b47e88e8f76adcfadde20
2    6373dfb8cb9b47e88e8f76adcfadde20
3    6373dfb8cb9b47e88e8f76adcfadde20
4    6373dfb8cb9b47e88e8f76adcfadde20
Name: user_id, dtype: object

In [70]:
# Count of distinct user_id values in the survey (a missing id, if present,
# counts as one distinct value).
survey_data['user_id'].unique().size

203

In [74]:
# Keep only the survey rows whose user_id also appears in the trip table.
has_trip_rows = survey_data['user_id'].isin(df['user_id'])
f_survey = survey_data.loc[has_trip_rows, :]

In [75]:
# Count of distinct survey respondents that have at least one row in the trip table.
f_survey['user_id'].unique().size

156