# Gender mode and purpose correlations

Data needs: a mongodump of the trip data and a CSV file containing all demographic survey responses.

In [None]:
import pandas as pd

import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
import scaffolding
from uuid import UUID

from collections import defaultdict
import matplotlib.pyplot as plt

%matplotlib inline

import folium
import folium.plugins as fpl
import numpy as np
import branca.element as bre
import functools

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

# sns.set() is an alias of sns.set_theme() and re-applies the default
# "darkgrid" style, so it has to run first; calling it after set_style()
# silently discards the whitegrid request.
sns.set()
sns.set_style("whitegrid")
%matplotlib inline

# Notebook-wide matplotlib defaults: a 10x8 figure canvas and "small" text
# for the legend, axis labels, titles and tick labels.
params = {'legend.fontsize': 'small', 'figure.figsize': (10, 8)}
params.update(
    dict.fromkeys(
        ('axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'),
        'small',
    )
)
plt.rcParams.update(params)


import importlib

In [None]:
# Notebook parameters. year, month, program, study_type and include_test_users
# are forwarded to scaffolding.load_viz_notebook_data() in a later cell.
year = None  # NOTE(review): presumably None means "no year filter" - confirm in scaffolding
month = None  # NOTE(review): presumably None means "no month filter" - confirm in scaffolding
# program = "study"  (alternative value kept for reference; the active assignment is below)
study_type = "study"
mode_of_interest = "e-bike"  # not referenced by any other cell visible in this notebook
include_test_users = False  # no test users
program = 'usaid-laos-ev'

dynamic_labels = {
    "MODE": [
      {"value":"walk", "baseMode":"WALKING", "met_equivalent":"WALKING", "kgCo2PerKm": 0},
      {"value":"e-auto_rickshaw", "baseMode":"MOPED", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.085416859},
      {"value":"auto_rickshaw", "baseMode":"MOPED", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.231943784},
      {"value":"motorcycle", "baseMode":"MOPED", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.113143309},
      {"value":"e-bike", "baseMode":"E_BIKE", "met": {"ALL": {"range": [0, -1], "mets": 4.9}}, "kgCo2PerKm": 0.00728},
      {"value":"bike", "baseMode":"BICYCLING", "met_equivalent":"BICYCLING", "kgCo2PerKm": 0},
      {"value":"drove_alone", "baseMode":"CAR", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.22031},
      {"value":"shared_ride", "baseMode":"CAR", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.11015},
      {"value":"e_car_drove_alone", "baseMode":"E_CAR", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.08216},
      {"value":"e_car_shared_ride", "baseMode":"E_CAR", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.04108},
      {"value":"taxi", "baseMode":"TAXI", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.30741},
      {"value":"bus", "baseMode":"BUS", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.20727},
      {"value":"train", "baseMode":"TRAIN", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.12256},
      {"value":"free_shuttle", "baseMode":"BUS", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.20727},
      {"value":"air", "baseMode":"AIR", "met_equivalent":"IN_VEHICLE", "kgCo2PerKm": 0.09975},
      {"value":"not_a_trip", "baseMode":"UNKNOWN", "met_equivalent":"UNKNOWN", "kgCo2PerKm": 0},
      {"value":"other", "baseMode":"OTHER", "met_equivalent":"UNKNOWN", "kgCo2PerKm": 0}
    ],
    "PURPOSE": [
      {"value":"home"},
      {"value":"work"},
      {"value":"at_work"},
      {"value":"school"},
      {"value":"transit_transfer"},
      {"value":"shopping"},
      {"value":"meal"},
      {"value":"pick_drop_person"},
      {"value":"pick_drop_item"},
      {"value":"personal_med"},
      {"value":"access_recreation"},
      {"value":"exercise"},
      {"value":"entertainment"},
      {"value":"religious"},
      {"value":"other"}
    ],
    "translations": {
      "en": {
        "walk": "Walk",
        "e-auto_rickshaw":"E-tuk tuk",
        "auto_rickshaw":"Tuk Tuk",
        "motorcycle":"Motorcycle",
        "e-bike": "E-bike",
        "bike": "Bicycle",
        "drove_alone": "Car Drove Alone",
        "shared_ride": "Car Shared Ride",
        "e_car_drove_alone": "E-Car Drove Alone",
        "e_car_shared_ride": "E-Car Shared Ride",
        "taxi": "Taxi/Loca/inDrive",
        "bus": "Bus",
        "train": "Train",
        "free_shuttle": "Free Shuttle",
        "air": "Airplane",
        "not_a_trip": "Not a trip",
        "home": "Home",
        "work": "To Work",
        "at_work": "At Work",
        "school": "School",
        "transit_transfer": "Transit transfer",
        "shopping": "Shopping",
        "meal": "Meal",
        "pick_drop_person": "Pick-up/ Drop off Person",
        "pick_drop_item": "Pick-up/ Drop off Item",
        "personal_med": "Personal/ Medical",
        "access_recreation": "Access Recreation",
        "exercise": "Recreation/ Exercise",
        "entertainment": "Entertainment/ Social",
        "religious": "Religious",
        "other": "Other"
      },
      "lo": {
        "walk": "ດ້ວຍການຍ່າງ",
        "e-auto_rickshaw":"ລົດ 3 ລໍ້ໄຟຟ້າ ຫລື ຕຸກຕຸກໄຟຟ້າ",
        "auto_rickshaw":"ເດີນທາດ້ວຍ ລົດຕຸກຕຸກ ຫລື ລົດສາມລໍ້",
        "motorcycle":"ລົດຈັກ",
        "e-bike": "ວຍລົດຈັກໄຟຟ້າ",
        "bike": "ລົດຖີບ",
        "drove_alone": "ເດີນທາງ ດ້ວຍລົດໃຫ່ຍ ເຊີ່ງເປັນລົດທີ່ຂັບເອງ",
        "shared_ride": "ເດີນທາງດ້ວຍລົດໃຫ່ຍ ຮ່ວມກັບລົດຄົນອຶ່ນ",
        "e_car_drove_alone": "ດ້ວຍການຂັບລົດໄຟຟ້າໄປເອງ",
        "e_car_shared_ride": "ດ້ວຍການຈ້າງລົດໄຟຟ້າໄປ",
        "taxi": "ແທັກຊີ",
        "bus": "ລົດເມ",
        "train": "ລົດໄຟ",
        "free_shuttle": "ລົດຮັບສົ່ງຟຣີ",
        "air": "ຍົນ",
        "not_a_trip": "ບໍ່ແມ່ນການເດີນທາງ",
        "home": "ບ້ານ",
        "work": "ໄປເຮັດວຽກ",
        "at_work": "ຢູ່ບ່ອນເຮັດວຽກ",
        "school": "ໄປໂຮງຮຽນ",
        "transit_transfer": "ການຖ່າຍໂອນການເດີນທາງ",
        "shopping": "ຊອບປິ້ງ",
        "meal": "ອາຫານ",
        "pick_drop_person": "ໄປຮັບ ຫລື ສົນ ຄົນ",
        "pick_drop_item": "ໄປຮັບ ຫລື ສົ່ງສິນຄ້າ",
        "personal_med": "ໄປຫາໝໍ",
        "access_recreation": "ເຂົ້າເຖິງການພັກຜ່ອນ",
        "exercise": "ພັກຜ່ອນ/ອອກກຳລັງກາຍ",
        "entertainment": "ບັນເທີງ/ສັງຄົມ",
        "religious": "ຈຸດປະສົງທາງສາດສະໜາ",
        "other": "ອື່ນໆ"
      }
    }
  }

In [None]:
# Workaround for custom labels :)
# TODO: remove and just use underlying labels
def mapping_labels(dynamic_labels, label_type):
    """Build a raw-label -> English display-name lookup for one label type.

    Parameters
    ----------
    dynamic_labels : dict
        Label configuration. Entries for a label type are a list of dicts
        that each carry a ``"value"`` key; English display names are read
        from ``dynamic_labels["translations"]["en"]`` when present.
    label_type : str
        Key of ``dynamic_labels`` to map, e.g. ``"MODE"`` or ``"PURPOSE"``.

    Returns
    -------
    collections.defaultdict
        Maps every configured value to its English translation. A value
        without a translation (or a configuration without an English
        translation section) maps to the raw value itself instead of
        ``None``. Looking up a value that is not configured at all yields
        ``'Other'``.
    """
    # Missing sections degrade to empty containers so the function always
    # returns a usable mapping rather than an implicit None.
    translations = dynamic_labels.get("translations", {}).get("en", {})
    translation_mapping = {
        label["value"]: translations.get(label["value"], label["value"])
        for label in dynamic_labels.get(label_type, [])
    }
    dic_mapping = defaultdict(lambda: 'Other', translation_mapping)
    print(dic_mapping)
    return dic_mapping

# Lookup from raw mode label to English display name.
dic_re = mapping_labels(dynamic_labels, 'MODE')

# Lookup from raw purpose label to English display name.
dic_pur = mapping_labels(dynamic_labels, 'PURPOSE')


## Load data and match with surveys

In [None]:
expanded_ct, file_suffix, quality_text, debug_df = await scaffolding.load_viz_notebook_data(year,
                                                                            month,
                                                                            program,
                                                                            study_type,
                                                                            labels=dynamic_labels,
                                                                            include_test_users=include_test_users)

In [None]:
# Read the demographic survey export (DemographicData.csv in the working directory).
surveys = pd.read_csv('DemographicData.csv')
print(len(surveys), 'surveys')

# Drop responses without a user id. The explicit .copy() makes socio_data an
# independent frame, so the column assignment below neither triggers
# SettingWithCopyWarning nor depends on whether pandas returned a view.
socio_data = surveys[surveys['user_id'].notna()].copy()
print(len(socio_data), 'surveys after dropping null ids')

# Keep one response per user: the row with the largest data.ts
# (presumably the latest submission - confirm data.ts is a timestamp).
socio_data = (
    socio_data.sort_values(by=['user_id', 'data.ts'])
    .drop_duplicates(subset=['user_id'], keep='last')
)
print(len(socio_data),'surveys', socio_data['user_id'].nunique(), 'users after dropping duplicates')

# Build the merge key: id as string, surrounding whitespace stripped, dashes removed.
socio_data['user_id_socio'] = (
    socio_data['user_id'].astype(str).str.strip().str.replace('-', '', regex=False)
)
socio_data = socio_data.drop(columns='user_id')

In [None]:
# Build the trip-side merge key the same way as for the surveys:
# id as string, whitespace stripped, dashes removed.
trips = expanded_ct.copy()
print(len(trips), 'trips')
print(trips['user_id'].nunique(), 'people')
trips['user_id_socio'] = (
    trips['user_id'].astype(str).str.strip().str.replace('-', '', regex=False)
)

# Join trips to survey answers on the normalised id (pandas' default inner
# join, so trips without a matching survey are dropped).
data = pd.merge(trips, socio_data, on='user_id_socio')
print(len(data), 'trips after merging')
print(data['user_id_socio'].nunique(), 'people after merging')

In [None]:
# Show the merged column names (trip columns plus survey question columns).
data.columns

## Group data by gender of traveler

In [None]:
# Gender - mode correlation: number and share of trips per mode, split by gender.
mode_gender_data = data[['What_is_your_gender', 'Mode_confirm', 'distance', 'user_id_socio']]
# .copy() so the dtype conversions below act on an independent frame.
mode_gender_data = mode_gender_data[mode_gender_data.Mode_confirm.notna()].copy()

print(len(mode_gender_data), "trips")
mode_gender_data['What_is_your_gender'] = mode_gender_data['What_is_your_gender'].astype('category')
mode_gender_data['Mode_confirm'] = mode_gender_data['Mode_confirm'].astype('category')

gender_man = mode_gender_data[mode_gender_data['What_is_your_gender'] == 'man']
print(len(gender_man), "trips by men")
print(gender_man.user_id_socio.nunique(), "men")
gender_woman = mode_gender_data[mode_gender_data['What_is_your_gender'] == 'woman']
print(len(gender_woman), "trips by women")
print(gender_woman.user_id_socio.nunique(), "women")

# observed=False is spelled out on every categorical groupby: it keeps modes
# that one gender never used (as zero rows), which the stacked-bar cell relies
# on, and it protects against pandas changing the default.
unique_man_mode = gender_man.groupby(['Mode_confirm'], as_index=False, observed=False).nunique()[['Mode_confirm','user_id_socio']]
unique_woman_mode = gender_woman.groupby(['Mode_confirm'], as_index=False, observed=False).nunique()[['Mode_confirm','user_id_socio']]


def _mode_share(gender_trips, gender_label):
    """Return per-mode trip counts and proportions for one gender's trips."""
    shares = (
        gender_trips.groupby(['Mode_confirm'], as_index=False, observed=False)
        .count()[['Mode_confirm', 'distance']]
        .copy()
    )
    # 'distance' holds the per-mode count of non-null distances at this point.
    shares['proportion'] = shares['distance'] / shares['distance'].sum()
    shares['trip_type'] = gender_label
    shares.columns = ['Mode', 'Count', 'Proportion', 'Gender']
    return shares


t1 = _mode_share(gender_man, 'Man')
t2 = _mode_share(gender_woman, 'Woman')

# Long-format table for plotting: fresh 0..n-1 index, then ordered by gender
# and by descending count within each gender.
plot_data = (
    pd.concat([t1, t2], ignore_index=True)
    .sort_values(["Gender", "Count"], ascending=False)
)

plot_data

In [None]:
def mapping_color_labels(dynamic_labels, dic_re, dic_pur):
    """Assign a tab20 colour to every mode and purpose display label.

    Parameters
    ----------
    dynamic_labels : dict
        Label configuration. When non-empty, display labels are taken from
        ``mapping_labels`` for the "MODE", "REPLACED_MODE" and "PURPOSE"
        sections that are present.
    dic_re : mapping
        Mode label mapping; its values are used only when
        ``dynamic_labels`` is empty.
    dic_pur : mapping
        Purpose label mapping; its values are used only when
        ``dynamic_labels`` is empty.

    Returns
    -------
    tuple of (dict, dict)
        ``(colors_mode, colors_purpose)``, each mapping a display label to
        a colour from ``plt.cm.tab20``. The palette is cycled, so labels
        past the 20th reuse colours instead of being left without one.
    """
    # Imported locally: the notebook's import cell only brings in
    # defaultdict from collections, so OrderedDict was previously undefined.
    import itertools
    from collections import OrderedDict

    def _display_names(label_type):
        # A section that is absent contributes no labels; so does a None
        # result from mapping_labels.
        if label_type not in dynamic_labels:
            return []
        mapping = mapping_labels(dynamic_labels, label_type)
        return list(mapping.values()) if mapping is not None else []

    if dynamic_labels:
        combined_mode_values = _display_names("MODE") + _display_names("REPLACED_MODE")
        purpose_values = _display_names("PURPOSE")
    else:
        # Addition of 'Other' is required to the list since it's missing from
        # auxillary_files/mode_labels.csv and auxillary_files/purpose_labels.csv
        combined_mode_values = list(OrderedDict.fromkeys(dic_re.values())) + ['Other']
        purpose_values = list(OrderedDict.fromkeys(dic_pur.values())) + ['Other']

    palette = plt.cm.tab20.colors
    colors_mode = dict(zip(combined_mode_values, itertools.cycle(palette)))
    colors_purpose = dict(zip(purpose_values, itertools.cycle(palette)))

    return colors_mode, colors_purpose

In [None]:
## code from minipilot vs full pilot regular / commute trip split -- has better labels!!
#TODO: use scaffolding colors and underlying labels
mode_colors, purpose_colors = mapping_color_labels(dynamic_labels, dic_re, dic_pur)
print(mode_colors)

width = 0.8
fig, ax = plt.subplots(figsize=(10, 6))
fig_data = plot_data.copy()

# Left edge of the next bar segment, tracked per gender. Keying by gender
# (instead of by row position) keeps the stacks aligned even when a mode has
# a row for only one gender or the rows arrive in a different order.
running_total = {gender: 0.0 for gender in pd.unique(fig_data.Gender)}

# plt.set_cmap('tab20')

# One horizontal stacked segment per mode, drawn on each gender's bar.
for mode in pd.unique(fig_data.Mode):
    band_data = fig_data[fig_data['Mode']==mode]

    labels = band_data['Gender']
    vals = band_data['Proportion']*100
    bar_labels = band_data['Count']
    lefts = [running_total[gender] for gender in labels]

    # Only annotate segments wider than 6 % so the text fits inside them.
    vals_str = [f'{y:.1f} %\n({x:,})' if y>6 else '' for x, y in zip(bar_labels, vals)]
    bar = ax.barh(labels, vals, width, left=lefts, label=mode, color=mode_colors[mode])
    ax.bar_label(bar, label_type='center', labels=vals_str, rotation=90, fontsize=12)
    for gender, val in zip(labels, vals):
        running_total[gender] += val

ax.set_title('Mode Distribution', fontsize=25)
ax.legend(bbox_to_anchor=(1,1), fancybox=True, shadow=True, fontsize=12)
plt.subplots_adjust(bottom=0.20)
# plt.set_cmap('tab20')
fig.tight_layout()
plt.show()

In [None]:
# Gender - purpose correlation: number and share of trips per purpose, split by gender.
# NOTE: the variable names still say "mode" (mode_gender_data, unique_*_mode);
# they are kept unchanged in case other cells refer to them.
mode_gender_data = data[['What_is_your_gender', 'Trip_purpose', 'distance', 'user_id_socio']]
# .copy() so the dtype conversions below act on an independent frame.
mode_gender_data = mode_gender_data[mode_gender_data.Trip_purpose.notna()].copy()

print(len(mode_gender_data))
mode_gender_data['What_is_your_gender'] = mode_gender_data['What_is_your_gender'].astype('category')
mode_gender_data['Trip_purpose'] = mode_gender_data['Trip_purpose'].astype('category')

gender_man = mode_gender_data[mode_gender_data['What_is_your_gender'] == 'man']
print(len(gender_man))
gender_woman = mode_gender_data[mode_gender_data['What_is_your_gender'] == 'woman']
print(len(gender_woman))

# observed=False is spelled out on every categorical groupby: it keeps
# purposes that one gender never reported (as zero rows), which the
# stacked-bar cell relies on, and it protects against pandas changing the default.
unique_man_mode = gender_man.groupby(['Trip_purpose'], as_index=False, observed=False).nunique()[['Trip_purpose','user_id_socio']]
unique_woman_mode = gender_woman.groupby(['Trip_purpose'], as_index=False, observed=False).nunique()[['Trip_purpose','user_id_socio']]


def _purpose_share(gender_trips, gender_label):
    """Return per-purpose trip counts and proportions for one gender's trips."""
    shares = (
        gender_trips.groupby(['Trip_purpose'], as_index=False, observed=False)
        .count()[['Trip_purpose', 'distance']]
        .copy()
    )
    # 'distance' holds the per-purpose count of non-null distances at this point.
    shares['proportion'] = shares['distance'] / shares['distance'].sum()
    shares['trip_type'] = gender_label
    shares.columns = ['Purpose', 'Count', 'Proportion', 'Gender']
    return shares


t1 = _purpose_share(gender_man, 'Man')
t2 = _purpose_share(gender_woman, 'Woman')

# Long-format table for plotting: fresh 0..n-1 index, then ordered by gender
# and by descending count within each gender.
plot_data = (
    pd.concat([t1, t2], ignore_index=True)
    .sort_values(["Gender", "Count"], ascending=False)
)

plot_data

In [None]:
## code from minipilot vs full pilot regular / commute trip split -- has better labels!!
mode_colors, purpose_colors = mapping_color_labels(dynamic_labels, dic_re, dic_pur)


width = 0.8
fig, ax = plt.subplots(figsize=(10, 6))
fig_data = plot_data.copy()

# Left edge of the next bar segment, tracked per gender. Keying by gender
# (instead of by row position) keeps the stacks aligned even when a purpose
# has a row for only one gender or the rows arrive in a different order.
running_total = {gender: 0.0 for gender in pd.unique(fig_data.Gender)}

# plt.set_cmap('tab20')
#TODO: debug this chart
# One horizontal stacked segment per purpose, drawn on each gender's bar.
for purp in pd.unique(fig_data.Purpose):
    band_data = fig_data[fig_data['Purpose']==purp]

    labels = band_data['Gender']
    vals = band_data['Proportion']*100
    bar_labels = band_data['Count']
    lefts = [running_total[gender] for gender in labels]

    # Only annotate segments wider than 6 % so the text fits inside them.
    vals_str = [f'{y:.1f} %\n({x:,})' if y>6 else '' for x, y in zip(bar_labels, vals)]
    bar = ax.barh(labels, vals, width, left=lefts, label=purp, color=purpose_colors[purp])
    ax.bar_label(bar, label_type='center', labels=vals_str, rotation=90, fontsize=12)
    for gender, val in zip(labels, vals):
        running_total[gender] += val

ax.set_title('Purpose Distribution', fontsize=25)
ax.legend(bbox_to_anchor=(1,1), fancybox=True, shadow=True, fontsize=12)
plt.subplots_adjust(bottom=0.20)
# plt.set_cmap('tab20')
fig.tight_layout()
plt.show()

In [None]:
# Function to categorize slices into "other" if percentage is less than limit%
def categorize_into_other(grouped_data, limit, label_col='purpose_confirm'):
    """Collapse categories below ``limit`` percent into one 'Other' row.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        One row per category with a numeric ``'count'`` column. The frame
        is not modified.
    limit : float
        Percentage threshold; rows whose share of the total count is
        strictly below it are merged.
    label_col : str, default 'purpose_confirm'
        Column that receives the ``'Other'`` label on the merged row.

    Returns
    -------
    pandas.DataFrame
        Rows at or above the threshold, with a ``'percentage'`` column,
        followed by an ``'Other'`` row carrying the merged count and
        percentage when at least one trip was merged. The index is reset
        only when the ``'Other'`` row is added.
    """
    total_count = grouped_data['count'].sum()
    # assign() returns a new frame, leaving the caller's data untouched.
    with_share = grouped_data.assign(
        percentage=(grouped_data['count'] / total_count) * 100
    )
    other_count = with_share.loc[with_share['percentage'] < limit, 'count'].sum()
    kept = with_share[with_share['percentage'] >= limit]
    if other_count > 0:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        other_row = pd.DataFrame([{
            label_col: 'Other',
            'count': other_count,
            'percentage': (other_count / total_count) * 100,
        }])
        kept = pd.concat([kept, other_row], ignore_index=True)
    return kept

In [None]:
print(data['purpose_confirm'].count(), "trips")

# Trips per (user, purpose) pair, ordered by purpose.
grouped_data = (
    data.groupby(['user_id', 'Trip_purpose'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['Trip_purpose'])
)

# Per-user grouping, used to get each user's total trip count.
grouped_df = grouped_data.groupby('user_id')
total_trips = grouped_df['count'].transform('sum')

# Share of every purpose within each user's trips (computed before filtering).
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four purposes shown in the chart.
purposes_of_interest = ["To Work", "School", "Meal", "Shopping"]
grouped_data = grouped_data[grouped_data['Trip_purpose'].isin(purposes_of_interest)]

# Box-and-whisker plot of the per-user percentages, outliers hidden.
fig, ax = plt.subplots(figsize=(10,8))
plot_title = 'Percentage of Purpose Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(
    data=grouped_data,
    x='Trip_purpose',
    y='percentage',
    hue='Trip_purpose',
    showfliers=False,
    ax=ax,
)
ax.set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])

print(grouped_data['user_id'].nunique(), "people")

In [None]:
men_full_data = data.loc[data['What_is_your_gender'].eq('man')]

print(men_full_data['mode_confirm'].count(), "trips")

# Trips per (user, purpose) pair among men, ordered by purpose.
purpose_counts = men_full_data.groupby(['user_id', 'Trip_purpose']).size()
grouped_data = purpose_counts.reset_index(name='count').sort_values(by=['Trip_purpose'])

# Per-user grouping, used to get each user's total trip count.
grouped_df = grouped_data.groupby('user_id')
total_trips = grouped_df['count'].transform('sum')

# Share of every purpose within each user's trips (computed before filtering).
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four purposes shown in the chart.
shown_purposes = ("To Work", "School", "Meal", "Shopping")
grouped_data = grouped_data[grouped_data['Trip_purpose'].isin(shown_purposes)]

# Box-and-whisker plot of the per-user percentages, outliers hidden.
fig, ax = plt.subplots(figsize=(10,8))
plot_title = '[Men] Percentage of Purpose Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(
    data=grouped_data,
    x='Trip_purpose',
    y='percentage',
    hue='Trip_purpose',
    showfliers=False,
    ax=ax,
)
ax.set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])

print(grouped_data['user_id'].nunique(), "people")

In [None]:
is_woman = data['What_is_your_gender'] == 'woman'
women_full_data = data[is_woman]

print(women_full_data['mode_confirm'].count(), "trips")

# Trips per (user, purpose) pair among women, ordered by purpose.
grouped_data = (
    women_full_data.groupby(['user_id', 'Trip_purpose'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['Trip_purpose'])
)

# Per-user grouping, used to get each user's total trip count.
grouped_df = grouped_data.groupby('user_id')
total_trips = grouped_df['count'].transform('sum')

# Share of every purpose within each user's trips (computed before filtering).
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four purposes shown in the chart.
keep_rows = grouped_data['Trip_purpose'].isin(["To Work", "School", "Meal", "Shopping"])
grouped_data = grouped_data[keep_rows]

# Box-and-whisker plot of the per-user percentages, outliers hidden.
fig, ax = plt.subplots(figsize=(10,8))
plot_title = '[Women] Percentage of Purpose Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(
    data=grouped_data,
    x='Trip_purpose',
    y='percentage',
    hue='Trip_purpose',
    showfliers=False,
    ax=ax,
)
ax.set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])

print(grouped_data['user_id'].nunique(), "people")

In [None]:
print(data['mode_confirm'].dropna().size, "trips")
# Group by user and mode, counting the trips in each group
grouped_data = data.groupby(['user_id', 'Mode_confirm']).size().reset_index(name='count')

grouped_data = grouped_data.sort_values(by=['Mode_confirm'])

# Per-user grouping, used to get each user's total trip count
grouped_df = grouped_data.groupby('user_id')

# Total number of trips for each user, aligned to the rows of grouped_data
total_trips = grouped_df['count'].transform('sum')

# Share of each mode within each user's trips (computed before filtering)
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four modes shown in the chart
grouped_data = grouped_data.query('Mode_confirm=="Motorcycle" | Mode_confirm=="Walk" | Mode_confirm=="Car Drove Alone" | Mode_confirm=="Car Shared Ride"')

# Plotting the box and whisker plot
fig, ax = plt.subplots(figsize=(10,8))
plot_title = 'Percentage of Mode Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(ax=ax, data=grouped_data, x='Mode_confirm', y='percentage', hue='Mode_confirm', showfliers=False).set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])
# This cell covers every user, so the count is labelled "people" (it used to say "men").
print(grouped_data['user_id'].nunique(), "people")

In [None]:
men_full_data = data.loc[data['What_is_your_gender'].eq('man')]

print(men_full_data['mode_confirm'].count(), "trips")

# Trips per (user, mode) pair among men, ordered by mode.
grouped_data = (
    men_full_data.groupby(['user_id', 'Mode_confirm'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['Mode_confirm'])
)

# Per-user grouping, used to get each user's total trip count.
grouped_df = grouped_data.groupby('user_id')
total_trips = grouped_df['count'].transform('sum')

# Share of every mode within each user's trips (computed before filtering).
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four modes shown in the chart.
modes_of_interest = ["Motorcycle", "Walk", "Car Drove Alone", "Car Shared Ride"]
grouped_data = grouped_data[grouped_data['Mode_confirm'].isin(modes_of_interest)]
men_mode = grouped_data.copy()

# Box-and-whisker plot of the per-user percentages, outliers hidden.
fig, ax = plt.subplots(figsize=(10,8))
plot_title = '[Men] Percentage of Mode Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(
    data=grouped_data,
    x='Mode_confirm',
    y='percentage',
    hue='Mode_confirm',
    showfliers=False,
    ax=ax,
)
ax.set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])
print(grouped_data['user_id'].nunique(), "men")

# Snapshot for the combined men/women comparison cells below.
man_mode_data = grouped_data.copy()

In [None]:
women_full_data = data[data['What_is_your_gender'] == 'woman']

print(women_full_data['mode_confirm'].dropna().size, "trips")
# Group by user and mode, counting the trips in each group
grouped_data = women_full_data.groupby(['user_id', 'Mode_confirm']).size().reset_index(name='count')

grouped_data = grouped_data.sort_values(by=['Mode_confirm'])

# Per-user grouping, used to get each user's total trip count
grouped_df = grouped_data.groupby('user_id')

# Total number of trips for each user, aligned to the rows of grouped_data
total_trips = grouped_df['count'].transform('sum')

# Share of each mode within each user's trips (computed before filtering)
grouped_data['percentage'] = (grouped_data['count'] / total_trips) * 100

# Keep only the four modes shown in the chart
grouped_data = grouped_data.query('Mode_confirm=="Motorcycle" | Mode_confirm=="Walk" | Mode_confirm=="Car Drove Alone" | Mode_confirm=="Car Shared Ride"')
women_mode = grouped_data.copy()

# Plotting the box and whisker plot
fig, ax = plt.subplots(figsize=(10,8))
plot_title = '[Women] Percentage of Mode Choice for Each User'
ylab = 'Percentage of trips'
sns.boxplot(ax=ax, data=grouped_data, x='Mode_confirm', y='percentage', hue='Mode_confirm', showfliers=False).set(title=plot_title, xlabel='', ylabel=ylab)

plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=35, ha='right')
plt.legend([])
# This cell is filtered to women, so the count is labelled "women" (it used to say "men").
print(grouped_data['user_id'].nunique(), "women")

# Snapshot for the combined men/women comparison cells below
woman_mode_data = grouped_data.copy()

In [None]:
# Give each gender's percentage its own column name so that the two frames
# can be stacked and later melted into a Gender / value layout.
man_mode_data = man_mode_data.rename(columns={"percentage": "Man"})
woman_mode_data = woman_mode_data.rename(columns={"percentage": "Woman"})

# Stack both frames and drop the per-user bookkeeping columns.
stacked_mode_data = pd.concat([man_mode_data, woman_mode_data])
combined_mode_data = stacked_mode_data.drop(['user_id', 'count'], axis='columns')

combined_mode_data

In [None]:
# Reshape to long format: one row per (mode, gender) value.
combined_mode_data = combined_mode_data.melt(
    id_vars=['Mode_confirm'],
    value_vars=['Man', 'Woman'],
    var_name='Gender',
)

In [None]:
# Give the melted value column a readable name, then preview the first rows.
combined_mode_data = combined_mode_data.rename({"value": "Percentage"}, axis='columns')
combined_mode_data.head()

In [None]:
# Side-by-side boxes per mode, one box per gender.
sns.boxplot(
    data=combined_mode_data,
    x='Mode_confirm',
    y='Percentage',
    hue='Gender',
)

# combined_mode_data.boxplot(by='Mode_confirm')