In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import folium.plugins
from folium import Figure
from folium.plugins import HeatMapWithTime
from datetime import datetime, timedelta

# For real distance
import osmnx as ox
import networkx as nx

import src.cleaning as cleaning
import src.visualizations as vis
import src.your_trip as trip

In [5]:
# Sample month used to test the cleaning functions: the raw 2014-04 Citi Bike trip file.
# NOTE(review): the path points to a Desktop folder outside the repository, so this
# cell only runs on a machine with the same directory layout.
apr_14 = pd.read_csv('../../../Desktop/final_project_data/2014/2014-04 - Citi Bike trip data.csv')

# 1. CLEANING PROCESS

# Rename columns

In [6]:
# Rename the raw columns with the project helper; the result replaces apr_14.
apr_14 = cleaning.rename_columns(apr_14)

# Straight-line distance

In [7]:
# Add the straight-line trip distance named in the heading above; the result replaces apr_14.
apr_14 = cleaning.trip_distance(apr_14)

# Hour: start_hour & end_hour (same function, 2 columns)

- Integer value for the hour, e.g. 12, 18.

In [8]:
# Add the start_hour / end_hour columns described above; the result replaces apr_14.
apr_14 = cleaning.get_hour (apr_14)

# Date: trip_date
- Date in time-series (datetime) format: year, month and day only, without hour, minute or second.

In [9]:
# Add the trip_date column described above; the result replaces apr_14.
apr_14 = cleaning.get_date (apr_14)

# Formatting: started_at and ended_at in datetime format

In [10]:
# Put started_at and ended_at in datetime format, as stated in the heading above.
apr_14 = cleaning.datetime_format (apr_14)

#  Month, weekday, weekend

In [11]:
# Add the month, weekday and weekend columns named in the heading above.
apr_14 = cleaning.get_categorical_date (apr_14)

# Cleaned and enriched dataframe to CSV

In [12]:
# Persist the cleaned and enriched frame; index=False keeps the row index out of the file.
apr_14.to_csv('data/april_2014.csv', index=False)

In [13]:
# Reload the saved CSV under a second name (april_14); apr_14 still holds the in-memory frame.
# NOTE(review): read_csv is called without parse_dates, so any datetime columns come
# back as plain strings in april_14 — confirm the downstream helpers expect that.
april_14 = pd.read_csv('data/april_2014.csv')

# Open Street Map: real_distance
### Functions to split the dataframe into sub-dataframes and save them as CSV files.

In [14]:
# Done in Google Colab:

# cleaning.dataframe_split (df, n): Splits the dataframe in n subdataframes to make it more processable.
# cleaning.get_real_distance (df): uses osmnx and networkx to compute the shortest available path distance.

# rideable_type

In [15]:
# Infer from 2022 datasets.

# trip_cost ()

You need to determine the bike type (rideable_type) first!

- Float dollar ($) value computed as a function of rideable_type, duration and member_casual.

Subscriber
- Classic.
If trip_duration <= 45*60, cost = 0
Elif trip_duration > 45*60, cost = (trip_duration - 45 * 60) / 60 * 0.17

- Electric
If trip_duration <= 45*60: cost = trip_duration / 60 * 0.17, capped at 3.
If trip_duration > 45*60: cost = 3 + (trip_duration - 45 * 60) / 60 * 0.17

Casual
- Single trip 
    - Casual & - Electric
    If trip_duration <= 30*60 -> 4.49
    Elif trip_duration > 30 * 60 -> 4.49 + (trip_duration - 30 * 60) / 60 * 0.26

- Day Pass
    - Casual 
    - Electric

In [16]:
# need to know rideable type.

# Station non-bike trips balance

## Bike Route (not by human usage):

The `all_bikes_journey` function includes the bike-route function.

In [17]:
# Build the journey list for all bikes from the reloaded April frame; it is passed to
# cleaning.transportations in the transfer-dictionary cell.
all_bikes_journey_list = cleaning.all_bikes_journey (april_14)

## Non user trip mobility dictionary and information:

### Three functions that build and enrich a dictionary containing all the information about bikes moved from one station to another by truck.

In [18]:
# Three-step pipeline; each call takes the dictionary from the previous step and returns it enriched.
# NOTE(review): this step reads apr_14 (the in-memory frame) while the neighbouring
# cells read april_14 (the CSV reload) — confirm the mix is intentional.
stations_transfers_dictionary = cleaning.non_trip_mobility_dict (apr_14)
# Combine the dictionary with the per-bike journey list.
stations_transfers_dictionary = cleaning.transportations (stations_transfers_dictionary, all_bikes_journey_list)
# Presumably adds a per-station balance (inferred from the helper name) — TODO confirm.
stations_transfers_dictionary = cleaning.station_balance (stations_transfers_dictionary)

# Truck trips

Record all bike movements not attributable to user trips.
- Bike ID
- Date range
- Transported from
- Transported to

Sort by date, group by station.

### The all-transfers function relies on the single-bike truck-transfers function:

In [20]:
# Collect the truck transfers (bike movements not attributable to user trips) from the reloaded April frame.
total_transfers = cleaning.all_transfers (april_14)

In [23]:
# Presumably converts the transfer table's date columns to datetime (inferred from the
# helper name) — TODO confirm. The result replaces total_transfers.
total_transfers = cleaning.datetime_format_trucks (total_transfers)

In [27]:
# Save the transfer table without the row index; the "Your Trip" section reloads it as `trucks`.
total_transfers.to_csv('data/truck_transfers.csv', index=False)

# 2. Visualizations

In [30]:
# The functions are defined in the source folder (src) and deployed in the Streamlit app.

In [1]:
# Hourly distribution by day:
def hourly_dist(df, day):
    """Build an animated heat map of trip start locations, one frame per hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips table. Must contain the columns ``start_hour``, ``start_lat``,
        ``start_lng`` and ``weekday``.
    day
        Value matched against the ``weekday`` column (e.g. ``'Monday'``).

    Returns
    -------
    folium.Figure
        An 850x550 figure holding a map centred on New York City with a
        ``HeatMapWithTime`` layer of 24 frames (hours 0 to 23).
    """
    # Keep only the rows for the requested day and the columns the map needs.
    day_trips = df.loc[df['weekday'] == day, ['start_hour', 'start_lat', 'start_lng']]

    # One list of [lat, lng] pairs per hour of the day. Converting each hourly
    # slice in bulk avoids the per-row overhead of DataFrame.iterrows().
    # Hours without trips stay as empty lists so that the frame index keeps
    # matching the hour.
    lat_lng_list = [
        day_trips.loc[day_trips['start_hour'] == hour, ['start_lat', 'start_lng']]
        .to_numpy()
        .tolist()
        for hour in range(24)
    ]

    figure1 = Figure(width=850, height=550)
    new_york1 = folium.Map(location=[40.712776, -74.005974], zoom_start=12)

    figure1.add_child(new_york1)
    folium.TileLayer('cartodbpositron').add_to(new_york1)
    # Colour ramp for the heat layer: white -> light blue -> blue.
    gradient = {.33: 'white', .66: 'lightblue', 1: 'blue'}

    HeatMapWithTime(
        lat_lng_list,
        radius=5,
        auto_play=True,
        position='bottomright',
        gradient=gradient,
    ).add_to(new_york1)

    return figure1

In [None]:
# Render the hourly heat map for the rows whose weekday is 'Monday'; the returned figure is the cell output.
hourly_dist (april_14, 'Monday')

# 3. Your Trip

In [2]:
# Reload the two CSV files written in the cleaning section: truck transfers and the enriched April trips.
trucks = pd.read_csv('data/truck_transfers.csv')
april = pd.read_csv('data/april_2014.csv')

In [3]:
# Build the ALL_TRIPS table from the April trips with the project helper.
# NOTE(review): only `april` is passed in; `trucks` is loaded in the previous cell but not used here — confirm.
ALL_TRIPS = cleaning.concat_all_bike_trips (april)

  result_df = pd.DataFrame({'last_end': last_end, 'ended_at': etime, 'next_start': next_start, 'started_at': stime})


In [4]:
# Cache the combined table on disk (row index omitted) so it can be reloaded without recomputing it.
ALL_TRIPS.to_csv('data/all_trips.csv', index = False)

In [2]:
# Reload the cached table; this rebinds ALL_TRIPS to the CSV version.
ALL_TRIPS = pd.read_csv('data/all_trips.csv')

In [4]:
# Build the capacity dictionary; the displayed output maps station IDs to capacity values.
capacity_dict = cleaning.capacity_dictionary (ALL_TRIPS)

In [6]:
# Note: these capacities are approximations; they are precise only at a daily scale.
capacity_dict

{519.0: 50.0,
 521.0: 200.0,
 497.0: 40.0,
 293.0: 50.0,
 435.0: 30.0,
 285.0: 40.0,
 477.0: 50.0,
 426.0: 30.0,
 490.0: 70.0,
 402.0: 30.0,
 379.0: 60.0,
 492.0: 50.0,
 499.0: 40.0,
 151.0: 30.0,
 444.0: 40.0,
 368.0: 40.0,
 318.0: 30.0,
 382.0: 40.0,
 284.0: 40.0,
 358.0: 40.0,
 168.0: 40.0,
 504.0: 40.0,
 347.0: 30.0,
 523.0: 40.0,
 465.0: 20,
 462.0: 40.0,
 509.0: 40.0,
 488.0: 40.0,
 127.0: 40.0,
 457.0: 40.0,
 327.0: 40.0,
 445.0: 70.0,
 281.0: 30.0,
 540.0: 30.0,
 442.0: 20,
 483.0: 50.0,
 116.0: 30.0,
 528.0: 40.0,
 459.0: 30.0,
 387.0: 30.0,
 375.0: 30.0,
 348.0: 40.0,
 380.0: 40.0,
 446.0: 30.0,
 537.0: 30.0,
 472.0: 30.0,
 237.0: 40.0,
 294.0: 30.0,
 229.0: 30.0,
 417.0: 40.0,
 496.0: 30.0,
 453.0: 40.0,
 505.0: 30.0,
 161.0: 30.0,
 236.0: 50.0,
 432.0: 40.0,
 251.0: 30.0,
 404.0: 30.0,
 3002.0: 30.0,
 345.0: 30.0,
 493.0: 20,
 527.0: 50.0,
 529.0: 60.0,
 153.0: 20,
 518.0: 40.0,
 147.0: 30.0,
 503.0: 30.0,
 401.0: 50.0,
 514.0: 50.0,
 498.0: 20,
 461.0: 40.0,
 433.0: 50.0,


In [14]:
# Load time series for station 519; the result is indexed with [0] in the next cell.
station_519 = trip.station_load_time_series (ALL_TRIPS, 519)

In [19]:
# The first element of the station result carries the 'bikes_in_station' column; take it as a plain list.
bike_availability = station_519[0]['bikes_in_station'].to_list()
# Time axis exposed by the trip module; it is paired with the bike counts in the next cell.
time_range = trip.time_range

In [22]:
# Pair each time stamp with the bike count at that moment. The last element of
# both sequences is dropped ([:-1]).
definitive_df = pd.DataFrame({'time': time_range[:-1], 'bikes_available': bike_availability[:-1]})
# `infer_datetime_format` is deprecated since pandas 2.0, where format inference is
# the default behaviour, so the argument is omitted.
definitive_df['time'] = pd.to_datetime(definitive_df['time'])

definitive_df

Unnamed: 0,time,bikes_available
0,2014-04-01 00:00:00,0.0
1,2014-04-01 00:15:00,0.0
2,2014-04-01 00:30:00,0.0
3,2014-04-01 00:45:00,0.0
4,2014-04-01 01:00:00,0.0
...,...,...
2875,2014-04-30 22:45:00,0.0
2876,2014-04-30 23:00:00,0.0
2877,2014-04-30 23:15:00,0.0
2878,2014-04-30 23:30:00,0.0


In [25]:
# Area chart of the number of bikes parked in the station over time.
# No `color` column is given, so the chart has a single trace and only the first
# entry of `color_discrete_sequence` is used; a one-element list is enough.
# NOTE(review): the title ends in a literal "Station ID" — presumably the station
# number (519 in this notebook) was meant to appear there; confirm before changing it.
fig = px.area(
    definitive_df,
    x='time',
    y='bikes_available',
    labels={'time': 'Time', 'bikes_available': 'Number of Bikes parked in Station'},
    color_discrete_sequence=['darkblue'],
    title='Bike Availability in Station ID',
)
fig.show()