In [94]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import folium.plugins
from folium import Figure
from folium.plugins import HeatMapWithTime
from datetime import datetime, timedelta

# For real distance
import osmnx as ox
import networkx as nx

import src.cleaning as cleaning
import src.visualizations as vis

In [5]:
# Sample to test functions:
apr_14 = pd.read_csv('../../../Desktop/final_project_data/2014/2014-04 - Citi Bike trip data.csv')

# 1. CLEANING PROCESS

# Rename columns

In [6]:
apr_14 = cleaning.rename_columns(apr_14)

#  Straight line Distance

In [7]:
apr_14 = cleaning.trip_distance(apr_14)

# Hour: start_hour & end_hour (same function, 2 columns)

- Integer value for the hour, e.g. 12, 18.

In [8]:
apr_14 = cleaning.get_hour (apr_14)

# Date: trip_date
- time series date format for date, including year, month and day, and excluding hour, minute and second.

In [9]:
apr_14 = cleaning.get_date (apr_14)

# Formatting: started_at and ended_at in datetime format

In [10]:
apr_14 = cleaning.datetime_format (apr_14)

#  Month, weekday, weekend

In [11]:
apr_14 = cleaning.get_categorical_date (apr_14)

# Cleaned and enriched dataframe to CSV

In [12]:
apr_14.to_csv('data/april_2014.csv', index=False)

In [13]:
april_14 = pd.read_csv('data/april_2014.csv')

# Open Street Map: real_distance
### Function to create subdataframes and save them as csv.

In [14]:
# in Collab

# cleaning.dataframe_split (df, n): Splits the dataframe in n subdataframes to make it more processable.
# cleaning.get_real_distance (df): uses osmnx and networkx to compute the shortest available path distance.

# rideable_type

In [15]:
# Infer from 2022 datasets.

# trip_cost ()

¡You need to get bike type before!

- float $ value in function of rideable_type, duration and member_casual.

Subscriber
- Classic.
If trip_duration < 45*60, cost = 0
Elif trip_duration > 45*60, cost = (trip_duration - 45 * 60) / 60 * 0.17

- Electric
If trip_duration =< 45*60: cost = trip_duration / 60 * 0.17, limit 3.
If trip_duration > 45*60: cost = 3 + (trip_duration - 45 * 60) / 60 * 0.17

Casual
- Single trip 
    - Casual & - Electric
    If trip_duration <= 30*60 -> 4.49
    Elif trip_duration > 30 * 60 -> 4.49 + (trip_duration - 30 * 60) / 60 * 0.26

- Day Pass
    - Casual 
    - Electric

In [16]:
# need to know rideable type.

# Station non-bike trips balance

## Bike Route (not by human usage):

All bikes journey function includes bike route function.

In [17]:
all_bikes_journey_list = cleaning.all_bikes_journey (april_14)

## Non user trip mobility dictionary and information:

### Three functions that enrich a dictionary containing all info regarding mobility from one station to another through trucks.

In [18]:
stations_transfers_dictionary = cleaning.non_trip_mobility_dict (apr_14)
stations_transfers_dictionary = cleaning.transportations (stations_transfers_dictionary, all_bikes_journey_list)
stations_transfers_dictionary = cleaning.station_balance (stations_transfers_dictionary)

# Truck trips

Record all bike movements not attributable to user trips.
- Bike ID
- Date range
- Transported from
- Transported to

Sort by date, group by station.

### All transfers use single bike truck transfers function:

In [20]:
total_transfers = cleaning.all_transfers (april_14)

In [23]:
total_transfers = cleaning.datetime_format_trucks (total_transfers)

In [27]:
total_transfers.to_csv('data/truck_transfers.csv', index=False)

# 2. Visualizations

In [30]:
# Functions defined in source folder, deployed in streamlit app.

In [1]:
# Hourly distribution by day:
def hourly_dist (df, day):
    test_map_viz = df[['start_hour', 'started_at', 'bike_id', 'start_lat', 'start_lng', 'weekday']]
    test_map_viz = test_map_viz[test_map_viz['weekday'] == day]

    lat_lng_list = []
    for i in range(24):
        temp=[]
        for index, row in test_map_viz[test_map_viz['start_hour'] == i].iterrows():
            temp.append([row['start_lat'],row['start_lng']])
        lat_lng_list.append(temp)

    figure1 = Figure(width=850,height=550)
    new_york1 = folium.Map(location=[40.712776, -74.005974],zoom_start=12)

    figure1.add_child(new_york1)
    folium.TileLayer('cartodbpositron').add_to(new_york1)
    gradient = {.33: 'white', .66: 'lightblue', 1: 'blue'}

    HeatMapWithTime(lat_lng_list, radius=5, auto_play=True, position='bottomright', gradient=gradient).add_to(new_york1)

    return figure1

In [None]:
hourly_dist (april_14, 'Monday')

# 3. Your Trip

In [59]:
trucks = pd.read_csv('data/truck_transfers.csv')
april = pd.read_csv('data/april_2014.csv')

In [3]:
ALL_TRIPS = cleaning.concat_all_bike_trips (april)

  result_df = pd.DataFrame({'last_end': last_end, 'ended_at': etime, 'next_start': next_start, 'started_at': stime})


In [4]:
ALL_TRIPS.to_csv('data/all_trips.csv', index = False)

In [2]:
ALL_TRIPS = pd.read_csv('data/all_trips.csv')

In [35]:
ALL_TRIPS.head()

Unnamed: 0,last_end,ended_at,next_start,started_at,time_difference,bike_id
0,2008.0,2014-04-01 00:09:25.000,2008.0,2014-04-01 07:41:30.000,0 days 07:32:05,21062.0
1,224.0,2014-04-01 07:48:08.000,224.0,2014-04-01 08:54:32.000,0 days 01:06:24,21062.0
2,360.0,2014-04-01 08:58:20.000,360.0,2014-04-01 15:03:40.800,0 days 06:05:20.800000,21062.0
3,306.0,2014-04-02 09:19:43.200,306.0,2014-04-02 15:25:04.000,0 days 06:05:20.800000,21062.0
4,147.0,2014-04-02 15:32:24.000,147.0,2014-04-02 17:22:38.000,0 days 01:50:14,21062.0


In [50]:
def station_max_min_differences (df):
    # time to datetime format
    df = cleaning.time_difference (df)

    date_range = []
    for i in range(1,10):
        date_range.append(f'2014-04-0{i}')

    for i in range(10,31):
        date_range.append(f'2014-04-{i}')

    date_range.append('2014-05-01')
    
    difference_list = []
    difference_dict = {}

    for date in date_range:
        date_time_obj = datetime.strptime(date, '%Y-%m-%d')
        ends = df[df['ended_at'] <= date_time_obj].last_end.value_counts().to_frame().reset_index().rename(columns = {'index': 'end_id', 'last_end': 'counts'})
        starts = df[df['started_at'] <= date_time_obj].next_start.value_counts().to_frame().reset_index().rename(columns = {'index': 'start_id', 'next_start': 'counts'})

        # for index, row in ends.iterrows():

        try:
            difference_list.append(ends.iloc[0]['counts'] - starts.iloc[0]['counts'])
            difference_dict[df.iloc[0]['last_end']] = difference_list
        except:
            difference_list.append(0)

    return difference_dict

In [36]:
def capacity_dictionary (df):
    difference_dict = {}
    bench_date = '2014-05-01'
    datetime_obj_bench = datetime.strptime(bench_date, '%Y-%m-%d')

    ends = df[df['ended_at'] <= datetime_obj_bench].last_end.value_counts().to_frame().reset_index().rename(columns = {'index': 'id', 'last_end': 'counts'})
    starts = df[df['started_at'] <= datetime_obj_bench].next_start.value_counts().to_frame().reset_index().rename(columns = {'index': 'id', 'next_start': 'counts'})

    total = ends.merge(starts, how='outer', on='id')
    total['difference'] = total['counts_x'] - total['counts_y']

    for i, row in total.iterrows():
        difference_dict[row['id']] = []

    date_range = []
    for i in range(1,10):
        date_range.append(f'2014-04-0{i}')

    for i in range(10,31):
        date_range.append(f'2014-04-{i}')

    date_range.append('2014-05-01')
        
    for date in date_range:
        date_time_obj = datetime.strptime(date, '%Y-%m-%d')

        ends = df[df['ended_at'] <= date_time_obj].last_end.value_counts().to_frame().reset_index().rename(columns = {'index': 'id', 'last_end': 'counts'})
        starts = df[df['started_at'] <= date_time_obj].next_start.value_counts().to_frame().reset_index().rename(columns = {'index': 'id', 'next_start': 'counts'})

        total = ends.merge(starts, how='outer', on='id')
        total['difference'] = total['counts_x'] - total['counts_y']

        for i, row in total.iterrows():
            difference_dict[row['id']].append(row['difference'])

    return difference_dict

In [54]:
trying_dict = capacity_dictionary (ALL_TRIPS)

In [55]:
for key, value in trying_dict.items():
    trying_dict[key] = max(value)

In [90]:
trying_dict

{519.0: 41.0,
 521.0: 194.0,
 497.0: 40.0,
 293.0: 45.0,
 435.0: 30.0,
 285.0: 33.0,
 477.0: 47.0,
 426.0: 26.0,
 490.0: 68.0,
 402.0: 24.0,
 379.0: 56.0,
 492.0: 44.0,
 499.0: 31.0,
 151.0: 24.0,
 444.0: 40.0,
 368.0: 32.0,
 318.0: 28.0,
 382.0: 32.0,
 284.0: 36.0,
 358.0: 36.0,
 168.0: 33.0,
 504.0: 36.0,
 347.0: 23.0,
 523.0: 36.0,
 465.0: 10.0,
 462.0: 40.0,
 509.0: 33.0,
 488.0: 36.0,
 127.0: 32.0,
 457.0: 39.0,
 327.0: 35.0,
 445.0: 65.0,
 281.0: 23.0,
 540.0: 24.0,
 442.0: 13.0,
 483.0: 42.0,
 116.0: 28.0,
 528.0: 34.0,
 459.0: 27.0,
 387.0: 27.0,
 375.0: 21.0,
 348.0: 31.0,
 380.0: 36.0,
 446.0: 26.0,
 537.0: 25.0,
 472.0: 24.0,
 237.0: 35.0,
 294.0: 23.0,
 229.0: 23.0,
 417.0: 31.0,
 496.0: 24.0,
 453.0: 33.0,
 505.0: 29.0,
 161.0: 27.0,
 236.0: 45.0,
 432.0: 33.0,
 251.0: 24.0,
 404.0: 28.0,
 3002.0: 24.0,
 345.0: 29.0,
 493.0: 15.0,
 527.0: 42.0,
 529.0: 53.0,
 153.0: 17.0,
 518.0: 33.0,
 147.0: 25.0,
 503.0: 22.0,
 401.0: 42.0,
 514.0: 47.0,
 498.0: 18.0,
 461.0: 39.0,
 433

In [93]:
date_range = []
for i in range(1,10):
    for n in range(96):
        date_range.append(f'2014-04-0{i}')

for i in range(10,31):
    for n in range(96):
        date_range.append(f'2014-04-{i}')

date_range.append('2014-05-01')

date_range

['2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-01',
 '2014-04-

In [102]:
def datetime_range(start, end, delta):
    current = start
    while current < end:
        yield current
        current += delta

time_range = [dt.strftime('%Y-%m-%d T%H:%M') for dt in 
       datetime_range(datetime(2014, 4, 1, 0), datetime(2014, 5, 1, 0, 5), 
       timedelta(minutes=15))]

['2014-04-01 T00:00', '2014-04-01 T00:15', '2014-04-01 T00:30', '2014-04-01 T00:45', '2014-04-01 T01:00', '2014-04-01 T01:15', '2014-04-01 T01:30', '2014-04-01 T01:45', '2014-04-01 T02:00', '2014-04-01 T02:15', '2014-04-01 T02:30', '2014-04-01 T02:45', '2014-04-01 T03:00', '2014-04-01 T03:15', '2014-04-01 T03:30', '2014-04-01 T03:45', '2014-04-01 T04:00', '2014-04-01 T04:15', '2014-04-01 T04:30', '2014-04-01 T04:45', '2014-04-01 T05:00', '2014-04-01 T05:15', '2014-04-01 T05:30', '2014-04-01 T05:45', '2014-04-01 T06:00', '2014-04-01 T06:15', '2014-04-01 T06:30', '2014-04-01 T06:45', '2014-04-01 T07:00', '2014-04-01 T07:15', '2014-04-01 T07:30', '2014-04-01 T07:45', '2014-04-01 T08:00', '2014-04-01 T08:15', '2014-04-01 T08:30', '2014-04-01 T08:45', '2014-04-01 T09:00', '2014-04-01 T09:15', '2014-04-01 T09:30', '2014-04-01 T09:45', '2014-04-01 T10:00', '2014-04-01 T10:15', '2014-04-01 T10:30', '2014-04-01 T10:45', '2014-04-01 T11:00', '2014-04-01 T11:15', '2014-04-01 T11:30', '2014-04-01 