This notebook takes Citi Bike trip data and builds either a directed or an undirected station network for a specific timeframe. The resulting dataframe lists the source station, the target station, and the number of trips between those two stations.

In [3]:
# standard library
import datetime

# third-party
import pandas as pd

# let pandas display up to 10,000 rows and 10,000 columns before truncating output
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)

In [2]:
# load the Citi Bike trip records into a dataframe
# (this reads the 2018 file; for a 2017 timeframe point the path at
#  '~/Desktop/Pratt/fall2018/ad_data_vis/analysis/tripdata_2017.csv' instead)
trips = pd.read_csv(
    filepath_or_buffer='~/Desktop/Pratt/fall2018/ad_data_vis/analysis/tripdata_2018.csv'
)

In [21]:
# preview the first five raw trip records
trips.head(n=5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,970,2018-01-01 13:50:57.4340,2018-01-01 14:07:08.1860,72.0,W 52 St & 11 Ave,40.767272,-73.993929,505.0,6 Ave & W 33 St,40.749013,-73.988484,31956,Subscriber,1992,1
1,723,2018-01-01 15:33:30.1820,2018-01-01 15:45:33.3410,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3255.0,8 Ave & W 31 St,40.750585,-73.994685,32536,Subscriber,1969,1
2,496,2018-01-01 15:39:18.3370,2018-01-01 15:47:35.1720,72.0,W 52 St & 11 Ave,40.767272,-73.993929,525.0,W 34 St & 11 Ave,40.755942,-74.002116,16069,Subscriber,1956,1
3,306,2018-01-01 15:40:13.3720,2018-01-01 15:45:20.1910,72.0,W 52 St & 11 Ave,40.767272,-73.993929,447.0,8 Ave & W 52 St,40.763707,-73.985162,31781,Subscriber,1974,1
4,306,2018-01-01 18:14:51.5680,2018-01-01 18:19:57.6420,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3356.0,Amsterdam Ave & W 66 St,40.774667,-73.984706,30319,Subscriber,1992,1


In [22]:
# parse the trip start timestamps into real datetimes
trips['starttime'] = pd.to_datetime(trips['starttime'])

# station ids are read as floats; replace missing ids with 0 and store them as integers
for id_col in ('start station id', 'end station id'):
    trips[id_col] = trips[id_col].fillna(0).astype(int)

In [23]:
# preview the first five records after the dtype conversion
trips.head(n=5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,970,2018-01-01 13:50:57.434,2018-01-01 14:07:08.1860,72,W 52 St & 11 Ave,40.767272,-73.993929,505,6 Ave & W 33 St,40.749013,-73.988484,31956,Subscriber,1992,1
1,723,2018-01-01 15:33:30.182,2018-01-01 15:45:33.3410,72,W 52 St & 11 Ave,40.767272,-73.993929,3255,8 Ave & W 31 St,40.750585,-73.994685,32536,Subscriber,1969,1
2,496,2018-01-01 15:39:18.337,2018-01-01 15:47:35.1720,72,W 52 St & 11 Ave,40.767272,-73.993929,525,W 34 St & 11 Ave,40.755942,-74.002116,16069,Subscriber,1956,1
3,306,2018-01-01 15:40:13.372,2018-01-01 15:45:20.1910,72,W 52 St & 11 Ave,40.767272,-73.993929,447,8 Ave & W 52 St,40.763707,-73.985162,31781,Subscriber,1974,1
4,306,2018-01-01 18:14:51.568,2018-01-01 18:19:57.6420,72,W 52 St & 11 Ave,40.767272,-73.993929,3356,Amsterdam Ave & W 66 St,40.774667,-73.984706,30319,Subscriber,1992,1


In [24]:
# timeframe bounds, written as MM-DD-YYYY strings
start_date, end_date = '09-11-2018', '10-30-2018'

In [25]:
# keep only the trips whose start time lies between start_date and end_date
trips_df = trips.loc[
    (trips['starttime'] >= start_date) & (trips['starttime'] <= end_date)
]

In [26]:
# directed edgelist: one row per (start station, end station) pair,
# where 'bikeid' holds the number of trips made in that direction
stations = (
    trips_df
    .groupby(['start station id', 'end station id'])['bikeid']
    .count()
    .reset_index()
)
stations.head()

Unnamed: 0,start station id,end station id,bikeid
0,0,0,633
1,72,72,135
2,72,79,12
3,72,82,1
4,72,127,49


In [27]:
# undirected edgelist: A->B and B->A are folded into a single edge

# 'source' is the smaller of the two station ids (stored as a string)
stations['source'] = (
    stations[['start station id', 'end station id']]
    .min(axis=1)
    .astype(str)
)

# 'target' is the larger of the two station ids (stored as a string)
stations['target'] = (
    stations[['start station id', 'end station id']]
    .max(axis=1)
    .astype(str)
)

# add up the directed trip counts that share the same (source, target) pair
undir_edgelist = (
    stations
    .groupby(['source', 'target'])['bikeid']
    .sum()
    .reset_index()
)
undir_edgelist.head()

Unnamed: 0,source,target,bikeid
0,0,0,633
1,119,119,11
2,119,120,4
3,119,128,1
4,119,143,1


In [28]:
# the summed trip count is the edge weight, so rename 'bikeid' to 'weight'
# (index=str also turns the row labels into strings)
undir_edgelist = undir_edgelist.rename(
    index=str,
    columns={'bikeid': 'weight'},
)

# show the five heaviest edges
undir_edgelist.sort_values('weight', ascending=False).head()

Unnamed: 0,source,target,weight
111497,460,3093,1729
27696,293,445,1573
8167,2006,3282,1542
8081,2006,2006,1498
88580,363,3002,1478


In [29]:
# write the edgelist to a csv named after the timeframe (row index omitted)
undir_edgelist.to_csv(
    path_or_buf=''.join(['station_network_', start_date, '-', end_date, '.csv']),
    index=False,
)

In [14]:
# total number of rides represented in the edgelist (sum of all edge weights)
undir_edgelist['weight'].sum()

4337683

In [17]:
# station network function

def create_network(start_date, end_date, trips=None,
                   csv_path='~/Desktop/Pratt/fall2018/ad_data_vis/analysis/tripdata_2018.csv',
                   weight_col='count'):
    """Build, export and return the undirected station network for a timeframe.

    Trips whose start time lies between ``start_date`` and ``end_date`` are
    counted per (start station, end station) pair. Opposite directions are then
    merged into one undirected edge whose endpoints are the smaller station id
    ('source') and the larger station id ('target'), both stored as strings.
    The edgelist is written to
    ``'station_network_<start_date>-<end_date>.csv'`` in the current working
    directory and is also returned.

    Parameters
    ----------
    start_date, end_date : str
        Bounds of the timeframe, e.g. ``'06-11-2018'``. They are compared
        directly against the parsed ``starttime`` column and are also embedded
        verbatim in the output filename. A date-only ``end_date`` is compared
        as midnight at the start of that day, so trips starting later on
        ``end_date`` itself are not included.
    trips : pandas.DataFrame, optional
        Already-loaded trip data containing the columns ``'starttime'``,
        ``'start station id'``, ``'end station id'`` and ``'bikeid'``. When
        given, no file is read and the frame is not modified. When omitted,
        the data is read from ``csv_path``.
    csv_path : str, optional
        Trip data CSV to read when ``trips`` is not supplied.
    weight_col : str, optional
        Name of the trip-count column in the result (default ``'count'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'source'``, ``'target'`` and ``weight_col``, one row per
        undirected station pair.
    """
    needed_cols = ['starttime', 'start station id', 'end station id', 'bikeid']

    if trips is None:
        # only the four columns used below are parsed, which keeps the load lighter
        trips = pd.read_csv(csv_path, usecols=needed_cols)
    else:
        # work on a private copy so the caller's dataframe is left untouched
        trips = trips[needed_cols].copy()

    # change date and station number data types
    trips['starttime'] = pd.to_datetime(trips['starttime'])
    for id_col in ('start station id', 'end station id'):
        # NOTE: missing station ids become 0, so such trips show up in the
        # network as a pseudo-station '0'
        trips[id_col] = trips[id_col].fillna(0).astype(int)

    # filter trip data by desired start and end dates
    in_timeframe = (trips['starttime'] >= start_date) & (trips['starttime'] <= end_date)
    trips_df = trips[in_timeframe]

    # directed edgelist: number of trips per (start station, end station) pair
    stations = (
        trips_df
        .groupby(['start station id', 'end station id'])['bikeid']
        .count()
        .reset_index()
    )

    # undirected edgelist: order each pair as (min id, max id) so that both
    # directions of travel land on the same edge
    endpoints = stations[['start station id', 'end station id']]
    stations['source'] = endpoints.min(axis=1).astype(str)
    stations['target'] = endpoints.max(axis=1).astype(str)

    # sum the directed counts per undirected pair and name the weight column
    undir_edgelist = (
        stations
        .groupby(['source', 'target'])['bikeid']
        .sum()
        .reset_index()
        .rename(columns={'bikeid': weight_col})
    )

    # export edgelist to csv
    undir_edgelist.to_csv('station_network_' + start_date + '-' + end_date + '.csv', index=False)

    return undir_edgelist

In [18]:
# build and export the station network for 11 June - 19 August 2018
create_network(start_date='06-11-2018', end_date='08-19-2018')
