This notebook generates a station-to-station adjacency matrix for a chosen variable (displacement, elevation change, or trip count).

In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from functools import reduce
from geopy import Point
from geopy import distance

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Directories, relative to the notebook.
TRIPS_DIR = '../data/raw'
INPUT_DIR = '../data/exports'
OUTPUT_DIR = '../data/exports/adjacency_matrix'
SHAPEFILE_DIR = '../data/shapefiles/zipcodes'

# Which quantity the adjacency matrix holds. Supported values:
#   'displacement', 'elevation', 'trip_count',
#   'trip_count_classic', 'trip_count_electric'
# TODO(cpcarey): Convert to enum.
VARIABLE = 'trip_count_electric'

# Date strings used as file-name prefixes when building trip CSV paths.
TRIP_DATES = [
    '202007', '202008', '202009', '202010', '202011', '202012',
    '202101', '202102',
]

In [3]:
class AnalysisConfig:
    """Per-city settings describing where station and trip CSVs live.

    Attributes:
        csv_output_path: Path the adjacency-matrix CSV is written to.
        stations_path: Path of the stations CSV; it must contain a
            'station_id' column.
        trips_path_suffix: Suffix appended to each TRIP_DATES entry to form
            a trip CSV file name inside TRIPS_DIR.
        station_ids: Cached set of station IDs (strings), or None until
            get_station_ids() has been called.
    """

    def __init__(self,
                 csv_output_path='',
                 stations_path='',
                 trips_path_suffix=''):
        self.csv_output_path = csv_output_path
        self.stations_path = stations_path
        self.trips_path_suffix = trips_path_suffix
        self.station_ids = None

    def get_station_ids(self):
        """Returns the set of station IDs (as strings) from the stations CSV.

        The CSV is read on the first call only; later calls return the
        cached set.
        """
        # Identity check: None is the "not loaded yet" sentinel, and an
        # empty set is a valid cached value.
        if self.station_ids is None:
            self.station_ids = set(
                pd.read_csv(self.stations_path)['station_id'].astype(str))
        return self.station_ids

    def get_trips_dfs(self):
        """Loads one trips DataFrame per entry in TRIP_DATES.

        Returns:
            A list of DataFrames, in TRIP_DATES order, whose
            'start_station_id' and 'end_station_id' columns hold strings,
            with missing values left as NaN.
        """
        # Parse the ID columns as strings at read time. Letting pandas infer
        # the dtype and converting afterwards turns a numeric ID column that
        # has missing values into floats ('31100' -> '31100.0') and turns
        # the missing values themselves into the literal string 'nan', so
        # they neither match station IDs nor get removed by dropna().
        id_dtypes = {'start_station_id': str, 'end_station_id': str}
        trips_paths = [
            '{}/{}{}'.format(TRIPS_DIR, date, self.trips_path_suffix)
            for date in TRIP_DATES
        ]
        return [pd.read_csv(path, dtype=id_dtypes) for path in trips_paths]

In [4]:
# San Francisco (Bay Wheels) settings.
config_sf = AnalysisConfig(
    csv_output_path=f'{OUTPUT_DIR}/{VARIABLE}_sf.csv',
    stations_path=f'{INPUT_DIR}/SF_ele_single station.csv',
    trips_path_suffix='-baywheels-tripdata.csv',
)

# Washington, DC (Capital Bikeshare) settings.
config_dc = AnalysisConfig(
    csv_output_path=f'{OUTPUT_DIR}/{VARIABLE}_dc.csv',
    stations_path=f'{INPUT_DIR}/DC_ele_single station.csv',
    trips_path_suffix='-capitalbikeshare-tripdata.csv',
)

In [5]:
# Select which city's configuration the remaining cells use.
config = config_dc

In [6]:
def clean_trips(df, config):
    """Keeps only trips whose start and end stations are both known.

    Rows with a missing start or end station ID are removed first; the
    remainder is then restricted to rows where both IDs appear in
    config.get_station_ids(). The input frame is not modified.
    """
    known_ids = config.get_station_ids()

    # Remove rows lacking either station ID.
    with_ids = df.dropna(subset=['start_station_id', 'end_station_id'])

    # Keep rows where both endpoints are recognised stations.
    start_known = with_ids['start_station_id'].isin(known_ids)
    end_known = with_ids['end_station_id'].isin(known_ids)
    return with_ids[start_known & end_known]

In [7]:
if 'trip_count' in VARIABLE:
    # Trip data is only needed for the trip-count variables: clean each
    # loaded trips frame, then stack them into a single frame.
    trips_dfs = [
        clean_trips(monthly_trips, config)
        for monthly_trips in config.get_trips_dfs()
    ]
    all_trips_df = pd.concat(trips_dfs, ignore_index=True)

In [8]:
if 'trip_count' in VARIABLE:
    # Start from every cleaned trip and narrow to a single bike type when
    # the variable name asks for one.
    grouping_df = all_trips_df
    if 'classic' in VARIABLE:
        grouping_df = grouping_df[
            grouping_df['rideable_type'].eq('classic_bike')]
    if 'electric' in VARIABLE:
        grouping_df = grouping_df[
            grouping_df['rideable_type'].eq('electric_bike')]

    # Count rides for each (start station, end station) pair.
    all_trips_counts = (
        grouping_df
        .groupby(['start_station_id', 'end_station_id'])['ride_id']
        .count()
        .reset_index(name='trip_count')
    )

    display(all_trips_counts)

Unnamed: 0,start_station_id,end_station_id,trip_count
0,31100,31124,2
1,31100,31125,1
2,31100,31203,1
3,31100,31218,1
4,31100,31229,1
...,...,...,...
5370,31820,31801,1
5371,31820,31820,6
5372,31824,31815,1
5373,31826,31613,1


In [9]:
def get_distance(point1, point2):
    """Returns the geodesic distance between two points, in meters."""
    separation = distance.geodesic(point1, point2)
    return separation.m


def get_point(station_id):
    """Builds a Point from the 'lat' and 'lng' of a station in stations_df."""
    station = stations_df.loc[station_id]
    return Point(station['lat'], station['lng'])


def get_displacement(station_id1, station_id2):
    """Returns the distance between two stations, per get_distance()."""
    origin = get_point(station_id1)
    destination = get_point(station_id2)
    return get_distance(origin, destination)

def get_elevation_change(station_id1, station_id2):
    """Returns the elevation of station_id2 minus that of station_id1."""
    end_elevation = stations_df.loc[station_id2]['elevation']
    start_elevation = stations_df.loc[station_id1]['elevation']
    return end_elevation - start_elevation

def get_gradient(station_id1, station_id2):
    """Returns elevation change divided by displacement for a station pair.

    NOTE(review): there is no guard for a zero displacement (e.g. the same
    station passed twice) -- confirm the desired result for that case.
    """
    rise = get_elevation_change(station_id1, station_id2)
    run = get_displacement(station_id1, station_id2)
    return rise / run

def get_trip_count(station_id1, station_id2):
    """Returns the number of trips from station_id1 to station_id2.

    The pair is looked up in the module-level all_trips_counts frame. When
    no row exists for the pair, 0 is returned.
    """
    counts = all_trips_counts
    # Select the count by column name rather than by position, so the
    # result stays correct if other columns are ever added to the frame.
    matches = counts.loc[(counts['start_station_id'] == station_id1) &
                         (counts['end_station_id'] == station_id2),
                         'trip_count']
    if matches.empty:
        return 0
    return matches.iloc[0]

In [10]:
# Load the selected city's stations, indexed by station ID as a string so
# lookups line up with the string IDs used for trips.
stations_df = pd.read_csv(config.stations_path).set_index('station_id')
stations_df.index = stations_df.index.astype(str)
stations_df.head()

Unnamed: 0_level_0,lat,lng,elevation
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31267,38.908147,-77.03837,22
31298,38.905414,-77.034643,22
31250,38.903552,-77.044822,15
31224,38.903723,-77.04244,17
31221,38.905062,-77.041768,17


In [11]:
# Pick the function that yields the matrix entry for a (row station,
# column station) pair. An unrecognised VARIABLE is rejected here; without
# this check the matrix would stay entirely NaN and still be written out.
if VARIABLE == 'displacement':
    pair_value_fn = get_displacement
elif VARIABLE == 'elevation':
    pair_value_fn = get_elevation_change
elif 'trip_count' in VARIABLE:
    pair_value_fn = get_trip_count
else:
    raise ValueError('Unsupported VARIABLE: {!r}'.format(VARIABLE))

# Square frame with stations on both axes; each cell is filled by calling
# pair_value_fn(row station ID, column station ID).
adj_matrix = pd.DataFrame(index=stations_df.index, columns=stations_df.index)
adj_matrix = adj_matrix.apply(
    lambda row: row.index.to_series().apply(
        lambda col_name: pair_value_fn(row.name, col_name)),
    axis=1)

adj_matrix.head()

station_id,31267,31298,31250,31224,31221,31285,31213,31212,31239,31257,...,31823,31813,31818,31811,31806,31817,31816,31824,31800,31261
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31267,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31298,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
31250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31224,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31221,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Write the adjacency matrix to the output path of the selected config.
adj_matrix.to_csv(config.csv_output_path)

In [13]:
# Read the file back and show its first rows to check what was written.
csv_adj_matrix = pd.read_csv(config.csv_output_path)
csv_adj_matrix = csv_adj_matrix.set_index('station_id')
display(csv_adj_matrix.head())

Unnamed: 0_level_0,31267,31298,31250,31224,31221,31285,31213,31212,31239,31257,...,31823,31813,31818,31811,31806,31817,31816,31824,31800,31261
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31267,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31298,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
31250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31224,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31221,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
