This notebook generates an adjacency matrix between stations for a selected variable (displacement, elevation change, or trip count).

In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from functools import reduce
from geopy import Point
from geopy import distance

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Relative paths to the data directories used by this notebook.
TRIPS_DIR = '../data/raw'
SHAPEFILE_DIR = '../data/shapefiles/zipcodes'
INPUT_DIR = '../data/exports'
OUTPUT_DIR = '../data/exports/adjacency_matrix'

# Quantity stored in each matrix cell; also used in the output file name.
VARIABLE = 'trip_count'

# Trip-file date prefixes (YYYYMM): July 2020 through February 2021.
TRIP_DATES = (['2020{:02d}'.format(month) for month in range(7, 13)] +
              ['2021{:02d}'.format(month) for month in range(1, 3)])

In [3]:
class AnalysisConfig:
    """File locations for one city's analysis, plus loaders for its data.

    Attributes:
        csv_output_path: Destination path for the CSV output.
        stations_path: Path of the stations CSV; it must contain a
            'station_id' column.
        trips_path_suffix: Suffix appended to each trip date to form a trips
            CSV file name.
        station_ids: Cached set of station IDs, or None until
            get_station_ids() is first called.
    """

    def __init__(self,
                 csv_output_path='',
                 stations_path='',
                 trips_path_suffix=''):
        self.csv_output_path = csv_output_path
        self.stations_path = stations_path
        self.trips_path_suffix = trips_path_suffix
        self.station_ids = None

    def get_station_ids(self):
        """Returns the set of station IDs listed in the stations CSV.

        The CSV is read on the first call only; later calls return the
        cached value.
        """
        # Identity check: `== None` is evaluated element-wise by pandas and
        # numpy objects and would make this condition ambiguous for them.
        if self.station_ids is None:
            self.station_ids = set(
                pd.read_csv(self.stations_path)['station_id'])
        return self.station_ids

    def get_trips_dfs(self):
        """Reads one trips CSV per entry of TRIP_DATES.

        Paths are built as '<TRIPS_DIR>/<date><trips_path_suffix>' from the
        module-level TRIPS_DIR and TRIP_DATES.

        Returns:
            A list of DataFrames, in the same order as TRIP_DATES.
        """
        trips_paths = [
            '{}/{}{}'.format(TRIPS_DIR, date, self.trips_path_suffix)
            for date in TRIP_DATES
        ]
        return [pd.read_csv(path) for path in trips_paths]

In [4]:
# San Francisco: stations export, trip-file suffix, and output CSV location.
config_sf = AnalysisConfig(
    stations_path='{}/SF_ele_single station.csv'.format(INPUT_DIR),
    trips_path_suffix='-baywheels-tripdata.csv',
    csv_output_path='{}/{}_sf.csv'.format(OUTPUT_DIR, VARIABLE),
)

# Washington, DC: same layout with the DC-specific file names.
config_dc = AnalysisConfig(
    stations_path='{}/DC_ele_single station.csv'.format(INPUT_DIR),
    trips_path_suffix='-capitalbikeshare-tripdata.csv',
    csv_output_path='{}/{}_dc.csv'.format(OUTPUT_DIR, VARIABLE),
)

In [5]:
# Configuration that later cells read through the name `config`
# (one of config_sf / config_dc defined above).
config = config_sf

In [6]:
def clean_trips(df, config):
    """Keeps only trips whose start and end stations are both known.

    Args:
        df: Trips DataFrame with 'start_station_id' and 'end_station_id'
            columns.
        config: Object whose get_station_ids() returns the valid station IDs.

    Returns:
        A new DataFrame without the rows where either station ID is missing
        or is not among the valid IDs. The original row index is kept.
    """
    known_ids = config.get_station_ids()

    # Rows lacking either endpoint go first, then both endpoints are checked
    # against the known IDs with a single combined mask.
    with_both_ids = df.dropna(subset=['start_station_id', 'end_station_id'])
    start_known = with_both_ids['start_station_id'].isin(known_ids)
    end_known = with_both_ids['end_station_id'].isin(known_ids)
    return with_both_ids[start_known & end_known]

In [7]:
# Load every monthly trips file for the selected configuration and keep only
# trips between its known stations. The trips must come from the same
# `config` that clean_trips filters against, not from a hard-coded city.
trips_dfs = [clean_trips(df, config) for df in config.get_trips_dfs()]
all_trips_df = pd.concat(trips_dfs, ignore_index=True)

In [8]:
# Count the non-null ride_id values for every (start, end) station pair and
# expose the result as a flat table with a 'trip_count' column.
all_trips_counts = (
    all_trips_df
    .groupby(['start_station_id', 'end_station_id'])['ride_id']
    .count()
    .rename('trip_count')
    .reset_index()
)
display(all_trips_counts)

Unnamed: 0,start_station_id,end_station_id,trip_count
0,SF I29-1,SF I29-1,79
1,SF I29-1,SF-A20,6
2,SF I29-1,SF-A27,16
3,SF I29-1,SF-B19,3
4,SF I29-1,SF-B22,5
...,...,...,...
41830,SF-Z7,SF-W19,3
41831,SF-Z7,SF-W8,3
41832,SF-Z7,SF-X14,1
41833,SF-Z7,SF-Z6,21


In [9]:
def get_distance(point1, point2):
    """Returns the geodesic distance between two points, in meters."""
    geodesic_distance = distance.geodesic(point1, point2)
    return geodesic_distance.meters


def get_point(station_id):
    """Builds a Point from the station's 'lat' and 'lng' in stations_df."""
    station = stations_df.loc[station_id]
    return Point(station['lat'], station['lng'])


def get_displacement(station_id1, station_id2):
    """Returns the get_distance() result between the two stations' points."""
    origin = get_point(station_id1)
    destination = get_point(station_id2)
    return get_distance(origin, destination)

def get_elevation_change(station_id1, station_id2):
    """Returns station_id2's elevation minus station_id1's elevation.

    The result is positive when station_id2 has the larger elevation value.
    """
    end_elevation = stations_df.loc[station_id2]['elevation']
    start_elevation = stations_df.loc[station_id1]['elevation']
    return end_elevation - start_elevation

def get_trip_count(station_id1, station_id2):
    """Returns the count recorded for trips from station_id1 to station_id2.

    Looks the pair up in all_trips_counts; returns 0 when the pair is absent.
    """
    starts_match = all_trips_counts['start_station_id'] == station_id1
    ends_match = all_trips_counts['end_station_id'] == station_id2
    matches = all_trips_counts[starts_match & ends_match]
    if len(matches):
        # The count is read positionally: last column of the first match.
        return matches.iloc[:, -1].values[0]
    return 0

In [10]:
# Load the selected configuration's stations, indexed by station ID so the
# helper functions can look stations up with .loc.
stations_df = pd.read_csv(config.stations_path).set_index('station_id')
stations_df.head()

Unnamed: 0_level_0,lat,lng,elevation
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SF-M26,37.767088,-122.40738,12
SF-M25,37.7671,-122.410662,15
SF-M26-2,37.768568,-122.404238,4
SF-L27,37.771075,-122.402732,4
SF-K26,37.771461,-122.405788,4


In [11]:
# Map each supported VARIABLE to the function that computes one matrix cell
# from (row station ID, column station ID).
cell_functions = {
    'displacement': get_displacement,
    'elevation': get_elevation_change,
    'trip_count': get_trip_count,
}

# Fail loudly on an unsupported VARIABLE: otherwise the matrix would stay
# entirely empty (NaN) and still be written to the output CSV.
if VARIABLE not in cell_functions:
    raise ValueError('Unsupported VARIABLE {!r}; expected one of {}'.format(
        VARIABLE, sorted(cell_functions)))
compute_cell = cell_functions[VARIABLE]

# Start from an empty station-by-station frame, then fill each cell from its
# row label (first station argument) and column label (second argument).
adj_matrix = pd.DataFrame(index=stations_df.index, columns=stations_df.index)
adj_matrix = adj_matrix.apply(
    lambda row: row.index.to_series().apply(
        lambda col_name: compute_cell(row.name, col_name)),
    axis=1)

adj_matrix.head()

station_id,SF-M26,SF-M25,SF-M26-2,SF-L27,SF-K26,SF-J27,SF-M30,SF-L27-2,SF-J29-3,SF-J29,...,SF-Z7,SF-Z6,SF-W8,SF-BB17,SF-AA15,SF-Z16,SF-Y14,SF-X14,SF-X19,SF-Y25
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SF-M26,42,3,8,2,4,1,5,4,1,23,...,0,0,0,0,0,0,0,0,0,0
SF-M25,3,44,13,8,30,4,14,1,7,16,...,0,0,0,0,0,1,2,0,1,1
SF-M26-2,9,17,154,26,11,4,7,10,7,82,...,0,0,0,0,1,0,0,0,0,1
SF-L27,0,7,4,59,9,7,32,5,10,85,...,0,0,0,0,0,0,0,0,0,0
SF-K26,9,34,19,7,77,11,17,3,18,46,...,0,0,0,0,1,0,0,0,0,1


In [12]:
# Write the matrix to the selected configuration's CSV path. The station_id
# index is written too (to_csv default), which the read-back below relies on.
adj_matrix.to_csv(config.csv_output_path)

In [13]:
# Example: read the saved matrix back and look up a single value. The matrix
# holds whichever quantity VARIABLE selected when it was generated.
csv_adj_matrix = pd.read_csv(config.csv_output_path).set_index('station_id')
display(csv_adj_matrix.head())

# Rows are the first station argument and columns the second, so this is the
# value from 'SF-Y14' to 'SF-AA15'. These are SF station IDs; pick other IDs
# when `config` is not config_sf.
csv_adj_matrix.loc['SF-Y14', 'SF-AA15']

Unnamed: 0_level_0,SF-M26,SF-M25,SF-M26-2,SF-L27,SF-K26,SF-J27,SF-M30,SF-L27-2,SF-J29-3,SF-J29,...,SF-Z7,SF-Z6,SF-W8,SF-BB17,SF-AA15,SF-Z16,SF-Y14,SF-X14,SF-X19,SF-Y25
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SF-M26,42,3,8,2,4,1,5,4,1,23,...,0,0,0,0,0,0,0,0,0,0
SF-M25,3,44,13,8,30,4,14,1,7,16,...,0,0,0,0,0,1,2,0,1,1
SF-M26-2,9,17,154,26,11,4,7,10,7,82,...,0,0,0,0,1,0,0,0,0,1
SF-L27,0,7,4,59,9,7,32,5,10,85,...,0,0,0,0,0,0,0,0,0,0
SF-K26,9,34,19,7,77,11,17,3,18,46,...,0,0,0,0,1,0,0,0,0,1


8