This notebook generates an adjacency matrix for stations or clusters.

In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from functools import reduce
from geopy import Point
from geopy import distance

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
EXPORT_DIR = '../data/exports/adjacency_matrix'
CENTROIDS_DIR = '../data/exports'
LABELS_DIR = '../data/exports'
SHAPEFILE_DIR = '../data/shapefiles/zipcodes'
STATIONS_DIR = '../data/exports'
TRIPS_DIR = '../data/raw'

# TODO(cpcarey): Convert to enum.
# Options include: 'displacement', 'elevation', 'trip_count',
# 'trip_count_classic', 'trip_count_electric'
VARIABLE = 'elevation'

TRIP_DATES = [
    '202007',
    '202008',
    '202009',
    '202010',
    '202011',
    '202012',
    '202101',
    '202102',
]

In [3]:
CLUSTER = True
K = 9
NODE_TYPE = 'label' if CLUSTER else 'station_id'
ID1 = f'start_{NODE_TYPE}'
ID2 = f'end_{NODE_TYPE}'

if CLUSTER:
    EXPORT_DIR = f'{EXPORT_DIR}/k{K}'

In [4]:
class AnalysisConfig:

    def __init__(self,
                 centroids_path='',
                 export_path='',
                 labels_path='',
                 stations_path='',
                 trips_path_suffix=''):
        self.centroids_path = centroids_path
        self.export_path = export_path
        self.labels_path = labels_path
        self.stations_path = stations_path
        self.trips_path_suffix = trips_path_suffix
        self.station_ids = None

    def get_station_ids(self):
        # Cache value after calculation.
        if self.station_ids == None:
            self.station_ids = set(
                pd.read_csv(self.stations_path)['station_id'].astype(str))
        return self.station_ids

    def get_trips_dfs(self):
        trips_paths = [
            '{}/{}{}'.format(TRIPS_DIR, date, self.trips_path_suffix)
            for date in TRIP_DATES
        ]
        dfs = [pd.read_csv(path) for path in trips_paths]
        for df in dfs:
            df['start_station_id'] = df['start_station_id'].astype(str)
            df['end_station_id'] = df['end_station_id'].astype(str)
        return dfs

In [5]:
config_sf = AnalysisConfig(
    centroids_path=f'{CENTROIDS_DIR}/centroids_k{K}_sf.csv',
    export_path=f'{EXPORT_DIR}/{VARIABLE}_sf.csv',
    labels_path=f'{LABELS_DIR}/cluster_labels_k{K}_sf.csv',
    stations_path=f'{STATIONS_DIR}/SF_ele_single station.csv',
    trips_path_suffix='-baywheels-tripdata.csv',
)

config_dc = AnalysisConfig(
    centroids_path=f'{CENTROIDS_DIR}/centroids_k{K}_dc.csv',
    export_path=f'{EXPORT_DIR}/{VARIABLE}_dc.csv',
    labels_path=f'{LABELS_DIR}/cluster_labels_k{K}_dc.csv',
    stations_path=f'{STATIONS_DIR}/DC_ele_single station.csv',
    trips_path_suffix='-capitalbikeshare-tripdata.csv',
)

In [6]:
config = config_sf

In [7]:
def clean_trips(df, config):
    """Drops missing and non-matching station IDs."""
    REQUIRED_COLUMNS = ['start_station_id', 'end_station_id']
    
    # Drop missing station IDs.
    new_df = df.dropna(subset=REQUIRED_COLUMNS)
    
    # Drop non-matching station IDs.
    for column in REQUIRED_COLUMNS:
        new_df = new_df[new_df[column].isin(config.get_station_ids())]
    return new_df

In [8]:
if 'trip_count' in VARIABLE:
    trips_dfs = [clean_trips(df, config) for df in config.get_trips_dfs()]
    all_trips_df = pd.concat(trips_dfs, ignore_index=True)

In [9]:
if 'trip_count' in VARIABLE:
    grouping_df = all_trips_df
    if 'classic' in VARIABLE:
        # WARNING: SF changes 'docked_bike' to 'classic_bike' over time period.
        grouping_df = grouping_df[grouping_df['rideable_type'].isin(['classic_bike', 'docked_bike'])]
    if 'electric' in VARIABLE:
        grouping_df = grouping_df[grouping_df['rideable_type'] == 'electric_bike']
    
    all_trips_counts = grouping_df.groupby(['start_station_id',
                                             'end_station_id']).agg({
                                                 'ride_id': 'count'
                                             }).rename(columns={
                                                 'ride_id': 'trip_count',
                                             }).reset_index()
        
if 'trip_count' in VARIABLE:
    display(all_trips_counts)

In [10]:
if 'trip_count' in VARIABLE:
    if CLUSTER:
        start_df = pd.read_csv(config.labels_path).rename(columns={
            'station_id': 'start_station_id',
            'label': 'start_label',
        })
        end_df = pd.read_csv(config.labels_path).rename(columns={
            'station_id': 'end_station_id',
            'label': 'end_label',
        })
        cluster_counts_df = pd.merge(all_trips_counts,
                                     start_df,
                                     on='start_station_id',
                                     how='left')
        cluster_counts_df = pd.merge(cluster_counts_df,
                                     end_df,
                                     on='end_station_id',
                                     how='left')
        cluster_counts_df = cluster_counts_df.drop(
            columns=['start_station_id', 'end_station_id'])
        cluster_counts_df = cluster_counts_df.groupby(['start_label',
                                                       'end_label']).agg({
                                                           'trip_count': 'sum'
                                                       }).reset_index()
        display(cluster_counts_df)

In [11]:
centroids_df = None
if CLUSTER:
    centroids_df = pd.read_csv(config.centroids_path)
    display(centroids_df.head())

Unnamed: 0,lat,lng,elevation,count
0,37.776584,-122.452796,79.083333,24
1,37.766722,-122.408793,10.46875,64
2,37.769966,-122.425102,30.815789,38
3,37.725686,-122.451802,54.0,9
4,37.738995,-122.403664,12.666667,18


In [12]:
nodes_df = centroids_df if CLUSTER else stations_df

def get_distance(point1, point2):
    return distance.geodesic(point1, point2).m

def get_point(node_id):
    return Point(nodes_df.loc[node_id]['lat'], nodes_df.loc[node_id]['lng'])

def get_displacement(node_id1, node_id2):
    return get_distance(get_point(node_id1), get_point(node_id2))

def get_elevation_change(node_id1, node_id2):
    return (nodes_df.loc[node_id2]['elevation'] - 
            nodes_df.loc[node_id1]['elevation'])

def get_gradient(node_id1, node_id2):
    return get_elevation_change(node_id1, node_id2) / get_displacement(node_id1, node_id2)

def get_trip_count(node_id1, node_id2):
    df = cluster_counts_df if CLUSTER else all_trips_counts
    NODE_ID = 'label' if CLUSTER else 'station_id'
    
    row = df[(df[f'start_{NODE_ID}'] == node_id1) &
             (df[f'end_{NODE_ID}'] == node_id2)]
    if len(row) == 0:
        return 0
    return row.iloc[:, -1:].values[0][0]

In [13]:
adj_matrix = pd.DataFrame(index=nodes_df.index, columns=nodes_df.index)

if VARIABLE == 'displacement':
    adj_matrix = adj_matrix.apply(lambda row: row.index.to_series().apply(
        lambda col_name: get_displacement(row.name, col_name)),
                                  axis=1)
elif VARIABLE == 'elevation':
    adj_matrix = adj_matrix.apply(lambda row: row.index.to_series().apply(
        lambda col_name: get_elevation_change(row.name, col_name)),
                                  axis=1)
elif 'trip_count' in VARIABLE:
    adj_matrix = adj_matrix.apply(lambda row: row.index.to_series().apply(
        lambda col_name: get_trip_count(row.name, col_name)), axis=1)
    
adj_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,4028.376035,2548.100208,5649.906768,6012.579703,4787.701373,2378.324012,4487.01953,1728.219076
1,4028.376035,0.0,1481.318555,5925.658307,3110.53114,2521.724718,6400.663488,5260.619833,2326.343695
2,2548.100208,1481.318555,0.0,5448.978173,3922.434916,2983.196706,4919.356818,4554.900481,877.64446
3,5649.906768,5925.658307,5448.978173,0.0,4493.181472,8304.873607,6552.279125,1187.493908,5641.561516
4,6012.579703,3110.53114,3922.434916,4493.181472,0.0,5499.357377,8105.923048,4486.356756,4718.944657


In [14]:
adj_matrix.index = adj_matrix.index.rename(NODE_TYPE)
adj_matrix.to_csv(config.export_path)

In [15]:
csv_adj_matrix = pd.read_csv(config.export_path, index_col=1)
display(csv_adj_matrix.head())

Unnamed: 0_level_0,label,1,2,3,4,5,6,7,8
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,0,4028.376035,2548.100208,5649.906768,6012.579703,4787.701373,2378.324012,4487.01953,1728.219076
4028.376035,1,0.0,1481.318555,5925.658307,3110.53114,2521.724718,6400.663488,5260.619833,2326.343695
2548.100208,2,1481.318555,0.0,5448.978173,3922.434916,2983.196706,4919.356818,4554.900481,877.64446
5649.906768,3,5925.658307,5448.978173,0.0,4493.181472,8304.873607,6552.279125,1187.493908,5641.561516
6012.579703,4,3110.53114,3922.434916,4493.181472,0.0,5499.357377,8105.923048,4486.356756,4718.944657
