In [4]:
# 读入数据

import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)

# Station metadata, read from ./data/stations-info.csv (first row is the header).
station_locations_df = pd.read_csv(
    './data/stations-info.csv',
    header=0,
    dtype={
        'station': str,
        'longitude': float,
        'latitude': float,
    }
)

# Passenger-flow records, read from ./data/passenger-flow.csv (first row is the header).
passenger_flow_df = pd.read_csv(
    './data/passenger-flow.csv',
    header=0,
    dtype={
        'ID': str,
        'passengers-in': int,
        'passengers-out': int,
        'station': str,
    }
)

# Parse the timestamps in one vectorized call with an explicit format.
# The `date_parser=` argument of read_csv is deprecated since pandas 2.0 and
# invoked strptime once per value; pd.to_datetime works on old and new pandas.
passenger_flow_df['datetime'] = pd.to_datetime(
    passenger_flow_df['datetime'],
    format='%Y/%m/%d %H:%M',
)

# Day of month as a plain integer column, used for per-day grouping.
# `.dt.day` is vectorized; the previous row-wise apply formatted every
# timestamp to a string and cast it back to int.
passenger_flow_df['day'] = passenger_flow_df['datetime'].dt.day.astype(int)

In [5]:
# Total passengers entering and leaving, summed per day.
#
# day  passengers-in  passengers-out
#  01          89553           89553
#  02         106890          106890
#  03          98175           98175
#  04          89815           89815
#  05         119204          119204
#  06         110510          110510
#  07         122495          122495
#  08         121978          121978
#  09         113252          113252
#  10          77984           77984
#  11         118952          118952
#  12         101863          101863
#  13         118966          118966
#  14         122422          122422

# Sum only the two count columns. A bare `.sum()` would also try to aggregate
# the 'datetime', 'ID' and 'station' columns, which on pandas >= 2.0 raises for
# datetimes (or concatenates strings) instead of producing the table above.
# groupby returns a new frame, so no defensive copy of the source is needed.
df = passenger_flow_df.groupby('day')[['passengers-in', 'passengers-out']].sum()

# print(df)
# df.plot()

In [6]:
# Passengers entering and leaving each station, per day and per 15-minute slot.

stations_list = station_locations_df['id'].tolist()

# Slot labels 'HH:MM' every 15 minutes, from 06:00 through 22:45.
time_list = [
    f'{hour:02d}:{minute:02d}'
    for hour in range(6, 22 + 1)
    for minute in range(0, 45 + 1, 15)
]

zero_data_list = [[0] * len(stations_list) for _ in range(len(time_list))]

for day in range(1, 1 + 1):
    # One table per direction: rows are time slots, columns are stations,
    # every cell starts at 0. Both tables are rebuilt on each iteration, so
    # after the loop they hold the last processed day only.
    day_in_df = pd.DataFrame(zero_data_list, columns=stations_list, index=time_list, dtype='int64')
    day_out_df = pd.DataFrame(zero_data_list, columns=stations_list, index=time_list, dtype='int64')

    # Narrow the records to this calendar day once, instead of re-scanning the
    # whole table for every (station, slot) pair.
    day_start = pd.Timestamp(year=2020, month=9, day=day)
    day_rows_df = passenger_flow_df[passenger_flow_df['datetime'].dt.normalize() == day_start]

    for station in stations_list:
        station_rows_df = day_rows_df[day_rows_df['station'] == station]
        for time in time_list:
            hour = int(time[:2])
            minute = int(time[3:])
            # Compare against an explicit Timestamp rather than a hand-built
            # date string that depends on lenient string parsing.
            slot = day_start.replace(hour=hour, minute=minute)
            single_df = station_rows_df[station_rows_df['datetime'] == slot]
            # Exactly one record is expected per (station, slot); a missing or
            # duplicated record is reported and the cell keeps its 0.
            if single_df.shape[0] != 1:
                print(f'2020/09/{day} {time} {station} is not found.')
                continue
            day_in_df.loc[time, station] = single_df['passengers-in'].iloc[0]
            day_out_df.loc[time, station] = single_df['passengers-out'].iloc[0]
    # print(day_in_df)
    # print(day_out_df)


2020/09/1 22:45 2a is not found.
2020/09/1 22:45 2b is not found.
2020/09/1 22:45 2d is not found.
