In [None]:
import pandas as pd
import os.path as path
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
def _count_trips_by_half_day(trips, station_col, time_col, count_name):
    """Count trips per (station, calendar date, half-day) bucket.

    Parameters
    ----------
    trips : pandas.DataFrame
        Frame containing ``station_col`` and a datetime column ``time_col``.
    station_col : str
        Name of the station-id column to group by.
    time_col : str
        Name of the datetime column the date and half-day are derived from.
    count_name : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Columns ``[station_col, 'date', 'half_day', count_name]`` with one row
        per non-empty bucket. ``half_day`` is 0 for hours 0-11 and 1 for
        hours 12-23.
    """
    timestamps = trips[time_col]
    # Build a fresh frame rather than adding columns to a slice of the raw
    # data; that avoids SettingWithCopyWarning and never touches the input.
    keyed = pd.DataFrame({
        station_col: trips[station_col],
        'date': timestamps.dt.normalize(),
        'half_day': (timestamps.dt.hour > 11).astype('int64'),
    })
    return (
        keyed.groupby([station_col, 'date', 'half_day'])
        .size()
        .to_frame(name=count_name)
        .reset_index()
    )


def preprocess(year_month):
    """Aggregate monthly trip CSVs into half-day pickup and drop-off counts.

    For every key ``ym`` in ``year_month`` the file
    ``"<ym>-citibike-tripdata.csv"`` is read from the current working
    directory; keys whose file does not exist are skipped silently. Column
    names are normalised (lower-cased, spaces and underscores removed,
    ``startedat``/``endedat`` renamed to ``starttime``/``stoptime``) so that
    both header spellings handled here end up with the same column names.

    Parameters
    ----------
    year_month : iterable of str
        File-name prefixes, e.g. ``"202104"``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or (None, None)
        ``pickup_summary`` with columns
        ``['startstationid', 'date', 'half_day', 'pickup']`` and
        ``dropoff_summary`` with columns
        ``['endstationid', 'date', 'half_day', 'dropoff']``.
        ``(None, None)`` is returned when none of the files exist.
        Each month's rows keep their own 0..n-1 index.
    """
    pickup_parts = []
    dropoff_parts = []
    pickup_rows = 0
    dropoff_rows = 0

    for ym in year_month:
        file = "{}-citibike-tripdata.csv".format(ym)
        if not path.exists(file):
            continue

        print("Processing {}".format(ym), end=', ')
        raw = pd.read_csv(file)
        raw.columns = (
            raw.columns.str.lower()
            .str.replace(' ', '', regex=False)
            .str.replace('_', '', regex=False)
            .str.replace('startedat', 'starttime', regex=False)
            .str.replace('endedat', 'stoptime', regex=False)
        )
        raw['starttime'] = pd.to_datetime(raw['starttime'])
        raw['stoptime'] = pd.to_datetime(raw['stoptime'])

        count_pickup = _count_trips_by_half_day(
            raw, 'startstationid', 'starttime', 'pickup')
        # The drop-off half-day must come from the stop time, not the start
        # time: a ride can start before noon and end after it.
        count_dropoff = _count_trips_by_half_day(
            raw, 'endstationid', 'stoptime', 'dropoff')

        pickup_parts.append(count_pickup)
        dropoff_parts.append(count_dropoff)
        pickup_rows += len(count_pickup)
        dropoff_rows += len(count_dropoff)

        # Running totals across all months processed so far.
        print("Pickup data count: ", pickup_rows, end=', ')
        print("Dropoff data count: ", dropoff_rows)

    if not pickup_parts:
        return None, None

    # One concat at the end instead of DataFrame.append per month (removed in
    # pandas 2.0, and quadratic). The index is deliberately not reset so the
    # result matches what repeated append produced.
    return pd.concat(pickup_parts), pd.concat(dropoff_parts)

In [None]:
%%time
year_month = []
for y in range(2013,2022):
    for m in range(1,13):
        ym = str(y)+"{0:0=2d}".format(m)
        year_month.append(ym)
pickup_summary, dropoff_summary = preprocess(year_month[99:100])
pickup_summary.to_csv('pickup_summary.csv')
dropoff_summary.to_csv('dropoff_summary.csv')

Processing 202104, Pickup data count:  110543, Dropoff data count:  112595
Wall time: 11.3 s
