In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

# Record the library versions this notebook was executed with.
print(f"Pandas Version: {pd.__version__}")
print(f"Numpy Version: {np.__version__}")

Pandas Version: 1.2.0
Numpy Version: 1.18.5


In [3]:
# Load the cleaned trip records; both trip timestamps are parsed as datetimes.
trips = pd.read_csv(
    "../data/clean/trips.csv",
    parse_dates=["date_from", "date_until"],
)
trips.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km
7669947,116255,5BB54A7EBCD7A5A88FD410A537E10160BA120BB2,2017-05-16 07:15:40,2017-05-16 07:19:49,Heußweg/Wiesenstraße,201326,Lappenbergsallee / Bei der Apostelkirche,243618,Terminal HH_11 (-2225-),5,0.620216
7669948,119663,1024F6970D5BE146588D64F6AF427E147ADC642E,2017-05-16 07:36:36,2017-05-16 07:44:16,Bahnhof Altona Ost/Max-Brauer-Allee,131646,Neuer Pferdemarkt / Beim Grünen Jäger,131890,iPhone SRH,8,1.990734
7669949,120488,CC6405146B51242A9169AB55E88A5C472EA1B2AA,2017-05-16 07:40:17,2017-05-16 07:50:07,Weidestraße/Biedermannplatz,211922,Mundsburg / Schürbeker Straße,140799,Techniker HH_119 (-2334-),10,1.24115


In [8]:
# Load the cleaned weather observations; `datetime` is parsed as a timestamp.
weather = pd.read_csv(
    "../data/clean/weather.csv",
    parse_dates=["datetime"],
)
weather.head(3)

Unnamed: 0,datetime,precip_intensity,precip_probability,precip_type,temperature,humidity,wind_speed,wind_bearing,uv_index,visibility
0,2014-01-01 00:00:00,0.0,0.0,none,1.93,0.81,3.91,161.0,0,9.988
1,2014-01-01 01:00:00,0.0,0.0,none,1.94,0.85,4.56,140.0,0,6.004
2,2014-01-01 02:00:00,0.0,0.0,none,1.81,0.85,4.12,143.0,0,9.988


In [9]:
# Load the cleaned station master data.
stations = pd.read_csv(filepath_or_buffer="../data/clean/stations.csv")
stations.head(3)

Unnamed: 0,station_id,name,longitude,latitude
0,131543,Landungsbrücke/Hafentor,9.9723,53.5457
1,131546,Fischmarkt/Breite Straße,9.95088,53.5462
2,131547,Paulinenplatz/Wohlwillstraße,9.96246,53.5542


## Aggregate station activity per hour and add weather data

TODO: Add number of customers

### Counting check-ins and check-outs per station per hour

In [10]:
# Work on an independent copy holding only the timestamp and station-name
# columns of `trips`.
trips_slice = trips.loc[
    :, ["date_from", "date_until", "start_station_name", "end_station_name"]
].copy()

In [11]:
# Number of trips that ended at each station, bucketed by the hour of
# `date_until`. The index levels are renamed to ('datetime', 'station_name').
check_ins = (
    trips_slice
    .groupby([pd.Grouper(key="date_until", freq="H"), "end_station_name"])
    .size()
    .to_frame("check_ins")
    .rename_axis(index=["datetime", "station_name"])
)
check_ins.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,check_ins
datetime,station_name,Unnamed: 2_level_1
2015-04-22 00:00:00,Uhlandstraße / Eingang Nord,1
2016-03-17 11:00:00,Mundsburg / Schürbeker Straße,1
2015-12-29 20:00:00,Weidestraße/Biedermannplatz,1


In [12]:
# Number of trips that started at each station, bucketed by the hour of
# `date_from`. The index levels are renamed to ('datetime', 'station_name').
check_outs = (
    trips_slice
    .groupby([pd.Grouper(key="date_from", freq="H"), "start_station_name"])
    .size()
    .to_frame("check_outs")
    .rename_axis(index=["datetime", "station_name"])
)
check_outs.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,check_outs
datetime,station_name,Unnamed: 2_level_1
2015-02-03 17:00:00,Schopenstehl/Alter Fischmarkt,1
2015-04-26 01:00:00,Eimsbütteler Straße/Waterloostraße,1
2017-04-05 21:00:00,Eimsbütteler Straße/Waterloostraße,1


### Create an empty data set with one row per station per hour and fill in the aggregated counts

In [13]:
# Hourly timestamps covering the observation window. With an hourly frequency
# the last generated slot is 2017-05-15 11:00.
start_date = datetime(2014, 1, 1, 0)
end_date = datetime(2017, 5, 15, 11, 59, 59)
date_range = pd.date_range(start=start_date, end=end_date, freq="H")

# Alphabetically sorted list of the distinct station names.
station_names = sorted(stations["name"].unique())

# One index entry for every (hour, station) combination.
index = pd.MultiIndex.from_product(
    [date_range, station_names],
    names=["datetime", "station_name"],
)

# Frame that has the full index but no columns yet.
empty_df = pd.DataFrame(index=index)

# Attach the hourly counts. Combinations without any observed check-in or
# check-out get 0, the counts are cast to integers, and `diff` holds the change
# in bike inventory (check-ins minus check-outs). Finally the index levels are
# turned back into ordinary columns.
station_count = (
    empty_df
    .join(check_ins)
    .join(check_outs)
    .fillna(0)
    .astype({"check_ins": int, "check_outs": int})
    .assign(diff=lambda counts: counts["check_ins"] - counts["check_outs"])
    .reset_index()
)

In [14]:
# Attach the weather columns to every (hour, station) row. The left join keeps
# all rows of `station_count`, including hours with no matching weather record.
station_trips_weather = station_count.merge(
    weather,
    how="left",
    on="datetime",
    sort=False,
)
station_trips_weather.sample(3)

Unnamed: 0,datetime,station_name,check_ins,check_outs,diff,precip_intensity,precip_probability,precip_type,temperature,humidity,wind_speed,wind_bearing,uv_index,visibility
728510,2014-05-26 22:00:00,Innocentiapark/Oberstraße,1,1,0,0.0,0.0,none,17.29,0.71,3.7,54.0,0.0,9.988
2841743,2015-07-24 06:00:00,Emil-Andresen-Straße / Lohkoppelweg,0,0,0,0.0,0.0,none,13.19,0.92,1.91,114.0,0.0,10.003
2712517,2015-06-28 08:00:00,Wandsbeker Chaussee/Ritterstraße,1,1,0,0.0,0.0,none,13.46,0.94,4.03,247.0,1.0,10.003


In [15]:
# Persist the per-station, per-hour data set without writing the row index.
station_trips_weather.to_csv(
    path_or_buf="../data/clean/station_trips_weather.csv",
    index=False,
)

## Add weather data to each trip

In [16]:
# Add 'merge_datetime' as key to merge on: the trip's start time truncated to
# the full hour. Flooring the column directly keeps the index of `trips`, so the
# assignment does not depend on `trips` having a default RangeIndex, and it
# avoids rebuilding the timestamp from separate year/month/day/hour parts.
# `pd.to_datetime` is a no-op for an already parsed datetime column and
# converts it otherwise.
trips["merge_datetime"] = pd.to_datetime(trips["date_from"]).dt.floor("H")

In [17]:
# Add weather data to every trip by matching the trip's hourly merge key against
# the weather timestamp; both key columns are removed from the result.
#
# NOTE(review): this is an inner join (the default of pd.merge), so trips whose
# hour has no row in `weather` are not part of the result. Confirm that this is
# intended; the station-level merge above uses how='left' instead.
trips_with_weather = (
    trips
    .merge(weather, how="inner", left_on="merge_datetime", right_on="datetime")
    .drop(columns=["merge_datetime", "datetime"])
)
trips_with_weather.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,precip_intensity,precip_probability,precip_type,temperature,humidity,wind_speed,wind_bearing,uv_index,visibility
0,143517,A821059B555C7764A2FF801180874A2FCB326222,2014-01-01 00:34:54,2014-01-01 00:50:14,U-Bahn Baumwall,214170,Mönckebergstraße / Rosenstraße,131880,iPhone SRH,16,1.293661,0.0,0.0,none,1.93,0.81,3.91,161.0,0,9.988
1,120450,2C6FD52D5611B4DAC29E5197B67979D583685948,2014-01-01 00:27:47,2014-01-01 00:34:41,Lange Reihe / Lohmühlenpark,138385,Hauptbahnhof Ost / Hachmannplatz,131873,IVR,7,0.786254,0.0,0.0,none,1.93,0.81,3.91,161.0,0,9.988
2,117503,253528F56A582E05F13D8A89B20A23800DDEBEF2,2014-01-01 00:49:02,2014-01-01 00:56:32,Sternschanze / Eingang Dänenweg,139501,Bahnhof Dammtor Süd / Marseiller Straße,138382,iPhone SRH,8,1.42285,0.0,0.0,none,1.93,0.81,3.91,161.0,0,9.988


In [18]:
# Persist the trips enriched with weather data without writing the row index.
trips_with_weather.to_csv(
    path_or_buf="../data/clean/trips_with_weather.csv",
    index=False,
)