In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
import csv

In [2]:
df_hospital=pd.read_csv("processed_data/dublinbikes_james_hospital.csv")
df_portobello=pd.read_csv("processed_data/dublinbikes_portobello.csv")

Lets get some basic features like day of week, weekend, time

In [3]:
def get_basic_features(df:pd.DataFrame):
    df["TIME"]=pd.to_datetime(df["TIME"])
    df["DAY OF WEEK"]=df["TIME"].dt.dayofweek-1
    df["HOUR"]=df["TIME"].dt.strftime("%H").astype(int)
    df["MINUTE"]=df["TIME"].dt.strftime("%M").astype(int)

    
    return df

In [4]:
df_hospital = get_basic_features(df_hospital)
df_portobello = get_basic_features(df_portobello)

In [5]:
df_hospital[500:520]

Unnamed: 0,TIME,BIKE STANDS,AVAILABLE BIKES,DAY OF WEEK,HOUR,MINUTE
500,2020-01-29 17:40:02,40,12,1,17,40
501,2020-01-29 17:45:02,40,10,1,17,45
502,2020-01-29 17:50:02,40,12,1,17,50
503,2020-01-29 17:55:02,40,12,1,17,55
504,2020-01-29 18:00:02,40,13,1,18,0
505,2020-01-29 18:05:02,40,13,1,18,5
506,2020-01-29 18:10:02,40,15,1,18,10
507,2020-01-29 18:15:02,40,15,1,18,15
508,2020-01-29 18:20:02,40,14,1,18,20
509,2020-01-29 18:25:02,40,14,1,18,25


Lets add some more complex features using weekly, daily and short-term seasonality trends in the data

For short term trends lets use intervals of 15 min, and get the last three intervals

In [6]:
def get_trend_features(df:pd.DataFrame, prediction_time : int):
    prediction_time = np.trunc(prediction_time/5) - 1
    prediction_time=prediction_time.astype(int)
    df["10MIN"] = df["AVAILABLE BIKES"].shift(2+prediction_time)
    df["20MIN"] = df["AVAILABLE BIKES"].shift(4+prediction_time)
    df["30MIN"] = df["AVAILABLE BIKES"].shift(6+prediction_time)
    df["1DAY"] = df["AVAILABLE BIKES"].shift(12*24)
    df["2DAY"] = df["AVAILABLE BIKES"].shift(12*24*2)
    df["3DAY"] = df["AVAILABLE BIKES"].shift(12*24*3)
    df["1WEEK"] = df["AVAILABLE BIKES"].shift(12*24*7)
    df.dropna(inplace=True)
    return df

In [7]:
def normalize_features(df:pd.DataFrame):
    df["AVAILABLE BIKES"] = df["AVAILABLE BIKES"]/df["BIKE STANDS"]
    df["DAY OF WEEK"] = df["DAY OF WEEK"] / 6
    df["HOUR"] = df["HOUR"] / max(df["HOUR"])
    df["MINUTE"] = df["MINUTE"] / max(df["MINUTE"])
    return df

In [8]:
df_hospital=normalize_features(df_hospital)
df_portobello=normalize_features(df_portobello)


In [9]:
df_hospital=get_trend_features(df_hospital, 30)
df_portobello=get_trend_features(df_portobello, 30)

Unnamed: 0,TIME,BIKE STANDS,AVAILABLE BIKES,DAY OF WEEK,HOUR,MINUTE,20MIN,40MIN,60MIN,1DAY,2DAY,3DAY,1WEEK
3016,2020-02-07 11:20:02,40,2,3,11,20,2.0,0.0,2.0,3.0,5.0,2.0,0.0
3017,2020-02-07 11:25:02,40,2,3,11,25,2.0,0.0,0.0,3.0,5.0,2.0,0.0
3018,2020-02-07 11:30:02,40,2,3,11,30,2.0,2.0,0.0,3.0,5.0,2.0,1.0
3019,2020-02-07 11:35:02,40,2,3,11,35,2.0,2.0,0.0,2.0,5.0,0.0,1.0
3020,2020-02-07 11:40:02,40,2,3,11,40,2.0,2.0,0.0,2.0,5.0,0.0,0.0
3021,2020-02-07 11:45:02,40,2,3,11,45,2.0,2.0,2.0,2.0,4.0,0.0,0.0
3022,2020-02-07 11:50:02,40,2,3,11,50,2.0,2.0,2.0,2.0,4.0,0.0,0.0
3023,2020-02-07 11:55:02,40,2,3,11,55,2.0,2.0,2.0,2.0,5.0,0.0,1.0
3024,2020-02-07 12:00:02,40,1,3,12,0,2.0,2.0,2.0,1.0,5.0,0.0,0.0
3025,2020-02-07 12:05:02,40,1,3,12,5,2.0,2.0,2.0,1.0,5.0,0.0,1.0
