# Data prep

After much trial and error, it was determined that keeping the station ids as a numeric feature yielded better results than OneHotEncoding them.

This notebook produces all the files needed to run the different versions of the models.

## Feature Engineering stations

In [1]:
import pickle
import pandas as pd
import numpy as np
import yaml
from functions import extract_dt_features

In [3]:
# Load the project configuration from the YAML file one directory up.
# Every later cell reads paths from `config`, so a failure here must stop the
# notebook instead of surfacing later as an unrelated NameError.
try:
    with open("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception:
    print('Error reading config file')
    # Re-raise so the original error (missing file, malformed YAML, ...) stays visible.
    raise

In [4]:
# Read the cleaned ride counts (weekday and station id kept as strings) and
# parse the rounded start timestamp into datetimes.
data = pd.read_csv(
    config['data']['data_cleaned_after_exploration'],
    dtype={'weekday': str, 'start_station_id': str},
).assign(
    started_at_rounded=lambda frame: pd.to_datetime(
        frame['started_at_rounded'], format="%Y-%m-%d %H:%M:%S"
    )
)

In [5]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,started_at_rounded,weekday,start_station_id,count,start_station_name,start_lat,start_lng,temperature_f,precipitation_in,windspeed_mph,special_event
0,2021-07-01,4,4993.13,1,Old Slip & South St,40.703367,-74.007868,78.6,0.0,3.2,0
1,2021-07-01,4,5065.12,1,William St & Pine St,40.707179,-74.008873,78.6,0.0,3.2,0
2,2021-07-01,4,5114.06,1,West Thames St,40.708347,-74.017134,78.6,0.0,3.2,0
3,2021-07-01,4,5137.11,1,Fulton St & William St,40.709601,-74.006551,78.6,0.0,3.2,0
4,2021-07-01,4,5137.13,1,Spruce St & Gold St,40.710323,-74.004323,78.6,0.0,3.2,0


## Features to include in the dataset for modeling
- We will reduce the cardinality of the station ids
- We will do some feature engineering on the datetime to extract the info we need
- We will check whether encoding the datetime cyclically will produce better results

In [6]:
# Keep only the timestamp, station id, weather/event columns and the target
# (`count`), as an independent copy of `data`.
select_data = data.loc[
    :,
    [
        'started_at_rounded',
        'start_station_id',
        'temperature_f',
        'precipitation_in',
        'windspeed_mph',
        'special_event',
        'count',
    ],
].copy()

## Reducing cardinality of `start_station_id`

We will group stations into three categories in a new column called `region`:
- lower Manhattan east: `lme`
- lower Manhattan west: `lmw`
- lower Manhattan south: `lms`

In [7]:
# Station ids (as strings) assigned to each region.
# lme: lower Manhattan east
lme = [
    "5065.14", "5096.12", "5137.13", "5137.11", "5065.04",
    "5065.12", "4953.04", "5175.08", "5105.01", "5207.01",
]
# lmw: lower Manhattan west
lmw = [
    "5114.06", "5145.02", "5184.08", "5297.02", "5329.08",
    "5216.04", "5288.09", "5288.12", "5216.06", "5288.08",
]
# lms: the remaining (third) group of stations
lms = [
    "5033.01", "4962.01", "5001.08", "4962.02", "4962.08",
    "4993.02", "4993.13", "5073.07", "4846.01", "4889.06",
]

In [8]:
# Build a station-id -> region lookup. `lme` entries are applied last so they win
# over `lmw`, matching the original check order (lme first, then lmw).
station_to_region = {**dict.fromkeys(lmw, 'lmw'), **dict.fromkeys(lme, 'lme')}

# Any id found in neither list falls back to 'lms'.
# The ids are read from `data` (which `select_data` was copied from, so the index
# matches) rather than from `select_data`, so this cell can be re-run after the id
# column has already been dropped.
select_data['region'] = data['start_station_id'].map(
    lambda station_id: station_to_region.get(station_id, 'lms')
)

# Drop the high-cardinality id column without mutating in place; errors='ignore'
# keeps a second run of this cell from raising a KeyError.
select_data = select_data.drop(columns='start_station_id', errors='ignore')

In [9]:
# Preview the first five rows after replacing the station id with `region`.
select_data.head(n=5)

Unnamed: 0,started_at_rounded,temperature_f,precipitation_in,windspeed_mph,special_event,count,region
0,2021-07-01,78.6,0.0,3.2,0,1,lms
1,2021-07-01,78.6,0.0,3.2,0,1,lme
2,2021-07-01,78.6,0.0,3.2,0,1,lmw
3,2021-07-01,78.6,0.0,3.2,0,1,lme
4,2021-07-01,78.6,0.0,3.2,0,1,lme


In [10]:
# Persist the region-level frame (without the index column).
select_data.to_csv(
    path_or_buf='../data/cleaned/select_data_with_regions.csv',
    index=False,
)

## Making non-cyclic and cyclic data for testing

### not cyclic

In [14]:
# Derive datetime features with cyclic=False from a copy of the region-level frame,
# so `select_data` itself is not handed to the helper.
# `extract_dt_features` is imported from the project's `functions` module.
data_not_cyc = extract_dt_features(select_data.copy(), cyclic=False)

In [15]:
# Persist the non-cyclic region dataset (without the index column).
data_not_cyc.to_csv(
    path_or_buf='../data/cleaned/data_not_cyc.csv',
    index=False,
)

### cyclic

In [16]:
# Derive datetime features with cyclic=True from a copy of the region-level frame,
# so `select_data` itself is not handed to the helper.
# `extract_dt_features` is imported from the project's `functions` module.
data_cyc = extract_dt_features(select_data.copy(), cyclic=True)

In [17]:
# Persist the cyclic region dataset (without the index column).
data_cyc.to_csv(
    path_or_buf='../data/cleaned/data_region_cyc.csv',
    index=False,
)

## Data set without OneHotEncoded station ids

In [4]:
# Reload the cleaned ride counts so this section starts from the full frame,
# including the station ids; weekday and station id are read as strings.
data = pd.read_csv(
    config['data']['data_cleaned_after_exploration'],
    dtype={'weekday': str, 'start_station_id': str},
)
# Parse the rounded start timestamp into datetimes.
data = data.assign(
    started_at_rounded=pd.to_datetime(
        data['started_at_rounded'], format="%Y-%m-%d %H:%M:%S"
    )
)

In [5]:
# Preview the first five rows of the reloaded frame.
data.head(n=5)

Unnamed: 0,started_at_rounded,weekday,start_station_id,count,start_station_name,start_lat,start_lng,temperature_f,precipitation_in,windspeed_mph,special_event
0,2021-07-01,4,4993.13,1,Old Slip & South St,40.703367,-74.007868,78.6,0.0,3.2,0
1,2021-07-01,4,5065.12,1,William St & Pine St,40.707179,-74.008873,78.6,0.0,3.2,0
2,2021-07-01,4,5114.06,1,West Thames St,40.708347,-74.017134,78.6,0.0,3.2,0
3,2021-07-01,4,5137.11,1,Fulton St & William St,40.709601,-74.006551,78.6,0.0,3.2,0
4,2021-07-01,4,5137.13,1,Spruce St & Gold St,40.710323,-74.004323,78.6,0.0,3.2,0


In [6]:
# Derive cyclic (sin/cos) datetime features while keeping the station ids.
# A copy is passed, as in the region-level cells above, because `data` is reused
# for the non-cyclic variant further down.
# NOTE(review): whether `extract_dt_features` mutates its argument is not visible
# in this notebook — confirm in functions.py.
data_with_ids_cyc = extract_dt_features(data.copy(), cyclic=True)

In [7]:
# Display the cyclic-encoded frame; the rendered output is truncated to the
# first and last rows.
data_with_ids_cyc

Unnamed: 0,weekday,start_station_id,count,start_station_name,start_lat,start_lng,temperature_f,precipitation_in,windspeed_mph,special_event,week_number_sin,week_number_cos,month_sin,month_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
0,4,4993.13,1,Old Slip & South St,40.703367,-74.007868,78.6,0.0,3.2,0,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,-0.900969,0.000000,1.000000
1,4,5065.12,1,William St & Pine St,40.707179,-74.008873,78.6,0.0,3.2,0,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,-0.900969,0.000000,1.000000
2,4,5114.06,1,West Thames St,40.708347,-74.017134,78.6,0.0,3.2,0,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,-0.900969,0.000000,1.000000
3,4,5137.11,1,Fulton St & William St,40.709601,-74.006551,78.6,0.0,3.2,0,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,-0.900969,0.000000,1.000000
4,4,5137.13,1,Spruce St & Gold St,40.710323,-74.004323,78.6,0.0,3.2,0,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,-0.900969,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321124,5,5216.06,9,Vesey St & Church St,40.712455,-74.010822,68.8,0.0,4.7,0,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.900969,-0.258819,0.965926
321125,5,5288.08,1,Park Pl & Church St,40.713089,-74.009329,68.8,0.0,4.7,0,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.900969,-0.258819,0.965926
321126,5,5288.09,4,Warren St & W Broadway,40.714740,-74.009106,68.8,0.0,4.7,0,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.900969,-0.258819,0.965926
321127,5,5288.12,1,Murray St & Greenwich St,40.714694,-74.011219,68.8,0.0,4.7,0,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.900969,-0.258819,0.965926


In [8]:
# Persist the cyclic dataset that keeps station ids (without the index column).
data_with_ids_cyc.to_csv(
    path_or_buf='../data/cleaned/data_with_ids_cyc.csv',
    index=False,
)

In [9]:
# Derive datetime features with cyclic=False while keeping the station ids.
# A copy is passed, as in the region-level cells above, so `data` is not handed
# to the helper directly and the cell can be re-run.
# NOTE(review): whether `extract_dt_features` mutates its argument is not visible
# in this notebook — confirm in functions.py.
data_with_ids_not_cyc = extract_dt_features(data.copy(), cyclic=False)

In [10]:
# Persist the non-cyclic dataset that keeps station ids (without the index column).
data_with_ids_not_cyc.to_csv(
    path_or_buf="../data/cleaned/data_with_ids_not_cyc.csv",
    index=False,
)