# Modeling - KNN Regression

In [1]:
import pickle
import pandas as pd
import numpy as np
import yaml

In [2]:
try:
    with open("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    print('Error reading config file')

In [3]:
data = pd.read_csv(config['data']['data_cleaned_after_exploration'], dtype={'weekday':str, 'start_station_id':str})
data['started_at_rounded'] = pd.to_datetime(data['started_at_rounded'], format="%Y-%m-%d %H:%M:%S")

In [4]:
data.head()

Unnamed: 0,started_at_rounded,weekday,start_station_id,count,start_station_name,start_lat,start_lng,temperature_f,precipitation_in,windspeed_mph,special_event
0,2021-07-01,4,4993.13,1,Old Slip & South St,40.703367,-74.007868,78.6,0.0,3.2,0
1,2021-07-01,4,5065.12,1,William St & Pine St,40.707179,-74.008873,78.6,0.0,3.2,0
2,2021-07-01,4,5114.06,1,West Thames St,40.708347,-74.017134,78.6,0.0,3.2,0
3,2021-07-01,4,5137.11,1,Fulton St & William St,40.709601,-74.006551,78.6,0.0,3.2,0
4,2021-07-01,4,5137.13,1,Spruce St & Gold St,40.710323,-74.004323,78.6,0.0,3.2,0


## Features to include in the dataset for modeling
- We will reduce the dimensionality of the station ids
- We will do some feature engineering on the datetime to extract the info we need
- We will check whether encoding the datetime cyclically will produce better results

In [5]:
select_data = data[['started_at_rounded', 'start_station_id', 'temperature_f', 'precipitation_in', 'windspeed_mph', 'special_event', 'count']].copy()

## Reducing cardinality of `start_station_id`

We will group stations into three categories in a new column called `region`:
- lower Manhattan east: `lme`
- lower Manhattan west: `lmw`
- lower Manhattan south: `lmw`

In [6]:
lme = ["5065.14", "5096.12", "5137.13", "5137.11","5065.04", "5065.12","4953.04", "5175.08",  "5207.01"]
lmw = ["5114.06","5145.02","5184.08","5105.01", "5297.02", "5329.08", "5216.04", "5288.09",  "5288.12", "5216.06", "5288.08"]
lms = ["5033.01","4962.01","5001.08","4962.02","4962.08","4993.02", "4993.13", "5073.07", "4846.01","4889.06"]

In [7]:
select_data['region'] = list(map(lambda x: 'lme' if x in lme else ('lmw' if x in lmw else 'lms'), select_data['start_station_id']))
select_data.drop('start_station_id', axis=1, inplace=True)

In [8]:
select_data.head()

Unnamed: 0,started_at_rounded,temperature_f,precipitation_in,windspeed_mph,special_event,count,region
0,2021-07-01,78.6,0.0,3.2,0,1,lms
1,2021-07-01,78.6,0.0,3.2,0,1,lme
2,2021-07-01,78.6,0.0,3.2,0,1,lmw
3,2021-07-01,78.6,0.0,3.2,0,1,lme
4,2021-07-01,78.6,0.0,3.2,0,1,lme


## Feature Engineering Datetime

We will try two ways of representing the datetime information: cyclically and not, to see which way produces better results

In [9]:
def extract_dt_features(df, cyclic=False):
    df_temp = df.copy()

    if cyclic == False:
        df_temp['week_number'] = df_temp['started_at_rounded'].dt.isocalendar().week
        df_temp['month'] = df_temp['started_at_rounded'].dt.month
        df_temp['weekday'] = df_temp['started_at_rounded'].dt.dayofweek # Mon =0, Sun = 6
        df_temp['hour'] = df_temp['started_at_rounded'].dt.hour #0 to 23
    
    
    else:
        df_temp['week_number_sin'] = np.sin(2 * np.pi * df_temp['started_at_rounded'].dt.isocalendar().week/52.0)
        df_temp['week_number_cos'] = np.cos(2 * np.pi * df_temp['started_at_rounded'].dt.isocalendar().week/52.0)
        
        
        df_temp['month_sin'] = np.sin(2 * np.pi * df_temp['started_at_rounded'].dt.month/12.0)
        df_temp['month_cos'] = np.cos(2 * np.pi * df_temp['started_at_rounded'].dt.month/12.0)
        
        
        df_temp['weekday_sin'] = np.sin(2 * np.pi * df_temp['started_at_rounded'].dt.dayofweek/7.0)
        df_temp['weekday_cos'] = np.sin(2 * np.pi * df_temp['started_at_rounded'].dt.dayofweek/7.0) 
        
        
        df_temp['hour_sin'] = np.sin(2 * np.pi * df_temp['started_at_rounded'].dt.hour/24.0) 
        df_temp['hour_cos'] = np.cos(2 * np.pi * df_temp['started_at_rounded'].dt.hour/24.0) 

    #drop datetime col
    df_temp.drop('started_at_rounded', axis=1, inplace=True)

    return df_temp

## Not cyclically

In [10]:
data_not_cyc = select_data.copy()
data_not_cyc = extract_dt_features(data_not_cyc, cyclic=False)

In [11]:
data_not_cyc

Unnamed: 0,temperature_f,precipitation_in,windspeed_mph,special_event,count,region,week_number,month,weekday,hour
0,78.6,0.0,3.2,0,1,lms,26,7,3,0
1,78.6,0.0,3.2,0,1,lme,26,7,3,0
2,78.6,0.0,3.2,0,1,lmw,26,7,3,0
3,78.6,0.0,3.2,0,1,lme,26,7,3,0
4,78.6,0.0,3.2,0,1,lme,26,7,3,0
...,...,...,...,...,...,...,...,...,...,...
321124,68.8,0.0,4.7,0,9,lmw,26,6,4,23
321125,68.8,0.0,4.7,0,1,lmw,26,6,4,23
321126,68.8,0.0,4.7,0,4,lmw,26,6,4,23
321127,68.8,0.0,4.7,0,1,lmw,26,6,4,23


### Fitting OneHotEncoder categorical column
- `region`

In [12]:
cat_cols = ['region']

cat_data = data_not_cyc[cat_cols].copy()

#get possible values in the nominal columns
levels = [np.sort(cat_data[cols].unique()).tolist() for cols in cat_cols]

In [13]:
#instantiating OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

#fit
encoder = OneHotEncoder(categories=levels)
encoder.fit(cat_data)

#saving encoder
relative_path_to_file = "../encoders/"
file_name = "encoder.pkl"
with open(relative_path_to_file + file_name, "wb") as file:
    pickle.dump(encoder, file)

### Define X and y

In [14]:
y = data_not_cyc['count'].copy()
X = data_not_cyc.drop('count', axis=1).copy()

### Train, test split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

In [16]:
X_train.columns

Index(['temperature_f', 'precipitation_in', 'windspeed_mph', 'special_event',
       'region', 'week_number', 'month', 'weekday', 'hour'],
      dtype='object')

In [17]:
numeric_cols = ['temperature_f', 'precipitation_in','windspeed_mph', 'special_event', 'month', 'weekday', 'hour']

categorical_cols = ['region']

In [18]:
X_train_cat = X_train[categorical_cols]
X_train_num = X_train[numeric_cols]

X_test_cat = X_test[categorical_cols]
X_test_num = X_test[numeric_cols]

### Encode cat cols

In [19]:
#transform
X_train_cat_encoded_np = encoder.transform(X_train_cat).toarray()
X_test_cat_encoded_np = encoder.transform(X_test_cat).toarray()

X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_train_cat.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded_np,  columns=encoder.get_feature_names_out(), index=X_test_cat.index) 


In [None]:
X_train_cat_encoded_df.head()

### Scaling numeric features

In [20]:
### StandardScaler ###
from sklearn.preprocessing import StandardScaler

#fit
scaler = StandardScaler().fit(X_train_num)


#transform
X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

X_train_num_scaled_df = pd.DataFrame(X_train_num_scaled, columns=X_train_num.columns, index=X_train_num.index)
X_test_num_scaled_df = pd.DataFrame(X_test_num_scaled, columns = X_test_num.columns, index=X_test_num.index)

### Concating datasets

In [21]:
X_train_new = pd.concat([X_train_cat_encoded_df, X_train_num_scaled_df], axis=1)
X_test_new = pd.concat([X_test_cat_encoded_df, X_test_num_scaled_df], axis=1)

In [22]:
X_train_new.head()

Unnamed: 0,region_lme,region_lms,region_lmw,temperature_f,precipitation_in,windspeed_mph,special_event,month,weekday,hour
269024,0.0,0.0,1.0,-1.430285,-0.217327,0.225968,-0.245113,-1.096222,-1.000031,-2.035846
157245,0.0,0.0,1.0,1.778431,-0.217327,0.51871,-0.245113,0.122996,-1.000031,-0.056864
268090,0.0,0.0,1.0,-0.861192,-0.217327,1.27984,-0.245113,-1.096222,1.035552,1.097542
113527,1.0,0.0,0.0,-0.746162,-0.217327,0.255242,-0.245113,-0.791418,1.544447,1.097542
196931,1.0,0.0,0.0,-0.20734,0.976738,0.870001,-0.245113,1.037409,-0.491136,-0.716524


### KNN Regressor on non-cyclic dataset

In [23]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn.fit(X_train_new, y_train)

In [24]:
knn.score(X_test_new, y_test)

0.6424324819512734

## Datetime info represented cyclically

In [25]:
data_cyc = select_data.copy()
data_cyc = extract_dt_features(data_cyc, cyclic=True)

In [26]:
data_cyc

Unnamed: 0,temperature_f,precipitation_in,windspeed_mph,special_event,count,region,week_number_sin,week_number_cos,month_sin,month_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
0,78.6,0.0,3.2,0,1,lms,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,0.433884,0.000000,1.000000
1,78.6,0.0,3.2,0,1,lme,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,0.433884,0.000000,1.000000
2,78.6,0.0,3.2,0,1,lmw,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,0.433884,0.000000,1.000000
3,78.6,0.0,3.2,0,1,lme,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,0.433884,0.000000,1.000000
4,78.6,0.0,3.2,0,1,lme,-0.0,-1.0,-5.000000e-01,-0.866025,0.433884,0.433884,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321124,68.8,0.0,4.7,0,9,lmw,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.433884,-0.258819,0.965926
321125,68.8,0.0,4.7,0,1,lmw,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.433884,-0.258819,0.965926
321126,68.8,0.0,4.7,0,4,lmw,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.433884,-0.258819,0.965926
321127,68.8,0.0,4.7,0,1,lmw,-0.0,-1.0,1.224647e-16,-1.000000,-0.433884,-0.433884,-0.258819,0.965926


In [39]:
#data_cyc.to_csv('../data/cleaned/data_region_cyc.csv', index=False)

### Define X and y

In [27]:
y = data_cyc['count'].copy()
X = data_cyc.drop('count', axis=1).copy()

### Train test split

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

In [29]:
X_train.columns

Index(['temperature_f', 'precipitation_in', 'windspeed_mph', 'special_event',
       'region', 'week_number_sin', 'week_number_cos', 'month_sin',
       'month_cos', 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos'],
      dtype='object')

In [30]:
numeric_cols = ['temperature_f', 'precipitation_in', 'windspeed_mph', 'special_event',
       'week_number_sin', 'week_number_cos', 'month_sin',
       'month_cos', 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos']

cat_cols = ['region']

In [31]:
X_train_cat = X_train[cat_cols]
X_train_num = X_train[numeric_cols]

X_test_cat = X_test[cat_cols]
X_test_num = X_test[numeric_cols]

### Encode cat cols

In [32]:
#transform

X_train_cat_encoded_np = encoder.transform(X_train_cat).toarray()
X_test_cat_encoded_np = encoder.transform(X_test_cat).toarray()

X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_train_cat.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded_np,  columns=encoder.get_feature_names_out(), index=X_test_cat.index) 


In [33]:
X_train_cat_encoded_df.head()

Unnamed: 0,region_lme,region_lms,region_lmw
269024,0.0,0.0,1.0
157245,0.0,0.0,1.0
268090,0.0,0.0,1.0
113527,1.0,0.0,0.0
196931,1.0,0.0,0.0


In [34]:
### StandardScaler ###
from sklearn.preprocessing import StandardScaler

#fit
scaler = StandardScaler().fit(X_train_num)

#transform
X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

X_train_num_scaled_df = pd.DataFrame(X_train_num_scaled, columns=X_train_num.columns, index=X_train_num.index)
X_test_num_scaled_df = pd.DataFrame(X_test_num_scaled, columns = X_test_num.columns, index=X_test_num.index)

### Concating datasets

In [35]:
X_train_new = pd.concat([X_train_cat_encoded_df, X_train_num_scaled_df], axis=1)
X_test_new = pd.concat([X_test_cat_encoded_df, X_test_num_scaled_df], axis=1)

In [36]:
X_train_new.head()

Unnamed: 0,region_lme,region_lms,region_lmw,temperature_f,precipitation_in,windspeed_mph,special_event,week_number_sin,week_number_cos,month_sin,month_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
269024,0.0,0.0,1.0,-1.430285,-0.217327,0.225968,-0.245113,1.416093,0.281362,1.463418,0.098574,1.081904,1.081904,0.653723,1.65789
157245,0.0,0.0,1.0,1.778431,-0.217327,0.51871,-0.245113,-0.476102,-1.231191,-0.65693,-1.134208,1.081904,1.081904,-0.106729,-1.153955
268090,0.0,0.0,1.0,-0.861192,-0.217327,1.27984,-0.245113,1.385522,0.451566,1.463418,0.098574,-1.415218,-1.415218,-0.998765,0.979726
113527,1.0,0.0,0.0,-0.746162,-0.217327,0.255242,-0.245113,1.385522,-0.234288,1.274036,-0.613173,-1.140743,-1.140743,-0.998765,0.979726
196931,1.0,0.0,0.0,-0.20734,0.976738,0.870001,-0.245113,-1.372268,0.281362,-1.174331,0.81032,1.356378,1.356378,1.312295,-0.777239


### KNN Regressor on non-cyclic dataset

In [37]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn.fit(X_train_new, y_train)

In [38]:
knn.score(X_test_new, y_test)

0.6428519588830555