# Optimizing - KNN, Random Forest, Gradient Boost 

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

In [14]:
# Load the project configuration (it supplies the data and encoder paths used
# by the cells below) from the YAML file one level above this notebook.
try:
    with open("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    # Report the actual cause, then re-raise: every later cell reads `config`,
    # so swallowing the error here would only resurface as a NameError below.
    print(f'Error reading config file: {e}')
    raise

In [7]:
# The CSV location is looked up in the project config rather than hard-coded.
data_path = config['data']['data_region_cyc']
data = pd.read_csv(data_path)

In [8]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,temperature_f,precipitation_in,windspeed_mph,special_event,count,region,week_number_sin,week_number_cos,month_sin,month_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
0,78.6,0.0,3.2,0,1,lms,-3.216245e-16,-1.0,-0.5,-0.866025,0.433884,0.433884,0.0,1.0
1,78.6,0.0,3.2,0,1,lme,-3.216245e-16,-1.0,-0.5,-0.866025,0.433884,0.433884,0.0,1.0
2,78.6,0.0,3.2,0,1,lmw,-3.216245e-16,-1.0,-0.5,-0.866025,0.433884,0.433884,0.0,1.0
3,78.6,0.0,3.2,0,1,lme,-3.216245e-16,-1.0,-0.5,-0.866025,0.433884,0.433884,0.0,1.0
4,78.6,0.0,3.2,0,1,lme,-3.216245e-16,-1.0,-0.5,-0.866025,0.433884,0.433884,0.0,1.0


In [9]:
# Column dtypes and non-null counts: confirms there are no missing values and
# that `region` is the only non-numeric column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321129 entries, 0 to 321128
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   temperature_f     321129 non-null  float64
 1   precipitation_in  321129 non-null  float64
 2   windspeed_mph     321129 non-null  float64
 3   special_event     321129 non-null  int64  
 4   count             321129 non-null  int64  
 5   region            321129 non-null  object 
 6   week_number_sin   321129 non-null  float64
 7   week_number_cos   321129 non-null  float64
 8   month_sin         321129 non-null  float64
 9   month_cos         321129 non-null  float64
 10  weekday_sin       321129 non-null  float64
 11  weekday_cos       321129 non-null  float64
 12  hour_sin          321129 non-null  float64
 13  hour_cos          321129 non-null  float64
dtypes: float64(11), int64(2), object(1)
memory usage: 34.3+ MB


## Define X and y

In [10]:
# Features are every column except the target; the target is `count`.
X = data.drop(columns='count')
y = data['count']

## Test and Train split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=31,
)

In [17]:
# Separate the categorical (object) columns from the numeric ones in both
# splits, since the two groups are preprocessed differently.
X_train_cat, X_test_cat = (
    frame.select_dtypes(include='object') for frame in (X_train, X_test)
)
X_train_num, X_test_num = (
    frame.select_dtypes(include=np.number) for frame in (X_train, X_test)
)

## Encode Categorical column

In [15]:
# Load the pickled encoder whose path is stored in the config.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
encoder_path = config['encoders']['encoder']
with open(encoder_path, "rb") as file:
    encoder = pickle.load(file)

In [18]:
# Encode the categorical columns with the loaded encoder. `.toarray()` turns
# the encoder output into a dense array so it can be wrapped in a DataFrame
# that keeps the original row index.
encoded_columns = encoder.get_feature_names_out()

X_train_cat_encoded_np = encoder.transform(X_train_cat).toarray()
X_train_cat_encoded_df = pd.DataFrame(
    X_train_cat_encoded_np,
    columns=encoded_columns,
    index=X_train_cat.index,
)

X_test_cat_encoded_np = encoder.transform(X_test_cat).toarray()
X_test_cat_encoded_df = pd.DataFrame(
    X_test_cat_encoded_np,
    columns=encoded_columns,
    index=X_test_cat.index,
)

## Scale Numeric columns
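Scaling is applied only to the KNN inputs, because KNN is distance-based; the tree-based models below use the unscaled numeric columns.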

In [22]:
### StandardScaler ###
from sklearn.preprocessing import StandardScaler

# Fit on the training split only; the test split is transformed with these
# same statistics in the next cell.
scaler = StandardScaler().fit(X_train_num)

# Persist the fitted scaler so it can be reused outside this notebook.
relative_path_to_file = "../scalers/"
file_name = "scaler.pkl"

# Create the output folder first: open(..., "wb") raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
scaler_dir = Path(relative_path_to_file)
scaler_dir.mkdir(parents=True, exist_ok=True)
with open(scaler_dir / file_name, "wb") as file:
    pickle.dump(scaler, file)


In [23]:
# Apply the scaler fitted on the training data to both splits, then wrap the
# resulting arrays in DataFrames that keep the original column names and index.
X_train_num_scaled = scaler.transform(X_train_num)
X_train_num_scaled_df = pd.DataFrame(
    X_train_num_scaled,
    columns=X_train_num.columns,
    index=X_train_num.index,
)

X_test_num_scaled = scaler.transform(X_test_num)
X_test_num_scaled_df = pd.DataFrame(
    X_test_num_scaled,
    columns=X_test_num.columns,
    index=X_test_num.index,
)

## Concat for KNN

In [24]:
# KNN feature matrices: encoded categoricals next to the *scaled* numerics,
# aligned on the row index.
X_train_new_knn = pd.concat(
    [X_train_cat_encoded_df, X_train_num_scaled_df],
    axis="columns",
)
X_test_new_knn = pd.concat(
    [X_test_cat_encoded_df, X_test_num_scaled_df],
    axis="columns",
)

## Concat for Tree-based

In [25]:
# Tree-model feature matrices: encoded categoricals next to the *unscaled*
# numerics, aligned on the row index.
X_train_new_tree = pd.concat(
    [X_train_cat_encoded_df, X_train_num],
    axis="columns",
)
X_test_new_tree = pd.concat(
    [X_test_cat_encoded_df, X_test_num],
    axis="columns",
)

# KNN optimization
Finding the optimal hyperparameters with `GridSearchCV`.

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

# Candidate settings: neighbourhood size and how the neighbours are weighted.
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
}

# Exhaustive search with 5-fold cross-validation; train scores are kept too.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

# KNN is fitted on the scaled feature matrix.
grid_search.fit(X_train_new_knn, y_train)
grid_search.best_score_

0.6637226180915506

In [29]:
# Parameter combination selected by the grid search fitted in the previous cell.
grid_search.best_params_

{'n_neighbors': 7, 'weights': 'uniform'}

In [31]:
# Save the whole fitted KNN GridSearchCV object (not only its best params).
relative_path_to_file = "../gridsearch/"
file_name = "knn_gridsearch.pkl"

# Create the output folder first: open(..., "wb") raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
gridsearch_dir = Path(relative_path_to_file)
gridsearch_dir.mkdir(parents=True, exist_ok=True)
with open(gridsearch_dir / file_name, "wb") as file:
    pickle.dump(grid_search, file)

# Random Forest optimization

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()

# Candidate settings for the forest. `random_state` is a single-value entry, so
# it only pins the seed and does not widen the search.
param_grid = {
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [1.0, 'sqrt'],
    'random_state': [5],
}

# Exhaustive search with 5-fold cross-validation; train scores are kept too.
# NOTE: this rebinds `grid_search`, replacing the KNN search object.
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

# Tree models are fitted on the unscaled feature matrix.
grid_search.fit(X_train_new_tree, y_train)
grid_search.best_score_

0.6907056591896203

In [34]:
# Parameter combination selected by the Random Forest grid search fitted in the
# previous cell.
grid_search.best_params_

{'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'random_state': 5}