In [None]:
# %pip install s3fs seaborn xgboost pyarrow s3fs boto3==1.26.37 awswrangler --upgrade
%pip install xgboost boto3 awswrangler seaborn --upgrade

## Downloading the data

In [None]:
import pandas as pd
import numpy as np
import boto3

# Used to import a limited number of rows from Parquet files
import awswrangler as wr

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib

# Make 10x10 the default size for every matplotlib figure.
matplotlib.rcParams['figure.figsize'] = (10, 10)


#########################################
### SELECT INPUT
#########################################

Change dataset if desired

In [None]:
%%time

# Using aws wrangler
train_s3_path = "s3://dsoaws/nyc-taxi-orig-cleaned-dropped-parquet-2019/train/"
validation_s3_path = "s3://dsoaws/nyc-taxi-orig-cleaned-dropped-parquet-2019/validation/"

train_df = wr.s3.read_parquet(path=train_s3_path, dataset=True)
validation_df = wr.s3.read_parquet(path=validation_s3_path, dataset=True)

validation_df


In [None]:
# Display the training data loaded from S3.
train_df

In [None]:
%%time

# Print the info() summary of the training frame; %%time reports how long it takes.
train_df.info()

### Identifying input and output columns

In [None]:
%%time

train_df.corr()

In [None]:
# input_col = ['vendor_id','pickup_at','dropoff_at', 'passenger_count','store_and_fwd_flag']
# Feature columns fed to the model.
# NOTE(review): the fare/extra/tip/tolls columns look like components of
# total_amount — confirm that predicting the total from its parts is intended.
input_col = [
    'fare_amount',
    'extra',
    'tip_amount',
    'tolls_amount',
    'passenger_count',
    'trip_distance',
]
# Column the model is trained to predict.
target_col = 'total_amount'

In [None]:
# Independent copies of the feature columns and the target column, so that
# later edits to the inputs do not touch train_df.
train_inputs = train_df.loc[:, input_col].copy()
train_targets = train_df.loc[:, target_col].copy()

In [None]:
# Same feature/target split for the validation frame, again as copies.
val_inputs = validation_df.loc[:, input_col].copy()
val_targets = validation_df.loc[:, target_col].copy()

In [None]:
# Partition the feature columns by dtype: numeric columns vs. object columns.
numeric_col = (
    train_inputs
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)
cate_col = (
    train_inputs
    .select_dtypes(include='object')
    .columns
    .tolist()
)

In [None]:
# Display the numeric feature columns of the training inputs.
train_inputs[numeric_col]

In [None]:
# Display the object-dtype (categorical) feature columns of the training inputs.
# NOTE(review): every name in input_col looks numeric, so this selection may be empty — confirm.
train_inputs[cate_col]

### Imputing missing values


Imputation is a technique that fills in missing numeric values based on some strategy (for example, the column mean). If needed, we can use SimpleImputer from the sklearn module.

Reference:https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [None]:
%%time

train_inputs[numeric_col].isna().sum()

In [None]:
%%time

val_inputs[numeric_col].isna().sum()

It seems like there are no missing values in the train and validation datasets.

### Scaling numeric values

Now let's scale the numeric values to the range (0, 1).

Reference:https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [None]:
# Summary statistics of the numeric training features.
train_inputs[numeric_col].describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler created with its default settings; it is not fitted in this cell.
scaler = MinMaxScaler()

In [None]:
%%time

scaler.fit(train_inputs[numeric_col])

In [None]:
%%time

train_inputs[numeric_col] = scaler.transform(train_inputs[numeric_col])
val_inputs[numeric_col]  = scaler.transform(val_inputs[numeric_col])

In [None]:
# Summary statistics of the numeric training features, shown again after the scaling cell.
train_inputs[numeric_col].describe()

### Encoding categorical columns

Now we are going to encode the categorical columns with a one-hot encoder, which turns them into a one-hot numeric array.
Reference:https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot output; categories not seen during fit are ignored at transform time.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
%%time

# Fit the one-hot encoder on the categorical columns of the training inputs.
# NOTE(review): every name in input_col looks numeric, so cate_col may be empty here — confirm.
encoder.fit(train_inputs[cate_col])

In [None]:
# Names of the one-hot columns produced by the fitted encoder.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use the new method whenever it exists.
if hasattr(encoder, 'get_feature_names_out'):
    enc_col = encoder.get_feature_names_out(cate_col).tolist()
else:
    enc_col = encoder.get_feature_names(cate_col).tolist()
enc_col

In [None]:
%%time

# Store the one-hot encoded categorical columns under the enc_col names in
# both input frames, using the encoder fitted on the training inputs.
train_inputs[enc_col] = encoder.transform(train_inputs[cate_col])
val_inputs[enc_col] = encoder.transform(val_inputs[cate_col])

In [None]:
# Display the one-hot encoded columns of the training inputs.
train_inputs[enc_col]

In [None]:
# Display the validation inputs in their current (processed) state.
val_inputs

In [None]:
%%time

x_train = train_inputs[numeric_col + enc_col]
x_val = val_inputs[numeric_col+enc_col]


## Model

## XGBoost

XGBoost is a powerful approach for solving regression problems. Since we are dealing with a regression problem, we are now going to use an XGBoost model.
Reference:https://xgboost.readthedocs.io/en/latest/

In [None]:
import xgboost

# Record the installed xgboost version in the notebook output.
print(xgboost.__version__)

In [None]:
from xgboost import XGBRegressor

# Regressor with library-default hyperparameters; only the parallelism
# setting (n_jobs=-1) and the random seed are set explicitly.
model = XGBRegressor(
    n_jobs=-1,
    random_state=42,
)

In [None]:
%%time

# Train the regressor on the prepared training features and targets.
model.fit(x_train,train_targets)

In [None]:
%%time

# Predict on the training features; the predictions are shown as the cell output.
model.predict(x_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Training RMSE. The `squared=False` flag was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root of the MSE instead. The arguments
# are also passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(train_targets, model.predict(x_train)))

In [None]:
# Validation RMSE. The `squared=False` flag was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root of the MSE instead. The
# arguments are also passed in the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(val_targets, model.predict(x_val)))

## Hyperparameter tuning (Optional)

Now that we have created a model, the next step is to tune its hyperparameters so that we can reduce the loss.
Refer:https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

### max_depth

In [None]:
# def test_param(**params):
#   model = XGBRegressor(n_jobs=-1,random_state=42,**params)
#   model.fit(x_train,train_targets)
#   train_rmse = mean_squared_error(model.predict(x_train),train_targets,squared=False)
#   val_rmse = mean_squared_error(model.predict(x_val),val_targets,squared=False)
#   print(train_rmse," ",val_rmse)
#   return train_rmse,val_rmse

In [None]:
# def test_param_and_plot(param_name, param_values):
#     train_errors, val_errors = [], [] 
#     for value in param_values:
#         params = {param_name: value}
#         train_rmse, val_rmse = test_param(**params)
#         train_errors.append(train_rmse)
#         val_errors.append(val_rmse)
#     plt.figure(figsize=(10,6))
#     plt.title('Overfitting curve: ' + param_name)
#     plt.plot(param_values, train_errors, 'b-o')
#     plt.plot(param_values, val_errors, 'r-o')
#     plt.xlabel(param_name)
#     plt.ylabel('RMSE')
#     plt.legend(['Training', 'Validation'])

In [None]:
# test_param(max_depth=2)

In [None]:
# test_param(max_depth=5)

In [None]:
# test_param(max_depth=10)

In [None]:
# test_param_and_plot('max_depth',[x for x in range(0,60,10)])

### n_estimators

In [None]:
# test_param(n_estimators=10)

In [None]:
# test_param(n_estimators=20)

In [None]:
# test_param(n_estimators=30)

In [None]:
# test_param(n_estimators=40)

In [None]:
# test_param(n_estimators=50)

In [None]:
# test_param(n_estimators=50,max_depth=30)

In [None]:
# test_param(n_estimators=60,max_depth=30)

In [None]:
# test_param_and_plot('n_estimators',[x for x in range(100,1100,100)])

### learning_rate

In [None]:
# test_param(learning_rate=0.01)

In [None]:
# test_param(learning_rate=0.1)

In [None]:
# test_param(learning_rate=0.3)

In [None]:
# test_param(learning_rate=0.5)

In [None]:
# test_param(learning_rate=0.9)

In [None]:
# test_param(learning_rate=1)

In [None]:
# test_param(learning_rate=1,max_depth=60,n_estimators=60)

In [None]:
# test_param(learning_rate=1,max_depth=60,n_estimators=100)

### booster

In [None]:
# test_param(booster='gblinear')

In [None]:
# test_param(booster='gblinear', max_depth=60, n_estimators=100, learning_rate=1)

In [None]:
# test_param(booster='gbtree',max_depth=60,n_estimators=100,learning_rate=1)

## Putting it together

In [None]:
# model = XGBRegressor(n_jobs=-1,random_state=42,max_depth=60,n_estimators=100,learning_rate=0.9,booster='gbtree')

In [None]:
# model.fit(x_train,train_targets)

In [None]:
# mean_squared_error(model.predict(x_train),train_targets,squared=False)

In [None]:
# mean_squared_error(model.predict(x_val),val_targets,squared=False)

In [None]:
# mean_squared_error(model.predict(x_test),test_targets,squared=False)