In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import seaborn as sns
import pickle
from imblearn.over_sampling import ADASYN
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
import math

In [2]:
df = pd.read_csv('pre_encoding_data.csv')

In [3]:
df = df.drop(['prod_grp','sub_class','brand', 'dosage', 'dosage_form', 'per_pack_qty','line_qty','pack_price', 'unit_price',
              'first_line','actual_del_year', 'delay','actual_del_month', 'actual_del_day','total_cost','ship_mode'], axis=1)

In [4]:
cat_feats = df.dtypes[df.dtypes == 'object'].index.tolist()
num_feats = df.dtypes[~df.dtypes.index.isin(cat_feats)].index.tolist()

#Convert cat_feats to categorical
for col in cat_feats:
    df[col] = df[col].astype('str')

# Removing delay from num_feats as it is the dependant variable
num_feats.remove('freight')

In [5]:
# Create Pipeline
numerical = Pipeline([('standard_scaler', StandardScaler())])
                            
categorical = Pipeline([('encode', OneHotEncoder()),('Dense',FunctionTransformer(csr_matrix.todense))])

preprocess = ColumnTransformer([('categorical', categorical, cat_feats),('numerical', numerical, num_feats)])

In [6]:
X = df.drop(columns=['freight'])
y = df['freight']

In [7]:
#Fit Preprocess to data
X = preprocess.fit_transform(X)

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=50)
X_train.shape

(7473, 367)

In [9]:
inputs = tf.keras.Input(shape=(X_train.shape[1],))
hide1 = tf.keras.layers.Dense(256, activation='relu')(inputs)
hide2 = tf.keras.layers.Dense(256, activation='relu')(hide1)
hide3 = tf.keras.layers.Dense(256, activation='relu')(hide2)
hide4 = tf.keras.layers.Dense(128, activation='relu')(hide3)
hide5 = tf.keras.layers.Dense(64, activation='relu')(hide4)
outputs = tf.keras.layers.Dense(1, activation='linear')(hide5)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mean_absolute_error'])

history = model.fit(X_train,y_train,validation_split=0.2,batch_size=32,epochs=200)
    #callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [10]:
# Make predictions and calculate the scores
y_pred = (model.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

60428280.85237518
7773.562944517474
3374.030014731784
0.7993324451480163


## XGBRegressor

In [12]:
from xgboost import XGBRegressor
reg = XGBRegressor(n_estimators=1000, max_depth=5, eta=0.1, subsample=0.7, colsample_bytree=0.8)
f = reg.fit(X_train, y_train)

In [13]:
y_pred = (f.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

49545776.58012623
7038.8760878514
3249.9926088519524
0.8354705826587183


## Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
params = { 'max_depth': [5,8,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [500,700, 1000],
           'colsample_bytree': [0.3, 0.5]}
reg = GridSearchCV(estimator=XGBRegressor(),
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   verbose=1)
reg.fit(X, y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_esti...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=N

In [16]:
print("Best parameters:", reg.best_params_)
print("Lowest RMSE: ", (-reg.best_score_)**(1/2))

Best parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1000}
Lowest RMSE:  7810.284691283787


In [17]:
best_model = reg.best_estimator_

In [18]:
y_pred = (best_model.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

919930.304103604
959.129972476934
495.755567621859
0.9969451362482132


In [19]:
# Save the best model
pickle.dump(best_model, open('best_freight_model.pkl', 'wb'))