In [29]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import seaborn as sns
import pickle
from imblearn.over_sampling import ADASYN
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
import math

In [2]:
df = pd.read_csv('pre_encoding_data.csv')

In [3]:
df = df.drop(['prod_grp','sub_class','brand', 'dosage', 'dosage_form', 'per_pack_qty','line_qty','pack_price', 'unit_price',
              'first_line','actual_del_year', 'actual_del_month', 'actual_del_day','freight','total_cost','ship_mode'], axis=1)

In [4]:
cat_feats = df.dtypes[df.dtypes == 'object'].index.tolist()
num_feats = df.dtypes[~df.dtypes.index.isin(cat_feats)].index.tolist()

#Convert cat_feats to categorical
for col in cat_feats:
    df[col] = df[col].astype('str')

# Removing delay from num_feats as it is the dependant variable
num_feats.remove('delay')

In [5]:
# Create Pipeline
numerical = Pipeline([('standard_scaler', StandardScaler())])
                            
categorical = Pipeline([('encode', OneHotEncoder()),('Dense',FunctionTransformer(csr_matrix.todense))])

preprocess = ColumnTransformer([('categorical', categorical, cat_feats),('numerical', numerical, num_feats)])

In [6]:
X = df.drop(columns=['delay'])
y = df['delay']

In [7]:
#Fit Preprocess to data
X = preprocess.fit_transform(X)

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=50)
X_train.shape

(7473, 367)

In [14]:
inputs = tf.keras.Input(shape=(X_train.shape[1],))
hide1 = tf.keras.layers.Dense(256, activation='relu')(inputs)
hide2 = tf.keras.layers.Dense(256, activation='relu')(hide1)
hide3 = tf.keras.layers.Dense(256, activation='relu')(hide2)
hide4 = tf.keras.layers.Dense(128, activation='relu')(hide3)
hide5 = tf.keras.layers.Dense(64, activation='relu')(hide4)
outputs = tf.keras.layers.Dense(1, activation='linear')(hide5)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mean_absolute_error'])

history = model.fit(X_train,y_train,validation_split=0.2,batch_size=32,epochs=200)
    #callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

KeyboardInterrupt: 

In [31]:
# Make predictions and calculate the scores
y_pred = (model.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

301.80681139623914
17.372587930306732
6.918023560203639
0.5683944345747445


## XGBRegressor

In [35]:
from xgboost import XGBRegressor
reg = XGBRegressor(n_estimators=1000, max_depth=5, eta=0.1, subsample=0.7, colsample_bytree=0.8)
f = reg.fit(X_train, y_train)

In [36]:
y_pred = (f.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

249.51548232633579
15.796059075805452
7.963607251204364
0.6431748165868085


## Hyperparameter Tuning

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7, 0.8]}
reg = GridSearchCV(estimator=XGBRegressor(),
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   verbose=1)
reg.fit(X, y)
print("Best parameters:", reg.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters: {'colsample_bytree': 0.3, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 1000}


NameError: name 'clf' is not defined

In [39]:
print("Lowest RMSE: ", (-reg.best_score_)**(1/2))

Lowest RMSE:  22.757760684383033


In [41]:
best_model = reg.best_estimator_

In [42]:
y_pred = (best_model.predict(X_test))

print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test, y_pred))

6.577379538782479
2.564640235741161
1.3438684912665078
0.9905938716170138


In [43]:
# Save the best model
pickle.dump(best_model, open('best_delay_model.pkl', 'wb'))