In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
import time

weather = pd.read_csv('data/weather.csv')
pallet_history = pd.read_csv('data/Pallet_history_Gold_Spike.csv')
inbound = pd.read_csv('data/inbound_loads.csv')
outbound = pd.read_csv('data/outbound_laods.csv')
demand = pd.read_csv('data/demand_kWtrain_val.csv')

# Preprocessing the csv files

In [4]:
#################
# Preprocess inbound csv:
#################
print('\n #####\nInbound: \n ##### \n')
inbound_post = inbound[inbound.carrier_code != 'CANCEL']
inbound_post = inbound[inbound.carrier_code != '']

inbound_post['truck_signin_datetime'] = pd.to_datetime(inbound_post['truck_signin_datetime'])

# Compute delta times
inbound_load_time = pd.to_datetime(inbound_post['load_finish_datetime']) - pd.to_datetime(inbound_post['load_start_datetime'])
inbound_truck_time = pd.to_datetime(inbound_post['truck_signin_datetime']) - pd.to_datetime(inbound_post['signout_datetime'])

# Drop unnecessary columns
inbound_post = inbound_post.drop(['Unnamed: 0', 'warehouse_order_number', 'customer_code', 'load_reference_number', 'carrier_code', 'weight_uom', 'load_finish_datetime', 'load_start_datetime', 'dock_door_number', 'trailer_number', 'signout_datetime'], axis=1)

# Add time deltas
inbound_post['load_time'] = inbound_load_time
inbound_post['truck_time'] = inbound_truck_time

print(inbound_post.columns)

inbound_post['load_time'] = inbound_post['load_time'].dt.seconds
inbound_post['truck_time'] = inbound_post['truck_time'].dt.seconds

print('With NaN:', inbound_post.shape)

# Drop rows with >0 NaN values
inbound_post_nan = inbound_post.dropna().reset_index(drop=True)

print('Without NaN:', inbound_post_nan.shape)

#################
# Preprocess outbound csv:
#################
print('\n #####\nOutbound: \n ##### \n')
outbound_post = outbound[outbound.carrier_code != 'CANCEL']
outbound_post = outbound[outbound.carrier_code != 'VOID']
outbound_post = outbound[outbound.carrier_code != '']

outbound_post['truck_signin_datetime'] = pd.to_datetime(outbound_post['truck_signin_datetime'])

# Compute delta times
outbound_load_time = pd.to_datetime(outbound_post['load_finish_datetime']) - pd.to_datetime(outbound_post['load_start_datetime'])
outbound_truck_time = pd.to_datetime(outbound_post['truck_signin_datetime']) - pd.to_datetime(outbound_post['signout_datetime'])

# Drop unnecessary columns
outbound_post = outbound_post.drop(['Unnamed: 0', 'warehouse_order_number', 'customer_code', 'load_reference_number', 'carrier_code', 'weight_uom', 'load_finish_datetime', 'load_start_datetime', 'dock_door_number', 'trailer_number', 'signout_datetime'], axis=1)

# Add time deltas
outbound_post['load_time'] = outbound_load_time
outbound_post['truck_time'] = outbound_truck_time

print(outbound_post.columns)

outbound_post['load_time'] = outbound_post['load_time'].dt.seconds
outbound_post['truck_time'] = outbound_post['truck_time'].dt.seconds

print('With NaN:', outbound_post.shape)

# Drop rows with >0 NaN values
outbound_post_nan = outbound_post.dropna().reset_index(drop=True)

print('Without NaN:', outbound_post_nan.shape)

#################
# Preprocess demand csv:
#################
print('\n #####\nDemand:  \n ##### \n')

demand['datetime_local'] = pd.to_datetime(demand['datetime_local'])

end_known_idx = demand[demand.demand_kW > 1].index[-1]
train_val_split = 0.7 # 70% train, 30% val
end_train_idx = int((train_val_split) * end_known_idx)
demand_train = demand[0:end_train_idx-1]
demand_val = demand[end_train_idx:end_known_idx]

print('Full dataset:', demand.shape)
print('Answers known until index: ', end_known_idx)
print(f'Training set, {int(train_val_split*100)}%:', demand_train.shape)
print(f'Validation set, {int(100-train_val_split*100)}%:', demand_val.shape)

#################
# Preprocess weather csv:
#################
print('\n #####\nWeather:  \n ##### \n')

weather_post = weather.drop('datetime_UTC', axis=1)
weather_post['datetime'] = pd.to_datetime(weather_post['datetime'])

print(weather_post.columns)
print(weather_post.shape)



 #####
Inbound: 
 ##### 

Index(['front_temperature', 'middle_temperature', 'back_temperature',
       'net_weight', 'case_quantity', 'pallet_count', 'truck_signin_datetime',
       'load_time', 'truck_time'],
      dtype='object')
With NaN: (56146, 9)
Without NaN: (8761, 9)

 #####
Outbound: 
 ##### 

Index(['net_weight', 'case_quantity', 'pallet_count', 'truck_signin_datetime',
       'load_time', 'truck_time'],
      dtype='object')
With NaN: (112363, 6)
Without NaN: (96704, 6)

 #####
Demand:  
 ##### 



  demand['datetime_local'] = pd.to_datetime(demand['datetime_local'])


Full dataset: (365349, 3)
Answers known until index:  273987
Training set, 70%: (191789, 3)
Validation set, 30%: (82197, 3)

 #####
Weather:  
 ##### 

Index(['Unnamed: 0', 'datetime', 'Relative Humidity', 'Temperature', 'hour'], dtype='object')
(328242, 5)


# Merging inbound with demand

In [5]:
inbound_post_nan.sort_values("truck_signin_datetime", inplace=True)
demand_inbound_merge = pd.merge_asof(demand_train, inbound_post_nan, left_on='datetime_local', right_on='truck_signin_datetime', direction='nearest')

demand_inbound_merge_numerical = demand_inbound_merge.copy()
demand_inbound_merge_numerical['datetime_local'] = demand_inbound_merge_numerical['datetime_local'].apply(lambda x: time.mktime(x.timetuple()))
demand_inbound_merge_numerical['truck_signin_datetime'] = demand_inbound_merge_numerical['truck_signin_datetime'].apply(lambda x: time.mktime(x.timetuple()))

print(demand_inbound_merge_numerical.tail())

        Unnamed: 0  datetime_local  demand_kW  front_temperature   
191784      191784    1.628928e+09   2561.692               41.0  \
191785      191785    1.628928e+09   2864.648               41.0   
191786      191786    1.628928e+09   2820.785               41.0   
191787      191787    1.628928e+09   2817.942               41.0   
191788      191788    1.628928e+09   2699.152               41.0   

        middle_temperature  back_temperature  net_weight  case_quantity   
191784                42.0              42.0     41198.0         1642.0  \
191785                42.0              42.0     41198.0         1642.0   
191786                42.0              42.0     41198.0         1642.0   
191787                42.0              42.0     41198.0         1642.0   
191788                42.0              42.0     41198.0         1642.0   

        pallet_count  truck_signin_datetime  load_time  truck_time  
191784          28.0           1.562101e+09     1020.0     86397.0  
19

## Regression on train set

In [10]:
# Split the data into training and test sets
X = demand_inbound_merge_numerical.drop('demand_kW', axis=1)
y = demand_inbound_merge_numerical['demand_kW']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the pipeline
pipeline = Pipeline([
    # ('imputer', SimpleImputer(strategy='mean')), # Removing gives better performance for Adaboostregressor
    ('scaler', StandardScaler()),
    ('regressor', AdaBoostRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the performance of the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)


Mean Squared Error: 117325.89532074482


# Merging outbound with demand

In [11]:
outbound_post_nan.sort_values("truck_signin_datetime", inplace=True)
demand_outbound_merge = pd.merge_asof(demand_train, outbound_post_nan, left_on='datetime_local', right_on='truck_signin_datetime', direction='nearest')

demand_outbound_merge_numerical = demand_outbound_merge.copy()
demand_outbound_merge_numerical['datetime_local'] = demand_outbound_merge_numerical['datetime_local'].apply(lambda x: time.mktime(x.timetuple()))
demand_outbound_merge_numerical['truck_signin_datetime'] = demand_outbound_merge_numerical['truck_signin_datetime'].apply(lambda x: time.mktime(x.timetuple()))

print(demand_outbound_merge_numerical.tail())

        Unnamed: 0  datetime_local  demand_kW  net_weight  case_quantity   
191784      191784    1.628928e+09   2561.692      9595.0          497.0  \
191785      191785    1.628928e+09   2864.648      9595.0          497.0   
191786      191786    1.628928e+09   2820.785      9595.0          497.0   
191787      191787    1.628928e+09   2817.942      9595.0          497.0   
191788      191788    1.628928e+09   2699.152      9595.0          497.0   

        pallet_count  truck_signin_datetime  load_time  truck_time  
191784           7.0           1.628928e+09     4081.0     71137.0  
191785           7.0           1.628928e+09     4081.0     71137.0  
191786           7.0           1.628928e+09     4081.0     71137.0  
191787           7.0           1.628928e+09     4081.0     71137.0  
191788           7.0           1.628928e+09     4081.0     71137.0  


## Regression on train set

In [12]:
# Split the data into training and test sets
X = demand_outbound_merge_numerical.drop('demand_kW', axis=1)
y = demand_outbound_merge_numerical['demand_kW']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the pipeline
pipeline = Pipeline([
    # ('imputer', SimpleImputer(strategy='mean')), # Removing gives better performance for Adaboostregressor
    ('scaler', StandardScaler()),
    ('regressor', AdaBoostRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the performance of the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)


Mean Squared Error: 120481.46790819016
