Import necessary libraries to continue

In [10]:
import pandas as pd
import os
import sklearn
from sklearn import linear_model
from sklearn import metrics
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
from sklearn import ensemble
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning) #ignore deprecation warnings

Data Analysis - Loading CSV and Looking into its contents for what might be useful

In [2]:
store = pd.read_csv('/Sample - Superstore.csv') #read in the csv using pandas
store.head() #show a snippet of the dataset

Unnamed: 0.1,Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,0,1.0,CA-2016-152156,11/8/2016,11/11/2016,,CG-12520,Claire Gute,Consumer,United States,...,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,,2.0,0.0,41.9136
1,1,,,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,...,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582
2,2,3.0,,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,...,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,0.0,6.8714
3,3,4.0,US-2015-108966,10/11/2015,10/18/2015,Standard Class,,Sean O'Donnell,Consumer,United States,...,33311.0,South,,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0,,-383.031
4,4,5.0,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,,Consumer,United States,...,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2.0,0.2,2.5164


Data Clean Up - Removing all NaN, Integer Encoding, Train/Testing Split

In [3]:
#there are too many NaN for a model to work. Also, need to give an integer representation for the categories, region, and shipping mode

store['Discount'] = store['Discount'].fillna(0) #fill in all the NaN for the discount column
store["New Ship Mode"] = store['Ship Mode'].astype('category').map({'Same Day': 0, 'First Class': 1, 'Standard Class': 2, 'Same Day': 3}) #integer representation for shipping mode
store["New Category"] = store['Category'].astype('category').map({'Furniture': 0, 'Office Supplies': 1, 'Technology': 2})  #integer representation for product category
store["New Region"] = store['Region'].astype('category').map({'South': 0, 'West': 1, 'East': 2, 'Central': 3})  #integer representation for region
store = store.dropna(subset=['Profit', 'Ship Mode', 'Sales', 'Category', 'Region', 'Quantity', 'New Ship Mode']) #drop the rows that have a NaN value in these columns (otherwise model won't run)


In [4]:
print(store['Profit'].max()-store['Profit'].min()) # for personal curiosity, the range of what we're trying to predict is $11,546.348

11546.348


In [16]:
X = ['New Region', 'New Category', 'New Ship Mode', 'Discount', 'Quantity'] #the columns we will use as our inputs into the model
y = ['Profit'] #what our model is trying to predict accurately
train, test = train_test_split(store, test_size = 0.2, random_state = 1) # split the dataset into a 80/20 training vs testing split
X_train = train[X] #model training inputs
y_train = train[y] #model training outputs
X_test = test[X] #model testing inputs
y_test = test[y] #actual results (what model is trying to replicate)

Actual Models & Prediction

In [17]:
#The process on this was to create a list of pre-built sklearn models then loop through each and print their Mean Absolute Error
classifiers = [sklearn.ensemble.GradientBoostingRegressor(learning_rate = .1, random_state=None, verbose=0), #started with a Gradient Boosting Regressor (changed some parameters)
               linear_model.Lasso(alpha=0.1), # next went to a Lasso model with an alpha value of .1
               linear_model.LinearRegression(), # standard linear regression model
               linear_model.BayesianRidge() # finished off with a Bayesian Ridge
    ]

for classifier in classifiers: #loop through each classifier
    classifier.fit(X_train, y_train) # fit each to the model
    predicted_prices = classifier.predict(X_test) #predict outputs
    MAE = mean_absolute_error(y_test , predicted_prices) #determine the mean absolute error between the predictions and the expected
    print(classifier.__class__.__name__ + ' Mean Absolute Error:     '+ str(MAE))  #print out the model name and the MAE before moving onto the next

GradientBoostingRegressor Mean Absolute Error:     60.87571352713761
Lasso Mean Absolute Error:     67.57265634159059
LinearRegression Mean Absolute Error:     67.84208537804003
BayesianRidge Mean Absolute Error:     67.11182597723823


In [13]:
# creating a more robust (more nodes) and trained model
NN_model = Sequential() #defined a sequential model

# The Input Layer :
NN_model.add(Dense(256, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu')) #flatten input data and also initialize the first layer to have 256 nodes
# The Hidden Layers :
NN_model.add(Dense(128, kernel_initializer='normal',activation='softmax')) #softmax activation, 128 nodes
NN_model.add(Dense(156, kernel_initializer='normal',activation='relu')) #reLu activation with 156 nodes
NN_model.add(Dense(96, kernel_initializer='normal',activation='relu')) # reLu activation with 96 nodes

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear')) #only outputing the profit prediction (1 node)

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error']) #testing using mean absolute error (as its numerical not categorical)
NN_model.summary() # calls a summary for the model creation
checkpoint_name = 'WeightsNN2-{epoch:03d}--{val_loss:.5f}.hdf5' # will save the weights everytime the validation error is lower than the current lowest value
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto') # calls above method
callbacks_list = [checkpoint]
NN_model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split = 0.2, callbacks=callbacks_list) #fits the model to the data and calls the checkpoints. batch size 32, overkill on 500 epochs, just did to test if it would ever improve and when it levels out (around 50)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 256)               1536      
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 156)               20124     
                                                                 
 dense_8 (Dense)             (None, 96)                15072     
                                                                 
 dense_9 (Dense)             (None, 1)                 97        
                                                                 
Total params: 69725 (272.36 KB)
Trainable params: 69725 (272.36 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
Epoc

<keras.src.callbacks.History at 0x7f20b1f44430>

In [20]:
NN_model = Sequential() #create a new Sequence model to handle discrete data

# The Input Layer :
NN_model.add(Dense(64, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu')) #64 node input layer
# The Hidden Layers :
NN_model.add(Dense(128, kernel_initializer='normal',activation='relu')) #128 node hidden layer 1
NN_model.add(Dense(32, kernel_initializer='normal',activation='softmax')) #32 node hidden layer 2
NN_model.add(Dense(16, kernel_initializer='normal',activation='relu')) # 16 node hidden layer 3

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear')) #output into a 1 node prediction

# Compile the network (same process as above just with less epochs and a different name to save weights as):
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()
checkpoint_name = 'WeightsNN1-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]
NN_model.fit(store[X], store[y], epochs=50, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 64)                384       
                                                                 
 dense_11 (Dense)            (None, 128)               8320      
                                                                 
 dense_12 (Dense)            (None, 32)                4128      
                                                                 
 dense_13 (Dense)            (None, 16)                528       
                                                                 
 dense_14 (Dense)            (None, 1)                 17        
                                                                 
Total params: 13377 (52.25 KB)
Trainable params: 13377 (52.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 1

<keras.src.callbacks.History at 0x7f20ac0f74f0>

**Summary**
-------------
After analysis, it is clear that the fist neural network built performed the best out of all the models. The first 4 prebuilt classifiers all had MAE's between 60 and 68. While still respectable, it was not nearly the same level as the neural networks. The second neural network also performed well with an MAE of 49. But the first neural network, with an MAE of 39, blew the rest out of the water. That means, out of a range of values of ~11,000 dollars, this model could predict the profit based on the category of the product, region being delivered, shipping method, quantity purchased, and discount on average within plus or minus 40 dollars. That is incredibly impressive. That model could also be improved computationally by using about 400 less epochs, I only used that many as an example to show how it would not increase the accuracy. In this case, it appears the more robust neural network performed better.