### Neural Network ML model

(Features) **X-variables**: region, date, units sold, production data, climate data, import data, export data, median income.

(Target) **y-variable:** avocado price (numerical outcome)

In [3]:
# Import dependencies.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Load the avocado_mock_df.csv dataset.
avocado_mock_df = pd.read_csv(Path('././MOCK_datasets_ML/MockDataset_Avocado_ML.csv'))
avocado_mock_df.head(10)

Unnamed: 0,REGION (city),LATITUDE,LONGITUDE,DATE (Month + Year),4046 Units sold,4225 Units,4770 Units,Bags sold,CLIMATE_CA_pcp,CLIMATE_CA_tmin,...,PERU PRODUCTION,COLUMBIA PRODUCTION,IMPORTS BY VALUE ($1000),IMPORTS BY VOLUME (1000 LBS),IMPORTS UNIT VALUE ($ PER POUND),EXPORTS BY VALUE ($1000),EXPORTS BY VOLUME (1000 LBS),EXPORTS UNIT VALUE ($ PER POUND),MEDIAN INCOME,PRICE
0,Los Angeles,34.0522,118.2437,1/1/2017,1223299.39,829896.69,56808.74,1441332.35,6.48,36.6,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,64251,0.84
1,San Diego,32.7157,117.1611,1/1/2017,149666.2,206181.12,15014.45,189448.58,6.48,36.6,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,74855,0.93
2,San Francisco,37.7749,122.4194,1/1/2017,260100.73,717625.67,2490.66,66958.27,6.48,36.6,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,104552,0.95
3,Sacramento,38.5816,121.4944,1/1/2017,144876.31,339664.16,660.97,41564.2,6.48,36.6,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,63902,0.98
4,Los Angeles,34.0522,118.2437,2/1/2017,5470227.08,1741607.02,937331.61,2701609.82,3.65,37.5,...,0,0,182421005,157957.183,1.154876,2119913,1400.845874,1.513309,64251,0.53
5,San Diego,32.7157,117.1611,2/1/2017,260418.91,290223.93,20313.95,321784.08,3.65,37.5,...,0,0,182421005,157957.183,1.154876,2119913,1400.845874,1.513309,74855,0.63
6,San Francisco,37.7749,122.4194,2/1/2017,560686.25,900172.65,21331.13,75785.02,3.65,37.5,...,0,0,182421005,157957.183,1.154876,2119913,1400.845874,1.513309,104552,0.84
7,Sacramento,38.5816,121.4944,2/1/2017,321474.22,418158.93,11962.44,57213.28,3.65,37.5,...,0,0,182421005,157957.183,1.154876,2119913,1400.845874,1.513309,63902,0.98
8,Los Angeles,34.0522,118.2437,3/1/2017,2768930.88,897311.08,518369.51,1269943.15,2.19,37.2,...,0,0,224948272,171411.0822,1.312332,4549077,2623.655244,1.73387,64251,0.99
9,San Diego,32.7157,117.1611,3/1/2017,163424.69,176304.67,19186.14,158785.76,2.19,37.2,...,0,0,224948272,171411.0822,1.312332,4549077,2623.655244,1.73387,74855,1.04


In [4]:
# Drop the non-beneficial columns.
# Make a copy of df
avocado_mock_df.drop(columns=["CLIMATE_CA_pcp", "CLIMATE_CA_tmin", "CLIMATE_CA_tmax", "DATE (Month + Year)"], inplace=True)
avocado_mock_df.head()

Unnamed: 0,REGION (city),LATITUDE,LONGITUDE,4046 Units sold,4225 Units,4770 Units,Bags sold,TOTAL_VOLUME,CALIFORNIA PRODUCTION,CHILE PRODUCTION,...,PERU PRODUCTION,COLUMBIA PRODUCTION,IMPORTS BY VALUE ($1000),IMPORTS BY VOLUME (1000 LBS),IMPORTS UNIT VALUE ($ PER POUND),EXPORTS BY VALUE ($1000),EXPORTS BY VOLUME (1000 LBS),EXPORTS UNIT VALUE ($ PER POUND),MEDIAN INCOME,PRICE
0,Los Angeles,34.0522,118.2437,1223299.39,829896.69,56808.74,1441332.35,45670156,899349,519433,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,64251,0.84
1,San Diego,32.7157,117.1611,149666.2,206181.12,15014.45,189448.58,45670156,899349,519433,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,74855,0.93
2,San Francisco,37.7749,122.4194,260100.73,717625.67,2490.66,66958.27,45670156,899349,519433,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,104552,0.95
3,Sacramento,38.5816,121.4944,144876.31,339664.16,660.97,41564.2,45670156,899349,519433,...,0,0,207387693,214329.7208,0.967611,1660842,994.194413,1.67054,63902,0.98
4,Los Angeles,34.0522,118.2437,5470227.08,1741607.02,937331.61,2701609.82,47371481,2635471,0,...,0,0,182421005,157957.183,1.154876,2119913,1400.845874,1.513309,64251,0.53


In [5]:
# Determine the number of unique values in each column.
avo_cat = avocado_mock_df.dtypes.index.tolist()
avocado_mock_df[avo_cat].nunique()

REGION (city)                        4
LATITUDE                             4
LONGITUDE                            4
4046 Units sold                     12
4225 Units                          12
4770 Units                          12
Bags sold                           12
TOTAL_VOLUME                         3
CALIFORNIA PRODUCTION                3
CHILE PRODUCTION                     2
MEXICO PRODUCTION                    3
PERU PRODUCTION                      1
COLUMBIA PRODUCTION                  1
IMPORTS BY VALUE ($1000)             3
IMPORTS BY VOLUME (1000 LBS)         3
IMPORTS UNIT VALUE ($ PER POUND)     3
EXPORTS BY VALUE ($1000)             3
EXPORTS BY VOLUME (1000 LBS)         3
EXPORTS UNIT VALUE ($ PER POUND)     3
MEDIAN INCOME                        4
PRICE                               10
dtype: int64

In [6]:
avocado_mock_df.dtypes

REGION (city)                        object
LATITUDE                            float64
LONGITUDE                           float64
4046 Units sold                     float64
4225 Units                          float64
4770 Units                          float64
Bags sold                           float64
TOTAL_VOLUME                          int64
CALIFORNIA PRODUCTION                 int64
CHILE PRODUCTION                      int64
MEXICO PRODUCTION                     int64
PERU PRODUCTION                       int64
COLUMBIA PRODUCTION                   int64
IMPORTS BY VALUE ($1000)              int64
IMPORTS BY VOLUME (1000 LBS)        float64
IMPORTS UNIT VALUE ($ PER POUND)    float64
EXPORTS BY VALUE ($1000)              int64
EXPORTS BY VOLUME (1000 LBS)        float64
EXPORTS UNIT VALUE ($ PER POUND)    float64
MEDIAN INCOME                         int64
PRICE                               float64
dtype: object

In [7]:
# Look at SPECIFIC value counts for binning
avocado_mock_df['CALIFORNIA PRODUCTION'].value_counts()

2635471    4
4997613    4
899349     4
Name: CALIFORNIA PRODUCTION, dtype: int64

In [23]:
## Visualize the value counts of avocado_mock_df['CALIFORNIA PRODUCTION']
ca_prod = avocado_mock_df['CALIFORNIA PRODUCTION'].value_counts()
ca_prod.plot.density()

In [11]:
# Determine which values to replace if counts are less than ...?
replace_ca_prod = list(ca_prod[ca_prod < 500000].index)

## Replace in dataframe
for ca_p in replace_ca_prod:
    avocado_mock_df['CALIFORNIA PRODUCTION'] = avocado_mock_df['CALIFORNIA PRODUCTION'].replace(ca_p,"Other")
    
# Check to make sure binning was successful
avocado_mock_df['CALIFORNIA PRODUCTION'].value_counts()

Other    12
Name: CALIFORNIA PRODUCTION, dtype: int64

In [12]:
# Generate our categorical variable lists
avocado_cat = avocado_mock_df.dtypes[avocado_mock_df.dtypes == "object"].index.tolist()

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(avocado_mock_df[avocado_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(avocado_cat)
encode_df.head()

Unnamed: 0,REGION (city)_Los Angeles,REGION (city)_Sacramento,REGION (city)_San Diego,REGION (city)_San Francisco,CALIFORNIA PRODUCTION_Other
0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,1.0,1.0
3,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0


In [14]:
# Merge one-hot encoded features and drop the originals
avocado_mock_df = avocado_mock_df.merge(encode_df,left_index=True, right_index=True)
avocado_mock_df = avocado_mock_df.drop(avocado_cat,1)
avocado_mock_df.head()

Unnamed: 0,LATITUDE,LONGITUDE,4046 Units sold,4225 Units,4770 Units,Bags sold,TOTAL_VOLUME,CHILE PRODUCTION,MEXICO PRODUCTION,PERU PRODUCTION,...,EXPORTS BY VALUE ($1000),EXPORTS BY VOLUME (1000 LBS),EXPORTS UNIT VALUE ($ PER POUND),MEDIAN INCOME,PRICE,REGION (city)_Los Angeles,REGION (city)_Sacramento,REGION (city)_San Diego,REGION (city)_San Francisco,CALIFORNIA PRODUCTION_Other
0,34.0522,118.2437,1223299.39,829896.69,56808.74,1441332.35,45670156,519433,44251374,0,...,1660842,994.194413,1.67054,64251,0.84,1.0,0.0,0.0,0.0,1.0
1,32.7157,117.1611,149666.2,206181.12,15014.45,189448.58,45670156,519433,44251374,0,...,1660842,994.194413,1.67054,74855,0.93,0.0,0.0,1.0,0.0,1.0
2,37.7749,122.4194,260100.73,717625.67,2490.66,66958.27,45670156,519433,44251374,0,...,1660842,994.194413,1.67054,104552,0.95,0.0,0.0,0.0,1.0,1.0
3,38.5816,121.4944,144876.31,339664.16,660.97,41564.2,45670156,519433,44251374,0,...,1660842,994.194413,1.67054,63902,0.98,0.0,1.0,0.0,0.0,1.0
4,34.0522,118.2437,5470227.08,1741607.02,937331.61,2701609.82,47371481,0,44736010,0,...,2119913,1400.845874,1.513309,64251,0.53,1.0,0.0,0.0,0.0,1.0


In [15]:
# Split our preprocessed data into our features and target arrays
y = avocado_mock_df["PRICE"].values
X = avocado_mock_df.drop(["PRICE"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Compile, Train and Evaluate the Model

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 210
hidden_nodes_layer2 = 90
#hidden_nodes_layer3 = 40

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
#nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 210)               5040      
_________________________________________________________________
dense_1 (Dense)              (None, 90)                18990     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 91        
Total params: 24,121
Trainable params: 24,121
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [19]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [20]:
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

In [21]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [22]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: -2.3563e+00 - acc: 0.0000e+00
Loss: -2.3562779426574707, Accuracy: 0.0


In [22]:
# Export our model to HDF5 file
#nn.save("Avocado_price.h5")