In [1]:
# Import our dependencies
#import sqlalchemy
#from sqlalchemy.ext.automap import automap_base
#from sqlalchemy.orm import Session
#from sqlalchemy import create_engine
#import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
avocado_df = pd.read_csv('/content/drive/MyDrive/avocado_csv.csv')

In [4]:
# remove unneccessary columns
avocado_df=avocado_df.drop(columns=['index', 'totalvolume',])

In [5]:
# change date to datetime for pandas
avocado_df['month'] = pd.DatetimeIndex(avocado_df['date']).month
avocado_df

Unnamed: 0.1,Unnamed: 0,date,averageprice,plu4046,plu4225,plu4770,totalbags,smallbags,largebags,xlargebags,region,type,market,month
0,0,2015-01-04,1.220000,2819.50,28287.42,49.90,9716.46,9186.93,529.53,0.0,Albany,conventional,northeast,1
1,1,2015-01-04,1.790000,57.42,153.88,0.00,1162.65,1162.65,0.00,0.0,Albany,organic,northeast,1
2,2,2015-01-04,1.000000,364302.39,23821.16,82.15,46815.79,16707.15,30108.64,0.0,Atlanta,conventional,southeast,1
3,3,2015-01-04,1.760000,1500.15,938.35,0.00,1408.19,1071.35,336.84,0.0,Atlanta,organic,southeast,1
4,4,2015-01-04,1.080000,53987.31,552906.04,39995.03,141136.68,137146.07,3990.61,0.0,BaltimoreWashington,conventional,midsouth,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45022,45022,2023-12-03,1.616528,154.75,194.69,0.00,4475.54,,,,Syracuse,organic,northeast,12
45023,45023,2023-12-03,1.421139,222.70,91.35,0.00,24206.50,,,,Tampa,organic,southeast,12
45024,45024,2023-12-03,1.550513,204.64,1211.25,0.00,4278.03,,,,Toledo,organic,great_lakes,12
45025,45025,2023-12-03,1.618931,15182.42,1211.38,0.00,18075.66,,,,WestTexNewMexico,organic,west,12


In [6]:
# create bins to move date to quarters, making them usable data points in the regression model
bins = [0, 3, 6, 9, 12]
# labels of bins
labels = ['Q1', 'Q2', 'Q3', 'Q4']
# bin the data using the month column created above
avocado_df['quarter'] = pd.cut(avocado_df['month'], bins=bins, labels=labels)
# drop the unneccessary columns
avocado_df=avocado_df.drop(columns=['date', 'month'])
# cast quarter column to string for dummies to work
avocado_df['quarter'] = avocado_df['quarter'].astype(str)
avocado_df.dtypes

Unnamed: 0,0
Unnamed: 0,int64
averageprice,float64
plu4046,float64
plu4225,float64
plu4770,float64
totalbags,float64
smallbags,float64
largebags,float64
xlargebags,float64
region,object


In [7]:
# drop the index to remove unique data
avocado_df = avocado_df.drop(columns=['Unnamed: 0'])

In [8]:
# here the number columns are scaled
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(avocado_df[['plu4046', 'plu4225', 'plu4770', 'totalbags']])

# Transform the training data using the scaler
scaled_data = X_scaler.transform(avocado_df[['plu4046', 'plu4225', 'plu4770', 'totalbags']])
avocado_df[['plu4046', 'plu4225', 'plu4770', 'totalbags']] = scaled_data

In [10]:
# define x and y here to avoid scaling the dummy data
# region and market represent the same data, so one of them must always be dropped

y = avocado_df['averageprice']

# test if market or region delivers better results
# this line to drop market (.66)
X = avocado_df.drop(columns=['averageprice', 'smallbags', 'largebags', 'xlargebags', 'market'])
# this line to drop region (.59)
#X = avocado_df.drop(columns=['averageprice', 'smallbags', 'largebags', 'xlargebags', 'region'])

# dropping market delivers better results. Market will be dropped on all other tests.
# this line to drop type (.67)
#X = avocado_df.drop(columns=['averageprice', 'smallbags', 'largebags', 'xlargebags', 'type'])
# this line to drop quarter (.62)
#X = avocado_df.drop(columns=['averageprice', 'smallbags', 'largebags', 'xlargebags', 'quarter'])
# this line to drop totalbags (.57)
#X = avocado_df.drop(columns=['averageprice', 'smallbags', 'largebags', 'xlargebags', 'totalbags'])
# this line to drop all bags (.56)
#X = avocado_df.drop(columns=['averageprice', 'totalbags', 'smallbags', 'largebags', 'xlargebags', 'market'])

In [22]:
X = pd.get_dummies(X)
X

Unnamed: 0,plu4046,plu4225,plu4770,totalbags,region_Albany,region_Atlanta,region_BaltimoreWashington,region_BirminghamMontgomery,region_Boise,region_Boston,...,region_Tampa,region_Toledo,region_WestTexNewMexico,region_Wichita,type_conventional,type_organic,quarter_Q1,quarter_Q2,quarter_Q3,quarter_Q4
0,-0.459000,-0.271191,-0.217513,-0.481574,True,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
1,-0.474491,-0.462892,-0.219232,-0.554467,True,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
2,1.568360,-0.301624,-0.216402,-0.165422,False,True,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
3,-0.466399,-0.457547,-0.219232,-0.552375,False,True,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
4,-0.172028,3.303552,1.158317,0.638356,False,False,True,False,False,False,...,False,False,False,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45022,-0.473945,-0.462614,-0.219232,-0.526235,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
45023,-0.473564,-0.463318,-0.219232,-0.358093,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True
45024,-0.473665,-0.455687,-0.219232,-0.527919,False,False,False,False,False,False,...,False,True,False,False,False,True,False,False,False,True
45025,-0.389663,-0.455686,-0.219232,-0.410339,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,True


In [23]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
# Define the deep learning model
number_input_features = X_train.shape[1]
node1 =  100
node2 = 62
node3 = 30

nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=node1, activation="relu", input_dim=number_input_features))
nn_model.add(tf.keras.layers.Dense(units=node2, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=node3, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="MSE", optimizer="SGD", metrics=["R2Score"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_r2 = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, R2 Score: {model_r2}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - R2Score: -2.1828 - loss: 0.5187
Epoch 2/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - R2Score: -1.2015 - loss: 0.3572
Epoch 3/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - R2Score: -1.2137 - loss: 0.3545
Epoch 4/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - R2Score: -1.1822 - loss: 0.3502
Epoch 5/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - R2Score: -1.1705 - loss: 0.3573
Epoch 6/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - R2Score: -1.1958 - loss: 0.3546
Epoch 7/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - R2Score: -1.1700 - loss: 0.3510
Epoch 8/50
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - R2Score: -1.1786 - loss: 0.3547
Epoch 9/50
[1m1056/1056