### Purpose of ML models: forcasting avocado prices

### Starting with Linear Regression in sckit-learn, then comparing models with Random Forest Regressor and Neural Network.

(Features) **X-variables**: region, date, units sold, production data.

(Target) **y-variable:** avocado price 

Next step is research libraries Random Forest Regressor (predicting acctual price) and Classifier (binning y-variable in n-categories.

## Linear Regression with sckit-learn

In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### Importing data from the database

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from config import db_password

# local server connection string
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/avocados"

# create db engine
engine = create_engine(db_string)

In [3]:
# Load the avocado dataset form pgAdmin.
avocado_df = pd.read_sql_table('all_prices_prod', engine)
avocado_df.head()

Unnamed: 0,geography,date,type,avg_price,total_volume,units_4046,units_4225,units_4770_,total_bags,s_bags,l_bags,xl_bags,year_month,status,total_volume_produces,california,chile,mexico,peru,columbia
0,Albany,1/15/2017,conventional,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,60.09,0.0,,,,,,,,
1,Albany,1/15/2017,organic,1.84,1982.65,82.3,328.02,0.0,1572.33,1572.33,0.0,0.0,,,,,,,,
2,Albany,1/2/2017,conventional,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,7.83,0.0,,,,,,,,
3,Albany,1/2/2017,organic,1.87,1376.7,71.65,192.63,0.0,1112.42,1112.42,0.0,0.0,,,,,,,,
4,Albany,1/22/2017,conventional,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,125.5,8.82,,,,,,,,


In [4]:
# Change None values to NaN.
avocado_df = avocado_df.fillna(value=np.nan)
#avocado_df.head()

In [5]:
# Make a copy
avocado_df = avocado_df.copy()

In [6]:
# Count Null values
#avocado_df.isnull().sum()

# Drop Null values
avocado_df = avocado_df.dropna(how='any')
print(avocado_df.shape)
avocado_df.head()

(14688, 20)


Unnamed: 0,geography,date,type,avg_price,total_volume,units_4046,units_4225,units_4770_,total_bags,s_bags,l_bags,xl_bags,year_month,status,total_volume_produces,california,chile,mexico,peru,columbia
542,Albany,1/14/2018,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
543,Albany,1/14/2018,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
544,Albany,1/21/2018,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
545,Albany,1/21/2018,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
546,Albany,1/28/2018,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,1/1/2018,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0


In [7]:
# Count Null values
#avocado_df.isnull().sum()

# Drop Null values
avocado_df = avocado_df.dropna(how='any')
print(avocado_df.shape)
avocado_df.head()

(14688, 20)


Unnamed: 0,geography,date,type,avg_price,total_volume,units_4046,units_4225,units_4770_,total_bags,s_bags,l_bags,xl_bags,year_month,status,total_volume_produces,california,chile,mexico,peru,columbia
542,Albany,1/14/2018,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
543,Albany,1/14/2018,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
544,Albany,1/21/2018,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
545,Albany,1/21/2018,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
546,Albany,1/28/2018,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,1/1/2018,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0


In [8]:
# List all columns
avocado_df.columns.tolist()

['geography',
 'date',
 'type',
 'avg_price',
 'total_volume',
 'units_4046',
 'units_4225',
 'units_4770_',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'year_month',
 'status',
 'total_volume_produces',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia']

In [9]:
# Drop the non-beneficial columns.
avocado_df.drop(columns=["year_month"], inplace=True)
#avocado_df.head()

In [10]:
# Rename confusing columns
avocado_df = avocado_df.rename(columns={'total_volume':'total_volume_sold', 'total_volume_produces':'total_volume_produced', 'units_4770_':'units_4770' })
#avocado_df.head()

In [11]:
# Check datatypes
avocado_df.dtypes

geography                 object
date                      object
type                      object
avg_price                float64
total_volume_sold        float64
units_4046               float64
units_4225               float64
units_4770               float64
total_bags               float64
s_bags                   float64
l_bags                   float64
xl_bags                  float64
status                    object
total_volume_produced    float64
california               float64
chile                    float64
mexico                   float64
peru                     float64
columbia                 float64
dtype: object

In [12]:
# Convert date object to date and check datatype
import datetime
avocado_df.date = avocado_df.date.apply(pd.to_datetime)
print(avocado_df.date.dtypes)

datetime64[ns]


In [13]:
# Exctract the month
avocado_df['month'] = avocado_df.date.dt.month
avocado_df.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,status,total_volume_produced,california,chile,mexico,peru,columbia,month
542,Albany,2018-01-14,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1
543,Albany,2018-01-14,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1
544,Albany,2018-01-21,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1
545,Albany,2018-01-21,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1
546,Albany,2018-01-28,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0,1


In [14]:
# # Exctract the week with no warning
# avocado_df['week'] = avocado_df.date.dt.isocalendar().week
# avocado_df.head()

In [15]:
# Check the datatypes for month
#print(avocado_df.week.dtypes)

In [16]:
#Check the weeks
#avocado_df.week.value_counts()

In [17]:
# Exctract the week
avocado_df['week'] = avocado_df.date.dt.week
avocado_df.head()

  


Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,xl_bags,status,total_volume_produced,california,chile,mexico,peru,columbia,month,week
542,Albany,2018-01-14,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,...,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
543,Albany,2018-01-14,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,...,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
544,Albany,2018-01-21,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,...,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
545,Albany,2018-01-21,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,...,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
546,Albany,2018-01-28,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,...,66.67,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0,1,4


In [18]:
# Check the datatypes for month
print(avocado_df.week.dtypes)

int64


In [19]:
#Check the months
#avocado_df.month.value_counts()

#Check the weeks
#avocado_df.week.value_counts()

In [20]:
# Select and count rows with Total U.S. (see EDA in Tableau - possible outlier) 
avocado_df_no_TotalUS_check = avocado_df[avocado_df['geography'].str.contains('Total U.S.')]
#len(avocado_df_no_TotalUS)
#avocado_df_no_TotalUS_check.head()

In [21]:
# Drop rows with Total U.S. (see EDA in Tableau - possible outlier) 
avocado_df_no_TotalUS = avocado_df[~avocado_df['geography'].isin(['Total U.S.'])]
avocado_df_no_TotalUS.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,xl_bags,status,total_volume_produced,california,chile,mexico,peru,columbia,month,week
542,Albany,2018-01-14,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,...,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
543,Albany,2018-01-14,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,...,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
544,Albany,2018-01-21,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,...,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
545,Albany,2018-01-21,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,...,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
546,Albany,2018-01-28,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,...,66.67,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0,1,4


In [22]:
print(avocado_df_no_TotalUS.shape)

(14416, 21)


In [23]:
# Calculations
### sumup all units sold
### calculate ratio total volume sold vs total_volume_produced

### sumup all units sold
avocado_df_no_TotalUS['total_units_sold'] = avocado_df_no_TotalUS.iloc[:, 5:8].sum(axis=1)

### calculate ratio total volume sold vs total_volume_produced
avocado_df_no_TotalUS['ratio_sold_vs_produced'] = avocado_df_no_TotalUS['total_volume_sold']/avocado_df_no_TotalUS['total_volume_produced']
avocado_df_no_TotalUS.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,total_volume_produced,california,chile,mexico,peru,columbia,month,week,total_units_sold,ratio_sold_vs_produced
542,Albany,2018-01-14,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,...,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2,79512.08,0.001805
543,Albany,2018-01-14,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,...,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2,309.17,7.8e-05
544,Albany,2018-01-21,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,...,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3,119743.03,0.002472
545,Albany,2018-01-21,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,...,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3,267.68,6.1e-05
546,Albany,2018-01-28,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,...,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0,1,4,77783.99,0.002067


In [24]:
# Clean the type_conventional.
## Count the values
avocado_df_no_TotalUS["type"].value_counts()

organic          7208
conventional     4240
conventional     2968
Name: type, dtype: int64

In [25]:
# Replace "conventional " with "conventional"
avocado_df_no_TotalUS["type"] = avocado_df_no_TotalUS["type"].str.replace("conventional ", "conventional")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
avocado_df_no_TotalUS["type"].value_counts()

conventional    7208
organic         7208
Name: type, dtype: int64

In [27]:
# Plot the data with encoded date grouped by month. - check also for scatter plot
#avocado_df.groupby('month')['ratio CA sold vs prod'].mean().plot()

### DATA preprocessing for ML model

In [28]:
# Check datatypes
avocado_df_no_TotalUS.dtypes

geography                         object
date                      datetime64[ns]
type                              object
avg_price                        float64
total_volume_sold                float64
units_4046                       float64
units_4225                       float64
units_4770                       float64
total_bags                       float64
s_bags                           float64
l_bags                           float64
xl_bags                          float64
status                            object
total_volume_produced            float64
california                       float64
chile                            float64
mexico                           float64
peru                             float64
columbia                         float64
month                              int64
week                               int64
total_units_sold                 float64
ratio_sold_vs_produced           float64
dtype: object

In [29]:
# Use get_dummies() to create variables for text features.
avocado_cat_encoded = pd.get_dummies(avocado_df_no_TotalUS,  columns=["geography", "type", "status"])
avocado_cat_encoded.head()

Unnamed: 0,date,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,...,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_West,geography_West Tex/New Mexico,type_conventional,type_organic,status_actual
542,2018-01-14,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,...,0,0,0,0,0,0,0,1,0,1
543,2018-01-14,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
544,2018-01-21,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,...,0,0,0,0,0,0,0,1,0,1
545,2018-01-21,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
546,2018-01-28,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,...,0,0,0,0,0,0,0,1,0,1


In [30]:
#List the columns
avocado_cat_encoded.columns.tolist()

['date',
 'avg_price',
 'total_volume_sold',
 'units_4046',
 'units_4225',
 'units_4770',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'total_volume_produced',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia',
 'month',
 'week',
 'total_units_sold',
 'ratio_sold_vs_produced',
 'geography_Albany',
 'geography_Atlanta',
 'geography_Baltimore/Washington',
 'geography_Boise',
 'geography_Boston',
 'geography_Buffalo/Rochester',
 'geography_California',
 'geography_Charlotte',
 'geography_Chicago',
 'geography_Cincinnati/Dayton',
 'geography_Columbus',
 'geography_Dallas/Ft. Worth',
 'geography_Denver',
 'geography_Detroit',
 'geography_Grand Rapids',
 'geography_Great Lakes',
 'geography_Harrisburg/Scranton',
 'geography_Hartford/Springfield',
 'geography_Houston',
 'geography_Indianapolis',
 'geography_Jacksonville',
 'geography_Las Vegas',
 'geography_Los Angeles',
 'geography_Louisville',
 'geography_Miami/Ft. Lauderdale',
 'geography_Midsouth',
 'geography_Nashville',
 'ge

In [31]:
# Drop the non-beneficial columns - drop "prod_total_volume" after the ratio calculations
avocado_cat_encoded.drop(columns=["date", "total_volume_produced"], inplace=True)
avocado_cat_encoded.head()

Unnamed: 0,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,california,...,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_West,geography_West Tex/New Mexico,type_conventional,type_organic,status_actual
542,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,830821.0,...,0,0,0,0,0,0,0,1,0,1
543,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,830821.0,...,0,0,0,0,0,0,0,0,1,1
544,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,2913663.0,...,0,0,0,0,0,0,0,1,0,1
545,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,2913663.0,...,0,0,0,0,0,0,0,0,1,1
546,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,2698356.0,...,0,0,0,0,0,0,0,1,0,1


In [32]:
# Determine the number of unique values in each column.
#avocado_cat_encoded.dtypes
#len(avocado_cat_encoded.nunique())

In [33]:
# Define X values.
X = avocado_cat_encoded[['total_units_sold', 'month',
 'geography_Albany',
 'geography_Atlanta',
 'geography_Baltimore/Washington',
 'geography_Boise',
 'geography_Boston',
 'geography_Buffalo/Rochester',
 'geography_California',
 'geography_Charlotte',
 'geography_Chicago',
 'geography_Cincinnati/Dayton',
 'geography_Columbus',
 'geography_Dallas/Ft. Worth',
 'geography_Denver',
 'geography_Detroit',
 'geography_Grand Rapids',
 'geography_Great Lakes',
 'geography_Harrisburg/Scranton',
 'geography_Hartford/Springfield',
 'geography_Houston',
 'geography_Indianapolis',
 'geography_Jacksonville',
 'geography_Las Vegas',
 'geography_Los Angeles',
 'geography_Louisville',
 'geography_Miami/Ft. Lauderdale',
 'geography_Midsouth',
 'geography_Nashville',
 'geography_New Orleans/Mobile',
 'geography_New York',
 'geography_Northeast',
 'geography_Northern New England',
 'geography_Orlando',
 'geography_Philadelphia',
 'geography_Phoenix/Tucson',
 'geography_Pittsburgh',
 'geography_Plains',
 'geography_Portland',
 'geography_Raleigh/Greensboro',
 'geography_Richmond/Norfolk',
 'geography_Roanoke',
 'geography_Sacramento',
 'geography_San Diego',
 'geography_San Francisco',
 'geography_Seattle',
 'geography_South Carolina',
 'geography_South Central',
 'geography_Southeast',
 'geography_Spokane',
 'geography_St. Louis',
 'geography_Syracuse',
 'geography_Tampa',
 'geography_West',
 'geography_West Tex/New Mexico',
 'type_conventional',
 'type_organic',
 'status_actual']]

In [34]:
# Define y values.
y = avocado_df_no_TotalUS.avg_price

In [35]:
# Split the to training and testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [36]:
# Fit the model.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [37]:
# Predict the model.
y_pred = model.predict(X_test)

In [38]:
# Mean squared error.
mean_squared_error(y_test, y_pred)

0.04284549641074588

In [39]:
# R2 score.
r2_score(y_test, y_pred)

0.6220771836434567

In [40]:
# Print coeficient and intercept
print(model.coef_)
print(model.intercept_)

[-3.82406028e-08  8.69048214e-03  4.05773282e-02 -9.40769514e-02
  4.33870908e-02  1.89623980e-01  1.80817606e-01 -1.08309804e-01
  2.10564089e-01  1.29900322e-01  1.45331958e-01 -9.42076417e-02
 -1.84225173e-01 -2.71140571e-01 -3.95397177e-02 -1.49958091e-01
  2.34008800e-03 -6.34031660e-02 -1.43407931e-01  3.32347727e-01
 -2.33557797e-01 -1.78085101e-01 -9.64754647e-02 -9.76876659e-03
  7.52765208e-02 -1.11494027e-01 -9.53118136e-02  2.22277869e-02
 -1.43859059e-01 -2.16221230e-01  2.45558982e-01  1.70527642e-01
  4.03919502e-02 -1.56184157e-01  3.08015101e-02 -1.66235525e-01
 -8.76661504e-02 -7.74066966e-02  1.62758219e-01  1.95291885e-01
 -1.60345994e-01 -1.35750502e-01  1.84000181e-01  1.65267619e-01
  3.50374952e-01  2.89570550e-01 -3.94474916e-02 -1.69105544e-01
 -4.59329355e-02  2.00561991e-01  1.18926873e-01 -8.03504971e-02
 -1.60556783e-01  8.83622150e-02 -1.02764448e-01 -2.03048084e-01
  2.03048085e-01  0.00000000e+00]
1.325366285672971


In [41]:
# Print the scores
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_test,y_pred))

R2 score : 0.62
Mean squared error: 0.04


### Interpretation of the results

**Intro**

"R squared", is the proportion of the variance in the dependent variable that is predictable from the independent variable(s).

Mean square error (MSE) is the average of the square of the errors. The larger the number the larger the error.

There is no correct value for MSE. Simply put, the lower the value the better. Since there is no correct answer, the MSE’s basic value is in selecting one prediction model over another.
Similarly, there is also no correct answer as to what R2 should be. 100% means perfect correlation. Yet, there are models with a low R2 that are still good models (Source: https://www.bmc.com/blogs/mean-squared-error-r2-and-variance-in-regression-analysis/ )

Our dataset can be:
Overfit means the model is subject to outliers and noise.
Underfit means the model could look at other inputs (i.e., additional features).

**Our Results**

R2 score is on the higher side and tells that our model does quite well for the features that we used and that there is correlation amongst variables. 
Mean square error (MSE) is low, what tells that the error between observed and predicted is low.

**Note**

Model on only California and its cities did slightly better (it could be that larger data set has more outliers that might be skewing the results). also more noise in larger dataset. feature_importance different for all cities than only for CA.

### Random Forest Regressor - in progress

In [42]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [43]:
# Train the model.
reg = RandomForestRegressor(n_estimators = 100, random_state = 0)
reg.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [44]:
y_pred = reg.predict(X_test)
#y_pred

In [45]:
# Print result mse and rmse (root mean squared error)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

0.15994347419934554

#### Interpretaion of the results
The RMSE is the square root of the variance of the residuals. It indicates the absolute fit of the model to the data–how close the observed data points are to the model’s predicted values. Whereas R-squared is a relative measure of fit, RMSE is an absolute measure of fit. As the square root of a variance, RMSE can be interpreted as the standard deviation of the unexplained variance, and has the useful property of being in the same units as the response variable. Lower values of RMSE indicate better fit. RMSE is a good measure of how accurately the model predicts the response, and it is the most important criterion for fit if the main purpose of the model is prediction.

In [46]:
# List the features sorted in descending order by feature importance
feature_importance = sorted(zip(reg.feature_importances_, X.columns), reverse=True)
feature_importance

[(0.2326703721787876, 'type_conventional'),
 (0.22490788658544672, 'total_units_sold'),
 (0.167831345291769, 'type_organic'),
 (0.08388861213277578, 'month'),
 (0.017441447451596495, 'geography_San Francisco'),
 (0.01729774404095056, 'geography_Hartford/Springfield'),
 (0.015227524506358723, 'geography_Boise'),
 (0.014319711960288462, 'geography_Spokane'),
 (0.013609766997568305, 'geography_Seattle'),
 (0.012143727783640723, 'geography_Houston'),
 (0.012009646649250593, 'geography_Dallas/Ft. Worth'),
 (0.011587367792606712, 'geography_Raleigh/Greensboro'),
 (0.010264803593796284, 'geography_Phoenix/Tucson'),
 (0.009674748852169516, 'geography_South Central'),
 (0.009067373190582727, 'geography_San Diego'),
 (0.008856005518073052, 'geography_New York'),
 (0.008699041953061544, 'geography_Boston'),
 (0.007159201128607727, 'geography_Sacramento'),
 (0.0071465782374551924, 'geography_St. Louis'),
 (0.007031578515429649, 'geography_Portland'),
 (0.006275286357889559, 'geography_Grand Rapids

### Neural Network ML model - in progress (Skeleton Code)

In [47]:
# # Import dependencies.
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
# import pandas as pd
# import tensorflow as tf
# from pathlib import Path

# import warnings
# warnings.filterwarnings('ignore')

# # Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

In [48]:
# ## Visualize the value counts of avocado_mock_df['TBD']
# avocado_df = avocado_df['TBD'].value_counts()
# #ca_prod.plot(kind='bar')

In [49]:
# # Determine which values to replace if counts are less than ...?
# replace_ca_prod = list(ca_prod[ca_prod < 500000].index)

# ## Replace in dataframe
# for ca_p in replace_ca_prod:
#     avocado_mock_df['TBD'] = avocado_df['TBD'].replace(ca_p,"Other")
    
# # Check to make sure binning was successful
# avocado_df['TBD'].value_counts()

In [50]:
# # Generate our categorical variable lists
# avocado_cat = avocado_df.dtypes[avocado_mock_df.dtypes == "object"].index.tolist()

In [51]:
# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(avocado_mock_df[avocado_cat]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(avocado_cat)
# encode_df.head()

In [52]:
# # Merge one-hot encoded features and drop the originals
# avocado_df = avocado_df.merge(encode_df,left_index=True, right_index=True)
# avocado_df = avocado_df.drop(avocado_cat,1)
# avocado_df.head()

In [53]:
# # Split our preprocessed data into our features and target arrays
# y = avocado_df["TBD"].values
# X = avocado_df.drop(["TBD"],1).values

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [54]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

### Compile, Train and Evaluate the Model

In [55]:
# # Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# number_input_features = len(X_train[0])
# hidden_nodes_layer1 = 210
# hidden_nodes_layer2 = 90
# #hidden_nodes_layer3 = 40

# nn = tf.keras.models.Sequential()

# # First hidden layer
# nn.add(
#     tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
# )

# # Second hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# # Third hidden layer
# #nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# # Output layer
# nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Check the structure of the model
# nn.summary()

In [56]:
# # Define the checkpoint path and filenames
# os.makedirs("checkpoints/",exist_ok=True)
# checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [57]:
# # Compile the model
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [58]:
# # Create a callback that saves the model's weights every 5 epochs
# cp_callback = ModelCheckpoint(
#     filepath=checkpoint_path,
#     verbose=1,
#     save_weights_only=True,
#     save_freq=1000)

In [59]:
# # Train the model
# fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

In [60]:
# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [61]:
# Export our model to HDF5 file
#nn.save("Avocado_price.h5")