### Purpose of the ML models: forecasting avocado prices and identifying which features affect avocado prices in the US.

### Linear Regression with scikit-learn and Random Forest Ensemble Regressor.

(Features) **X-variables**: region, date, units sold, production data (US, Mexico, Chile, Peru and Colombia).

(Target) **y-variable:** avocado price 


## Import basic dependencies

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a global "ignore" filter so no warnings are shown in the notebook.
# NOTE(review): a blanket filter presumably also hides deprecation and
# chained-assignment warnings raised by later cells - consider narrowing it.
warnings.filterwarnings('ignore')

## Import data from the database

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from config import db_password

# Local server connection string.
# The URL scheme must be "postgresql://": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias and fails to load a dialect for it.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/avocadosNEW"

# Create the database engine that the loading cell reads from.
engine = create_engine(db_string)

In [3]:
# Read the 'prices_prod' table through the engine into a DataFrame
# and preview the first rows.
avocado_df = pd.read_sql_table(table_name='prices_prod', con=engine)
avocado_df.head()

Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,status,prod_total_volume,california,chile,mexico,peru,colombia
0,2018-03-01,Richmond/Norfolk,2018-03-25,organic,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,1335.62,0.0,actual,45807126,4997613,0,40809513,0,0
1,2018-07-01,Louisville,2018-07-08,conventional,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,41902.39,316.67,actual,54802659,11546402,0,34059160,9197097,0
2,2019-02-01,Northeast,2019-02-03,conventional,1.14,7743349.68,344482.59,5667229.92,33665.87,1697971.3,1234928.23,462968.25,74.82,actual,45231823,225945,584756,44421122,0,0
3,2018-09-01,WestTex/NewMexico,2018-09-16,conventional,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,88272.02,272.94,actual,51398671,3782736,6299540,35153392,6163003,0
4,2019-07-01,WestTex/NewMexico,2019-07-21,conventional,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,76347.61,13402.22,actual,51585051,9892498,0,27328014,14364539,0


## Data Cleaning

In [4]:
#Count Null values - if any
#avocado_df.isnull().sum()

In [5]:
# Rebind the name to an independent (deep) copy of the loaded frame.
avocado_df = avocado_df.copy(deep=True)

In [6]:
# Remove every row that contains at least one missing value,
# then report the remaining shape and preview the frame.
avocado_df = avocado_df.dropna(axis="index", how="any")
print(tuple(avocado_df.shape))
avocado_df.head()

(14472, 20)


Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,status,prod_total_volume,california,chile,mexico,peru,colombia
0,2018-03-01,Richmond/Norfolk,2018-03-25,organic,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,1335.62,0.0,actual,45807126,4997613,0,40809513,0,0
1,2018-07-01,Louisville,2018-07-08,conventional,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,41902.39,316.67,actual,54802659,11546402,0,34059160,9197097,0
2,2019-02-01,Northeast,2019-02-03,conventional,1.14,7743349.68,344482.59,5667229.92,33665.87,1697971.3,1234928.23,462968.25,74.82,actual,45231823,225945,584756,44421122,0,0
3,2018-09-01,WestTex/NewMexico,2018-09-16,conventional,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,88272.02,272.94,actual,51398671,3782736,6299540,35153392,6163003,0
4,2019-07-01,WestTex/NewMexico,2019-07-21,conventional,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,76347.61,13402.22,actual,51585051,9892498,0,27328014,14364539,0


In [7]:
# Drop the non-beneficial columns.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise a KeyError.
avocado_df = avocado_df.drop(columns=["year_month"], errors="ignore")
#avocado_df.head()

In [8]:
# Give the two "total volume" columns names that say whether they
# describe sales or production.
avocado_df = avocado_df.rename(
    columns={
        'prices_total_volume': 'total_volume_sold',
        'prod_total_volume': 'total_volume_produced',
    }
)
avocado_df.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,status,total_volume_produced,california,chile,mexico,peru,colombia
0,Richmond/Norfolk,2018-03-25,organic,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,1335.62,0.0,actual,45807126,4997613,0,40809513,0,0
1,Louisville,2018-07-08,conventional,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,41902.39,316.67,actual,54802659,11546402,0,34059160,9197097,0
2,Northeast,2019-02-03,conventional,1.14,7743349.68,344482.59,5667229.92,33665.87,1697971.3,1234928.23,462968.25,74.82,actual,45231823,225945,584756,44421122,0,0
3,WestTex/NewMexico,2018-09-16,conventional,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,88272.02,272.94,actual,51398671,3782736,6299540,35153392,6163003,0
4,WestTex/NewMexico,2019-07-21,conventional,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,76347.61,13402.22,actual,51585051,9892498,0,27328014,14364539,0


### Dealing with the date in the ML

To pass the date to the ML model, a few steps were taken, assuming seasonality in the data:
- convert the date to the datetime datatype.
- extract the week, month and year and place the values in separate columns.
- feed those values into the ML model.

In [9]:
# Convert the date column to datetime and check the resulting dtype.
# pd.to_datetime is applied to the whole column at once instead of once per
# element through .apply, which is the vectorized (and much faster) form.
avocado_df["date"] = pd.to_datetime(avocado_df["date"])
print(avocado_df["date"].dtypes)

datetime64[ns]


In [10]:
# Report (rows, columns) of the frame after the date conversion.
print(tuple(avocado_df.shape))

(14472, 19)


In [11]:
# Extract the calendar month.
avocado_df['month'] = avocado_df['date'].dt.month
# Extract the ISO week number.
# Series.dt.week is deprecated (and removed in pandas 2.0); isocalendar().week
# is its replacement. It yields the nullable UInt32 dtype, so cast back to
# int64 to keep the column a plain integer like the other date parts.
avocado_df['week'] = avocado_df['date'].dt.isocalendar().week.astype('int64')
# Extract the year.
avocado_df['year'] = avocado_df['date'].dt.year
# Preview the frame with the three new columns.
avocado_df.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,status,total_volume_produced,california,chile,mexico,peru,colombia,month,week,year
0,Richmond/Norfolk,2018-03-25,organic,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,...,actual,45807126,4997613,0,40809513,0,0,3,12,2018
1,Louisville,2018-07-08,conventional,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,...,actual,54802659,11546402,0,34059160,9197097,0,7,27,2018
2,Northeast,2019-02-03,conventional,1.14,7743349.68,344482.59,5667229.92,33665.87,1697971.3,1234928.23,...,actual,45231823,225945,584756,44421122,0,0,2,5,2019
3,WestTex/NewMexico,2018-09-16,conventional,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,...,actual,51398671,3782736,6299540,35153392,6163003,0,9,37,2018
4,WestTex/NewMexico,2019-07-21,conventional,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,...,actual,51585051,9892498,0,27328014,14364539,0,7,29,2019


In [12]:
# Report (rows, columns) of the frame after adding month/week/year.
print(tuple(avocado_df.shape))

(14472, 22)


In [13]:
# Geography labels to be filtered out as noise: these rows represent
# totals of regions rather than individual markets, so we decided to remove them.
regions = [
    'TotalUS',
    'California',
    'GreatLakes',
    'Midsouth',
    'Northeast',
    'Plains',
    'SouthCentral',
    'Southeast',
    'West',
]

In [14]:
# Keep only the rows whose geography is NOT in the regions list (noise).
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on it, which on a bare boolean-mask slice triggers pandas'
# SettingWithCopyWarning and is not guaranteed to behave consistently.
avocado_df_no_regions = avocado_df.loc[~avocado_df['geography'].isin(regions)].copy()
avocado_df_no_regions.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,status,total_volume_produced,california,chile,mexico,peru,colombia,month,week,year
0,Richmond/Norfolk,2018-03-25,organic,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,...,actual,45807126,4997613,0,40809513,0,0,3,12,2018
1,Louisville,2018-07-08,conventional,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,...,actual,54802659,11546402,0,34059160,9197097,0,7,27,2018
3,WestTex/NewMexico,2018-09-16,conventional,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,...,actual,51398671,3782736,6299540,35153392,6163003,0,9,37,2018
4,WestTex/NewMexico,2019-07-21,conventional,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,...,actual,51585051,9892498,0,27328014,14364539,0,7,29,2019
6,Boston,2019-12-22,conventional,1.35,553606.09,11777.95,344390.0,1201.76,196236.38,86170.28,...,actual,49600528,0,177780,49422746,0,0,12,51,2019


In [15]:
# Calculations
# We could use the calculations (ratios) for better predictions.
# The unit columns are selected by name rather than by position (iloc 5:8), so
# the sum stays correct if columns are added, dropped or reordered upstream.
# assign() returns a new frame, which avoids writing into a filtered slice.
avocado_df_no_regions = avocado_df_no_regions.assign(
    ### sum up all units sold
    total_units_sold=avocado_df_no_regions[
        ['units_4046', 'units_4225', 'units_4770']
    ].sum(axis=1),
    ### calculate ratio total volume sold vs total_volume_produced
    ratio_sold_vs_produced=(
        avocado_df_no_regions['total_volume_sold']
        / avocado_df_no_regions['total_volume_produced']
    ),
)

In [16]:
# Inspect the labels of the "type" column before cleaning them:
# count how often each distinct value occurs.
avocado_df_no_regions.loc[:, "type"].value_counts()

organic          6030
conventional     3555
conventional     2475
Name: type, dtype: int64

In [17]:
# Normalize the "type" labels so that "conventional " (trailing space) and
# "conventional" become one category.
# str.strip() removes any surrounding whitespace and does not depend on the
# regex default of str.replace, which differs between pandas versions.
# assign() returns a new frame instead of writing into a filtered slice.
avocado_df_no_regions = avocado_df_no_regions.assign(
    type=avocado_df_no_regions["type"].str.strip()
)

In [18]:
# Verify the cleanup of the "type" column: each label should now appear once.
## Count the values
avocado_df_no_regions["type"].value_counts()

organic         6030
conventional    6030
Name: type, dtype: int64

In [19]:
# Show every column name of the cleaned frame as a plain Python list.
list(avocado_df_no_regions.columns)

['geography',
 'date',
 'type',
 'avg_price',
 'total_volume_sold',
 'units_4046',
 'units_4225',
 'units_4770',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'status',
 'total_volume_produced',
 'california',
 'chile',
 'mexico',
 'peru',
 'colombia',
 'month',
 'week',
 'year',
 'total_units_sold',
 'ratio_sold_vs_produced']

In [20]:
# Check datatypes.
#avocado_df_no_TotalUS.dtypes

In [21]:
#Check the months
#avocado_df_no_regions.month.value_counts()

#Check the weeks
#avocado_df_no_regions.week.value_counts()

### DATA preprocessing for ML model

In [22]:
# Import dependencies data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [23]:
# Check the datatype of every column before encoding; the object-typed
# columns are the ones that still hold text.
avocado_df_no_regions.dtypes

geography                         object
date                      datetime64[ns]
type                              object
avg_price                        float64
total_volume_sold                float64
units_4046                       float64
units_4225                       float64
units_4770                       float64
total_bags                       float64
s_bags                           float64
l_bags                           float64
xl_bags                          float64
status                            object
total_volume_produced              int64
california                         int64
chile                              int64
mexico                             int64
peru                               int64
colombia                           int64
month                              int64
week                               int64
year                               int64
total_units_sold                 float64
ratio_sold_vs_produced           float64
dtype: object

In [24]:
# One-hot encode the text features: each listed column is replaced by one
# indicator column per distinct value.
avocado_cat_encoded = pd.get_dummies(
    data=avocado_df_no_regions,
    columns=["geography", "type", "status"],
)
avocado_cat_encoded.head()

Unnamed: 0,date,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,...,geography_Seattle,geography_SouthCarolina,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_WestTex/NewMexico,type_conventional,type_organic,status_actual
0,2018-03-25,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,1335.62,0.0,...,0,0,0,0,0,0,0,0,1,1
1,2018-07-08,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,41902.39,316.67,...,0,0,0,0,0,0,0,1,0,1
3,2018-09-16,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,88272.02,272.94,...,0,0,0,0,0,0,1,1,0,1
4,2019-07-21,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,76347.61,13402.22,...,0,0,0,0,0,0,1,1,0,1
6,2019-12-22,1.35,553606.09,11777.95,344390.0,1201.76,196236.38,86170.28,109789.44,276.66,...,0,0,0,0,0,0,0,1,0,1


In [25]:
# Drop the non-beneficial columns (month/week/year already carry the date).
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise a KeyError.
avocado_cat_encoded = avocado_cat_encoded.drop(columns=["date"], errors="ignore")
avocado_cat_encoded.head()

Unnamed: 0,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,total_volume_produced,...,geography_Seattle,geography_SouthCarolina,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_WestTex/NewMexico,type_conventional,type_organic,status_actual
0,1.42,17340.49,295.16,3478.97,81.25,13485.11,12149.49,1335.62,0.0,45807126,...,0,0,0,0,0,0,0,0,1,1
1,0.97,142520.42,22704.27,33493.36,74.79,86248.0,44028.94,41902.39,316.67,54802659,...,0,0,0,0,0,0,0,1,0,1
3,1.06,653087.92,314055.97,91822.68,28164.95,219044.32,130499.36,88272.02,272.94,51398671,...,0,0,0,0,0,0,1,1,0,1
4,1.34,760761.72,337772.97,78373.8,72579.15,272035.8,182285.97,76347.61,13402.22,51585051,...,0,0,0,0,0,0,1,1,0,1
6,1.35,553606.09,11777.95,344390.0,1201.76,196236.38,86170.28,109789.44,276.66,49600528,...,0,0,0,0,0,0,0,1,0,1


## Define features (X-variables) and target (y-variable)

In [26]:
# Define X values: the feature columns, selected in a fixed order.
# The selection is written as three concatenated groups for readability.
X = avocado_cat_encoded[
    # Date parts, unit/bag volumes, sold-vs-produced ratio and production columns.
    [
        'month', 'week', 'year',
        'units_4046', 'units_4225', 'units_4770',
        's_bags', 'l_bags', 'xl_bags',
        'ratio_sold_vs_produced',
        'california', 'chile', 'mexico', 'peru', 'colombia',
    ]
    # Indicator columns for each geography.
    + [
        'geography_Albany', 'geography_Atlanta', 'geography_Baltimore/Washington',
        'geography_Boise', 'geography_Boston', 'geography_Buffalo/Rochester',
        'geography_Charlotte', 'geography_Chicago', 'geography_Cincinnati/Dayton',
        'geography_Columbus', 'geography_Dallas/FtWorth', 'geography_Denver',
        'geography_Detroit', 'geography_GrandRapids', 'geography_Harrisburg/Scranton',
        'geography_Hartford/Springfield', 'geography_Houston', 'geography_Indianapolis',
        'geography_Jacksonville', 'geography_LasVegas', 'geography_LosAngeles',
        'geography_Louisville', 'geography_Miami/FtLauderdale', 'geography_Nashville',
        'geography_NewOrleans/Mobile', 'geography_NewYork', 'geography_NorthernNewEngland',
        'geography_Orlando', 'geography_Philadelphia', 'geography_Phoenix/Tucson',
        'geography_Pittsburgh', 'geography_Portland', 'geography_Raleigh/Greensboro',
        'geography_Richmond/Norfolk', 'geography_Roanoke', 'geography_Sacramento',
        'geography_SanDiego', 'geography_SanFrancisco', 'geography_Seattle',
        'geography_SouthCarolina', 'geography_Spokane', 'geography_StLouis',
        'geography_Syracuse', 'geography_Tampa', 'geography_WestTex/NewMexico',
    ]
    # Indicator columns for the avocado type.
    + ['type_conventional', 'type_organic']
]

In [33]:
# Define y-variable: a one-column frame holding the target, avg_price.
y = avocado_cat_encoded.loc[:, ['avg_price']]

In [34]:
# Split the data into training and test sets (25% held out for testing);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [35]:
# Create the StandardScaler instance.
scaler = StandardScaler()

# Fit it on the training features only; fit() hands back the fitted scaler.
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to the training set and then the test set.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Linear Regression with scikit-learn

In [36]:
# Import dependencies.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [37]:
# Create the linear regression and fit it on the scaled training features,
# using the avg_price column of y_train as the target.
model = LinearRegression()
model.fit(X_train_scaled, y_train["avg_price"])

LinearRegression()

In [38]:
# Predict prices for the scaled test features with the fitted linear model.
y_pred = model.predict(X=X_test_scaled)

In [39]:
# Print the fitted coefficients, then the intercept, each on its own line(s).
print(model.coef_, model.intercept_, sep="\n")

[-0.10363303  0.10303054  0.01850967 -0.02848715 -0.01347053  0.01222833
 -0.03096061 -0.02162053  0.01093956 -0.01350919  0.01245331  0.02835595
 -0.02966898  0.06059302 -0.05333511  0.00092973 -0.01015046  0.00981154
  0.02114857  0.0258502  -0.02240745  0.01748316  0.02140132 -0.01718241
 -0.02997091 -0.02849356  0.00145628 -0.02099196 -0.00937726 -0.0237248
  0.04531711 -0.02502515 -0.02829467 -0.016525   -0.00294025  0.03469016
 -0.01944568 -0.00968499 -0.02334001 -0.03522534  0.04709555  0.00637812
 -0.02314964  0.00701478 -0.01441991 -0.01690745  0.02672281  0.02740188
 -0.0248085  -0.02425644  0.02699351  0.02540013  0.05353069  0.04720361
 -0.00641089  0.02506329  0.01334484 -0.01660148 -0.02359475 -0.01312669
 -0.08546781  0.08546781]
1.3679303482587046


In [40]:
# Print the scores for Linear Regression with scikit-learn:
# R2 and mean squared error of the predictions against the test targets.
print("R2 score : %.2f" % r2_score(y_true=y_test["avg_price"], y_pred=y_pred))
print(
    "Mean squared error: %.2f"
    % mean_squared_error(y_true=y_test["avg_price"], y_pred=y_pred)
)

R2 score : 0.66
Mean squared error: 0.04


## Random Forest Regressor

In [41]:
# Import dependencies.
from sklearn.ensemble import RandomForestRegressor

In [42]:
# Build a 200-tree random forest with a fixed seed and train it on the
# scaled training features against the avg_price target.
reg = RandomForestRegressor(
    n_estimators=200,
    random_state=0,
)
reg.fit(X_train_scaled, y_train["avg_price"])

RandomForestRegressor(n_estimators=200, random_state=0)

In [43]:
# Pair every importance score with its feature name and sort the pairs
# from most to least important, then print one "name : (score)" line each.
feature_importance = sorted(
    zip(reg.feature_importances_, X.columns),
    reverse=True,
)
for importance, feature_name in feature_importance:
    print('{} : ({})'.format(feature_name, importance))

type_conventional : (0.22559573428777174)
type_organic : (0.15606915428946094)
units_4225 : (0.15227997378361288)
units_4046 : (0.07959118325714079)
s_bags : (0.07429273258030285)
peru : (0.04031192063216615)
l_bags : (0.03858969652931684)
year : (0.0286735535061826)
mexico : (0.024230845016240378)
ratio_sold_vs_produced : (0.021407842102866056)
week : (0.018534329757704963)
units_4770 : (0.016219167452960353)
california : (0.015502691321973388)
xl_bags : (0.012082575271924452)
chile : (0.00911809214751404)
geography_Raleigh/Greensboro : (0.008678005338421913)
geography_Seattle : (0.00792606180024174)
geography_Boston : (0.005915244049875996)
geography_SanFrancisco : (0.005025048794310143)
colombia : (0.004985834201438628)
geography_Hartford/Springfield : (0.004841389692760676)
month : (0.004236123609302192)
geography_NorthernNewEngland : (0.0041340905523266575)
geography_Chicago : (0.0030865082381683535)
geography_Harrisburg/Scranton : (0.0029867787756460884)
geography_SanDiego : (0.0

In [44]:
# Predict prices for the scaled test features with the random forest.
# Note: this rebinds y_pred, replacing the linear-regression predictions.
y_pred = reg.predict(X=X_test_scaled)

In [45]:
# Compute (not print) the mean squared error of the current predictions
# and its square root, the root mean squared error.
mse = mean_squared_error(y_true=y_test["avg_price"], y_pred=y_pred)
rmse = np.sqrt(mse)

In [46]:
# Print the scores for RandomForestRegressor: R2, mean squared error and
# the previously computed root mean squared error.
print("R2 score : %.2f" % r2_score(y_true=y_test["avg_price"], y_pred=y_pred))
print(
    "Mean squared error: %.2f"
    % mean_squared_error(y_true=y_test["avg_price"], y_pred=y_pred)
)
print("Root mean squared error: %.2f" % rmse)

R2 score : 0.88
Mean squared error: 0.01
Root mean squared error: 0.12
