### Purpose of ML models: forcasting avocado prices

### Starting with Linear Regression in sckit-learn, then comparing models with Random Forest Regressor and Neural Network.

(Features) **X-variables**: region, date, units sold, production data.

(Target) **y-variable:** avocado price 

Next step is research libraries Random Forest Regressor (predicting acctual price) and Classifier (binning y-variable in n-categories).

## Linear Regression with sckit-learn

In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder

### Importing data from the database

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from config import db_password

# local server connection string
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/avocados"

# create db engine
engine = create_engine(db_string)

In [3]:
# Load the avocado dataset form pgAdmin.
avocado_df = pd.read_sql_table('all_prices_prod', engine)
avocado_df.head()

Unnamed: 0,geography,date,type,avg_price,total_volume,units_4046,units_4225,units_4770_,total_bags,s_bags,l_bags,xl_bags,year_month,status,total_volume_produces,california,chile,mexico,peru,columbia
0,Albany,1/15/2017,conventional,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,60.09,0.0,,,,,,,,
1,Albany,1/15/2017,organic,1.84,1982.65,82.3,328.02,0.0,1572.33,1572.33,0.0,0.0,,,,,,,,
2,Albany,1/2/2017,conventional,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,7.83,0.0,,,,,,,,
3,Albany,1/2/2017,organic,1.87,1376.7,71.65,192.63,0.0,1112.42,1112.42,0.0,0.0,,,,,,,,
4,Albany,1/22/2017,conventional,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,125.5,8.82,,,,,,,,


In [4]:
# Change None values to NaN.
avocado_df = avocado_df.fillna(value=np.nan)
#avocado_df.head()

In [5]:
# Make a copy
avocado_df = avocado_df.copy()

In [6]:
# Count Null values
#avocado_df.isnull().sum()

# Drop Null values
avocado_df = avocado_df.dropna(how='any')
print(avocado_df.shape)
avocado_df.head()

(14688, 20)


Unnamed: 0,geography,date,type,avg_price,total_volume,units_4046,units_4225,units_4770_,total_bags,s_bags,l_bags,xl_bags,year_month,status,total_volume_produces,california,chile,mexico,peru,columbia
542,Albany,1/14/2018,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
543,Albany,1/14/2018,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,1/1/2018,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0
544,Albany,1/21/2018,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
545,Albany,1/21/2018,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,1/1/2018,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0
546,Albany,1/28/2018,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,1/1/2018,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0


In [7]:
# List all columns
avocado_df.columns.tolist()

['geography',
 'date',
 'type',
 'avg_price',
 'total_volume',
 'units_4046',
 'units_4225',
 'units_4770_',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'year_month',
 'status',
 'total_volume_produces',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia']

In [8]:
# Drop the non-beneficial columns.
avocado_df.drop(columns=["year_month"], inplace=True)
#avocado_df.head()

In [9]:
# Rename confusing columns
avocado_df = avocado_df.rename(columns={'total_volume':'total_volume_sold', 'total_volume_produces':'total_volume_produced', 'units_4770_':'units_4770' })
#avocado_df.head()

In [10]:
# Check datatypes
avocado_df.dtypes

geography                 object
date                      object
type                      object
avg_price                float64
total_volume_sold        float64
units_4046               float64
units_4225               float64
units_4770               float64
total_bags               float64
s_bags                   float64
l_bags                   float64
xl_bags                  float64
status                    object
total_volume_produced    float64
california               float64
chile                    float64
mexico                   float64
peru                     float64
columbia                 float64
dtype: object

In [11]:
# Convert date object to date and check datatype
import datetime
avocado_df.date = avocado_df.date.apply(pd.to_datetime)
print(avocado_df.date.dtypes)

datetime64[ns]


In [12]:
# Exctract the month
avocado_df['month'] = avocado_df.date.dt.month
#avocado_df.head()

In [13]:
# Exctract the week
avocado_df['week'] = avocado_df.date.dt.week
#avocado_df.head()

  


In [14]:
# Check the datatypes for month
print(avocado_df.week.dtypes)

int64


In [15]:
#Check the months
#avocado_df.month.value_counts()

#Check the weeks
#avocado_df.week.value_counts()

In [16]:
# Keep only California and its cities.
avocado_df_CA = avocado_df[avocado_df['geography'].isin(['California', 'Los Angeles', 'Sacramento', 'San Diego', 'San Francisco'])]
avocado_df_CA.head()

Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,xl_bags,status,total_volume_produced,california,chile,mexico,peru,columbia,month,week
602,California,2018-01-14,conventional,1.27,5927016.73,2098762.55,2356359.93,151045.33,1320848.92,1226559.09,...,68898.86,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
603,California,2018-01-14,organic,1.61,216681.04,52253.4,95353.95,0.0,69073.69,69054.51,...,0.0,actual,52758905.0,830821.0,449003.0,51479081.0,0.0,0.0,1,2
604,California,2018-01-21,conventional,1.12,6610010.64,2378177.09,2590301.7,122950.68,1518581.17,1431898.88,...,61880.7,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
605,California,2018-01-21,organic,1.69,181974.98,34411.37,64513.67,0.0,83049.94,83022.84,...,0.0,actual,54698266.0,2913663.0,445127.0,51339476.0,0.0,0.0,1,3
606,California,2018-01-28,conventional,1.16,6134040.58,2254631.8,2185495.65,121090.87,1572822.26,1480527.7,...,67748.62,actual,45299201.0,2698356.0,95491.0,42505354.0,0.0,0.0,1,4


In [17]:
print(avocado_df_CA.shape)

(1360, 21)


In [18]:
# Calculations
### sumup all units sold
### calculate ratio total volume sold vs total_volume_produced

### sumup all units sold
avocado_df_CA['total_units_sold'] = avocado_df_CA.iloc[:, 5:8].sum(axis=1)

### calculate ratio total volume sold vs total_volume_produced
avocado_df_CA['ratio_sold_vs_produced'] = avocado_df_CA['total_volume_sold']/avocado_df_CA['total_volume_produced']

### calculate ratio total sold and total volume produced in CA
avocado_df_CA['CA_ratio_sold_vs_produced'] = avocado_df_CA['total_volume_sold']/avocado_df_CA['california']

avocado_df_CA.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,geography,date,type,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,...,california,chile,mexico,peru,columbia,month,week,total_units_sold,ratio_sold_vs_produced,CA_ratio_sold_vs_produced
602,California,2018-01-14,conventional,1.27,5927016.73,2098762.55,2356359.93,151045.33,1320848.92,1226559.09,...,830821.0,449003.0,51479081.0,0.0,0.0,1,2,4606167.81,0.112342,7.133927
603,California,2018-01-14,organic,1.61,216681.04,52253.4,95353.95,0.0,69073.69,69054.51,...,830821.0,449003.0,51479081.0,0.0,0.0,1,2,147607.35,0.004107,0.260804
604,California,2018-01-21,conventional,1.12,6610010.64,2378177.09,2590301.7,122950.68,1518581.17,1431898.88,...,2913663.0,445127.0,51339476.0,0.0,0.0,1,3,5091429.47,0.120845,2.268626
605,California,2018-01-21,organic,1.69,181974.98,34411.37,64513.67,0.0,83049.94,83022.84,...,2913663.0,445127.0,51339476.0,0.0,0.0,1,3,98925.04,0.003327,0.062456
606,California,2018-01-28,conventional,1.16,6134040.58,2254631.8,2185495.65,121090.87,1572822.26,1480527.7,...,2698356.0,95491.0,42505354.0,0.0,0.0,1,4,4561218.32,0.135412,2.273251


In [19]:
# Clean the type_conventional.
## Count the values
avocado_df_CA["type"].value_counts()

organic          680
conventional     400
conventional     280
Name: type, dtype: int64

In [20]:
# Replace "conventional " with "conventional"
avocado_df_CA["type"] = avocado_df_CA["type"].str.replace("conventional ", "conventional")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
avocado_df_CA["type"].value_counts()

organic         680
conventional    680
Name: type, dtype: int64

In [22]:
# Plot the data with encoded date grouped by month. - check also for scatter plot
#avocado_df.groupby('month')['ratio CA sold vs prod'].mean().plot()

### DATA preprocessing for ML model

In [23]:
# Check datatypes
avocado_df_CA.dtypes

geography                            object
date                         datetime64[ns]
type                                 object
avg_price                           float64
total_volume_sold                   float64
units_4046                          float64
units_4225                          float64
units_4770                          float64
total_bags                          float64
s_bags                              float64
l_bags                              float64
xl_bags                             float64
status                               object
total_volume_produced               float64
california                          float64
chile                               float64
mexico                              float64
peru                                float64
columbia                            float64
month                                 int64
week                                  int64
total_units_sold                    float64
ratio_sold_vs_produced          

In [24]:
# Use get_dummies() to create variables for text features.
avocado_cat_encoded = pd.get_dummies(avocado_df_CA,  columns=["geography", "type", "status"])
avocado_cat_encoded.head()

Unnamed: 0,date,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,...,ratio_sold_vs_produced,CA_ratio_sold_vs_produced,geography_California,geography_Los Angeles,geography_Sacramento,geography_San Diego,geography_San Francisco,type_conventional,type_organic,status_actual
602,2018-01-14,1.27,5927016.73,2098762.55,2356359.93,151045.33,1320848.92,1226559.09,25390.97,68898.86,...,0.112342,7.133927,1,0,0,0,0,1,0,1
603,2018-01-14,1.61,216681.04,52253.4,95353.95,0.0,69073.69,69054.51,19.18,0.0,...,0.004107,0.260804,1,0,0,0,0,0,1,1
604,2018-01-21,1.12,6610010.64,2378177.09,2590301.7,122950.68,1518581.17,1431898.88,24801.59,61880.7,...,0.120845,2.268626,1,0,0,0,0,1,0,1
605,2018-01-21,1.69,181974.98,34411.37,64513.67,0.0,83049.94,83022.84,27.1,0.0,...,0.003327,0.062456,1,0,0,0,0,0,1,1
606,2018-01-28,1.16,6134040.58,2254631.8,2185495.65,121090.87,1572822.26,1480527.7,24545.94,67748.62,...,0.135412,2.273251,1,0,0,0,0,1,0,1


In [25]:
#List the columns
avocado_cat_encoded.columns.tolist()

['date',
 'avg_price',
 'total_volume_sold',
 'units_4046',
 'units_4225',
 'units_4770',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'total_volume_produced',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia',
 'month',
 'week',
 'total_units_sold',
 'ratio_sold_vs_produced',
 'CA_ratio_sold_vs_produced',
 'geography_California',
 'geography_Los Angeles',
 'geography_Sacramento',
 'geography_San Diego',
 'geography_San Francisco',
 'type_conventional',
 'type_organic',
 'status_actual']

In [26]:
# Drop the non-beneficial columns - drop "prod_total_volume" after the ratio calculations
avocado_cat_encoded.drop(columns=["date"], inplace=True)
avocado_cat_encoded.head()

Unnamed: 0,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,total_volume_produced,...,ratio_sold_vs_produced,CA_ratio_sold_vs_produced,geography_California,geography_Los Angeles,geography_Sacramento,geography_San Diego,geography_San Francisco,type_conventional,type_organic,status_actual
602,1.27,5927016.73,2098762.55,2356359.93,151045.33,1320848.92,1226559.09,25390.97,68898.86,52758905.0,...,0.112342,7.133927,1,0,0,0,0,1,0,1
603,1.61,216681.04,52253.4,95353.95,0.0,69073.69,69054.51,19.18,0.0,52758905.0,...,0.004107,0.260804,1,0,0,0,0,0,1,1
604,1.12,6610010.64,2378177.09,2590301.7,122950.68,1518581.17,1431898.88,24801.59,61880.7,54698266.0,...,0.120845,2.268626,1,0,0,0,0,1,0,1
605,1.69,181974.98,34411.37,64513.67,0.0,83049.94,83022.84,27.1,0.0,54698266.0,...,0.003327,0.062456,1,0,0,0,0,0,1,1
606,1.16,6134040.58,2254631.8,2185495.65,121090.87,1572822.26,1480527.7,24545.94,67748.62,45299201.0,...,0.135412,2.273251,1,0,0,0,0,1,0,1


In [27]:
# Determine the number of unique values in each column.
#avocado_cat_encoded.dtypes
#len(avocado_cat_encoded.nunique())

In [28]:
# # Define X values.
# X = avocado_cat_encoded[['total_volume_sold', 'month', 'week', 'total_volume_produced',
#  'geography_California',
#  'geography_Los Angeles',
#  'geography_Sacramento',
#  'geography_San Diego',
#  'geography_San Francisco',
#  'type_conventional',
#  'type_organic',
#  'status_actual']]

In [29]:
# # Define X values.
# X = avocado_cat_encoded[['month', 'week', 'total_volume_produced', 'california',
#  'units_4046',
#  'units_4225',
#  'units_4770',
#  's_bags',
#  'l_bags',
#  'xl_bags',                      
#  'geography_California',
#  'geography_Los Angeles',
#  'geography_Sacramento',
#  'geography_San Diego',
#  'geography_San Francisco',
#  'type_conventional',
#  'type_organic',]]

In [30]:
# Define X values.
X = avocado_cat_encoded[['month', 'week',
 'units_4046',
 'units_4225',
 'units_4770',
 's_bags',
 'l_bags',
 'xl_bags',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia',                    
 'geography_California',
 'geography_Los Angeles',
 'geography_Sacramento',
 'geography_San Diego',
 'geography_San Francisco',
 'type_conventional',
 'type_organic',]]

In [31]:
# Define y values.
y = avocado_cat_encoded.avg_price

In [32]:
# Split the to training and testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [33]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [34]:
# Fit the model.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [35]:
# Predict the model.
y_pred = model.predict(X_test)

In [36]:
# Mean squared error.
mean_squared_error(y_test, y_pred)

0.0360806966801605

In [37]:
# R2 score.
r2_score(y_test, y_pred)

0.7698826285173441

In [38]:
# Print coeficient and intercept
print(model.coef_)
print(model.intercept_)

[-5.20411919e-02  1.24465564e-02 -4.50509556e-09 -9.65536803e-08
  2.03289404e-07  1.38322390e-08 -2.78399022e-08  8.10588809e-08
 -1.18183447e-09  1.56549472e-08 -2.02584661e-09  1.78645520e-08
 -8.91300960e-07  9.72856897e-03 -1.30385086e-01 -1.15381410e-02
 -3.06464852e-02  1.62841144e-01 -2.73749802e-01  2.73749802e-01]
1.6029110102977273


In [39]:
# Print the scores
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_test,y_pred))

R2 score : 0.77
Mean squared error: 0.04


### Interpretation of the results

**Intro**

"R squared", is the proportion of the variance in the dependent variable that is predictable from the independent variable(s).

Mean square error (MSE) is the average of the square of the errors. The larger the number the larger the error.

There is no correct value for MSE. Simply put, the lower the value the better. Since there is no correct answer, the MSE’s basic value is in selecting one prediction model over another.
Similarly, there is also no correct answer as to what R2 should be. 100% means perfect correlation. Yet, there are models with a low R2 that are still good models (Source: https://www.bmc.com/blogs/mean-squared-error-r2-and-variance-in-regression-analysis/ )

Our dataset can be:
Overfit means the model is subject to outliers and noise.
Underfit means the model could look at other inputs (i.e., additional features).

**Our Results**

R2 score is on the higher side and tells that our model does quite well for the features that we used and that there is correlation amongst variables. 
Mean square error (MSE) is low, what tells that the error between observed and predicted is low.

### Random Forest Regressor

#### Sliding Window or moving average, linear decay or exponential decay
https://www.coursera.org/lecture/design-thinking-predictive-analytics-data-products/autoregression-khn7z
weights of the points (the most recent is the most important)

    - does the data needs to be in certain order?
    - OneHotEncoding date (by month and week)
    - different regions
    - example on the dataset?

#### Time series
(all of these models are assigning the weights to previous values using some predifined scheme, why not just learn the weights?)

What weights most efficiently predict the price?
    - Is it yesterdays's price
    - Is it the month, week, year?
    - Is it combination of the region and time?
    - Is it production? Where?



In [40]:
# Initial imports.
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

In [41]:
# Train the model.
reg = RandomForestRegressor(n_estimators = 100, random_state = 0)
reg.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [42]:
y_pred = reg.predict(X_test)
#y_pred

In [43]:
# Print result mse and rmse (root mean squared error)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

0.12968206291224885

In [44]:
# List the features sorted in descending order by feature importance
feature_importance = sorted(zip(reg.feature_importances_, X.columns), reverse=True)
feature_importance

[(0.5637810611696684, 'units_4046'),
 (0.0711039144634098, 'peru'),
 (0.06100512903439189, 's_bags'),
 (0.05013624027575877, 'units_4770'),
 (0.03985786164167764, 'mexico'),
 (0.03557995267174179, 'units_4225'),
 (0.02967210663054804, 'l_bags'),
 (0.028191454916212326, 'geography_San Francisco'),
 (0.024030617965027482, 'type_organic'),
 (0.02004058722388513, 'week'),
 (0.019530541180412413, 'california'),
 (0.018284274497766914, 'type_conventional'),
 (0.01583200510336997, 'xl_bags'),
 (0.009185638955690264, 'chile'),
 (0.004876022628394658, 'month'),
 (0.002963473865847431, 'geography_Los Angeles'),
 (0.0024731326217405465, 'columbia'),
 (0.0014615985810106388, 'geography_San Diego'),
 (0.001114877283675617, 'geography_Sacramento'),
 (0.0008795092897701962, 'geography_California')]

### Random Forest Classifier

In [45]:
# Import Dependencies
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix

In [46]:
# Descriptive statistics
avocado_cat_encoded.avg_price.describe()

count    1360.000000
mean        1.546941
std         0.379179
min         0.670000
25%         1.200000
50%         1.580000
75%         1.820000
max         2.780000
Name: avg_price, dtype: float64

In [62]:
# Categorize y-values
labels = ["low", "low-medium", "high-medium","high"]
x = avocado_cat_encoded['avg_price']

#Categorize prices
avocado_cat_encoded["price_category"] = pd.qcut(x, 4, labels=labels)
avocado_cat_encoded.head()

Unnamed: 0,avg_price,total_volume_sold,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,total_volume_produced,...,CA_ratio_sold_vs_produced,geography_California,geography_Los Angeles,geography_Sacramento,geography_San Diego,geography_San Francisco,type_conventional,type_organic,status_actual,price_category
602,1.27,5927016.73,2098762.55,2356359.93,151045.33,1320848.92,1226559.09,25390.97,68898.86,52758905.0,...,7.133927,1,0,0,0,0,1,0,1,low-medium
603,1.61,216681.04,52253.4,95353.95,0.0,69073.69,69054.51,19.18,0.0,52758905.0,...,0.260804,1,0,0,0,0,0,1,1,high-medium
604,1.12,6610010.64,2378177.09,2590301.7,122950.68,1518581.17,1431898.88,24801.59,61880.7,54698266.0,...,2.268626,1,0,0,0,0,1,0,1,low
605,1.69,181974.98,34411.37,64513.67,0.0,83049.94,83022.84,27.1,0.0,54698266.0,...,0.062456,1,0,0,0,0,0,1,1,high-medium
606,1.16,6134040.58,2254631.8,2185495.65,121090.87,1572822.26,1480527.7,24545.94,67748.62,45299201.0,...,2.273251,1,0,0,0,0,1,0,1,low


In [63]:
# Check bin balance
avocado_cat_encoded["price_category"].value_counts()

high-medium    349
low-medium     341
low            341
high           329
Name: price_category, dtype: int64

In [64]:
# Assigning new X-values and y-values
y_new = avocado_cat_encoded["price_category"]
X_new = avocado_cat_encoded[['month', 'week',
 'units_4046',
 'units_4225',
 'units_4770',
 's_bags',
 'l_bags',
 'xl_bags',
 'california',
 'chile',
 'mexico',
 'peru',
 'columbia',                    
 'geography_California',
 'geography_Los Angeles',
 'geography_Sacramento',
 'geography_San Diego',
 'geography_San Francisco',
 'type_conventional',
 'type_organic',]]

In [65]:
# Split the to training and testing.
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.25, random_state=42)

In [66]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)  

BalancedRandomForestClassifier(random_state=1)

In [67]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.7575939492606159

In [68]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.82      0.81      0.94      0.82      0.87      0.75        91
high-medium       0.63      0.73      0.87      0.68      0.80      0.63        78
        low       0.87      0.84      0.96      0.86      0.90      0.80        90
 low-medium       0.71      0.64      0.92      0.68      0.77      0.57        81

avg / total       0.77      0.76      0.92      0.76      0.84      0.69       340



In [69]:
# List the features sorted in descending order by feature importance
feature_importance = sorted(zip(brf.feature_importances_, X.columns), reverse=True)
for i in feature_importance:
    print('{} : ({})'.format(i[1], i[0]))

units_4046 : (0.1205602308078158)
units_4225 : (0.09716476214127649)
s_bags : (0.08956106171231079)
units_4770 : (0.08644585985390954)
l_bags : (0.07883988671854453)
mexico : (0.07397554296908587)
peru : (0.06444980366192571)
california : (0.062306759077035895)
xl_bags : (0.061149876929641585)
week : (0.05664376346555704)
type_conventional : (0.039792179339880704)
type_organic : (0.036646481128714346)
month : (0.03329813948521519)
chile : (0.030279288004679966)
geography_San Francisco : (0.02636730507254257)
geography_Los Angeles : (0.010153984922442072)
columbia : (0.009541064563511763)
geography_San Diego : (0.009280367016304473)
geography_Sacramento : (0.008613462741258472)
geography_California : (0.004930180388347205)


## Notes
Standard scaler doesn't do much change on the results?
Better to use MinMix Scaler?

using separate values imporved the model quite a bit, rather using totals.


From the feature importances we can see what features have the most weight on decisions trees.
Grouping by: Keeping only the features (geography) that we want to predict the prices on and the one that has the weight.
For example, keep San Francisco - predicting is this price form SF or outside SF.

#### Interpretaion of the results
The RMSE is the square root of the variance of the residuals. It indicates the absolute fit of the model to the data–how close the observed data points are to the model’s predicted values. Whereas R-squared is a relative measure of fit, RMSE is an absolute measure of fit. As the square root of a variance, RMSE can be interpreted as the standard deviation of the unexplained variance, and has the useful property of being in the same units as the response variable. Lower values of RMSE indicate better fit. RMSE is a good measure of how accurately the model predicts the response, and it is the most important criterion for fit if the main purpose of the model is prediction.

### Neural Network ML model - in progress (Skeleton Code)

In [47]:
# # Import dependencies.
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
# import pandas as pd
# import tensorflow as tf
# from pathlib import Path

# import warnings
# warnings.filterwarnings('ignore')

# # Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

In [48]:
# ## Visualize the value counts of avocado_mock_df['TBD']
# avocado_df = avocado_df['TBD'].value_counts()
# #ca_prod.plot(kind='bar')

In [49]:
# # Determine which values to replace if counts are less than ...?
# replace_ca_prod = list(ca_prod[ca_prod < 500000].index)

# ## Replace in dataframe
# for ca_p in replace_ca_prod:
#     avocado_mock_df['TBD'] = avocado_df['TBD'].replace(ca_p,"Other")
    
# # Check to make sure binning was successful
# avocado_df['TBD'].value_counts()

In [50]:
# # Generate our categorical variable lists
# avocado_cat = avocado_df.dtypes[avocado_mock_df.dtypes == "object"].index.tolist()

In [51]:
# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(avocado_mock_df[avocado_cat]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(avocado_cat)
# encode_df.head()

In [52]:
# # Merge one-hot encoded features and drop the originals
# avocado_df = avocado_df.merge(encode_df,left_index=True, right_index=True)
# avocado_df = avocado_df.drop(avocado_cat,1)
# avocado_df.head()

In [53]:
# # Split our preprocessed data into our features and target arrays
# y = avocado_df["TBD"].values
# X = avocado_df.drop(["TBD"],1).values

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [54]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

### Compile, Train and Evaluate the Model

In [55]:
# # Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# number_input_features = len(X_train[0])
# hidden_nodes_layer1 = 210
# hidden_nodes_layer2 = 90
# #hidden_nodes_layer3 = 40

# nn = tf.keras.models.Sequential()

# # First hidden layer
# nn.add(
#     tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
# )

# # Second hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# # Third hidden layer
# #nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# # Output layer
# nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Check the structure of the model
# nn.summary()

In [56]:
# # Define the checkpoint path and filenames
# os.makedirs("checkpoints/",exist_ok=True)
# checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [57]:
# # Compile the model
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [58]:
# # Create a callback that saves the model's weights every 5 epochs
# cp_callback = ModelCheckpoint(
#     filepath=checkpoint_path,
#     verbose=1,
#     save_weights_only=True,
#     save_freq=1000)

In [59]:
# # Train the model
# fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

In [60]:
# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [61]:
# Export our model to HDF5 file
# nn.save("Avocado_price.h5")