In [1]:
# import dependencies
import pandas as pd
import numpy as np

# for server connection
from sqlalchemy import create_engine
from config import db_password

# for machine learning
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier


### DB connection string

In [2]:
# local server connection string
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/avocados"

# create db engine
engine = create_engine(db_string)

### preprocessing prices_clim

In [3]:
# Load the avocado dataset from pgAdmin
prices_clim = pd.read_sql_table('prices_clim', engine)
prices_clim.head()

Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,...,cdd,sp01,sp02,sp03,sp06,sp09,sp12,sp24,tmin,tmax
0,2017-12-01,Roanoke,2017-12-10,conventional,1.05,137960.99,38481.95,36370.51,73.18,63035.35,...,3,-1.95,-0.97,-1.22,-1.27,-0.95,1.02,1.12,35.2,59.7
1,2017-05-01,Midsouth,2017-05-21,conventional,1.34,3232304.82,740206.13,1308057.07,32390.63,1151650.99,...,46,-0.66,0.62,0.14,1.68,1.8,1.77,1.52,49.7,77.2
2,2020-01-01,Spokane,2020-01-12,conventional,1.11,121910.14,17230.22,16545.99,113.3,88020.63,...,0,-0.74,-0.1,-0.36,-0.59,-0.27,0.74,0.18,35.6,55.1
3,2019-07-01,Seattle,2019-07-21,conventional,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,...,247,-0.75,-1.21,1.57,1.57,1.11,0.95,0.02,61.1,90.0
4,2020-07-01,GrandRapids,2020-07-12,organic,1.63,8953.35,76.77,2459.61,0.0,6416.97,...,254,-1.17,-1.06,0.19,-0.83,-0.8,-0.99,0.0,61.3,92.0


In [4]:
# remove spaces
prices_clim.type = prices_clim.type.str.replace(' ', '')
prices_clim["type"].value_counts()

conventional    10206
organic         10204
Name: type, dtype: int64

In [5]:
# check for any nulls
prices_clim.isnull().values.any()

False

In [6]:
# shape of df
prices_clim.shape

(20410, 29)

In [7]:
# check datatypes
prices_clim.dtypes

year_month             datetime64[ns]
geography                      object
date                   datetime64[ns]
type                           object
avg_price                     float64
prices_total_volume           float64
units_4046                    float64
units_4225                    float64
units_4770                    float64
total_bags                    float64
s_bags                        float64
l_bags                        float64
xl_bags                       float64
pcp                           float64
tavg                          float64
pdsi                          float64
phdi                          float64
zndx                          float64
pmdi                          float64
cdd                             int64
sp01                          float64
sp02                          float64
sp03                          float64
sp06                          float64
sp09                          float64
sp12                          float64
sp24        

In [8]:
# split dates
prices_clim["year"] = pd.DatetimeIndex(prices_clim['date']).year
prices_clim["month"] = pd.DatetimeIndex(prices_clim['date']).month
prices_clim["week"] = pd.DatetimeIndex(prices_clim['date']).weekday
prices_clim.head()

Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,...,sp03,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week
0,2017-12-01,Roanoke,2017-12-10,conventional,1.05,137960.99,38481.95,36370.51,73.18,63035.35,...,-1.22,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6
1,2017-05-01,Midsouth,2017-05-21,conventional,1.34,3232304.82,740206.13,1308057.07,32390.63,1151650.99,...,0.14,1.68,1.8,1.77,1.52,49.7,77.2,2017,5,6
2,2020-01-01,Spokane,2020-01-12,conventional,1.11,121910.14,17230.22,16545.99,113.3,88020.63,...,-0.36,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6
3,2019-07-01,Seattle,2019-07-21,conventional,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,...,1.57,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6
4,2020-07-01,GrandRapids,2020-07-12,organic,1.63,8953.35,76.77,2459.61,0.0,6416.97,...,0.19,-0.83,-0.8,-0.99,0.0,61.3,92.0,2020,7,6


In [9]:
# view all columns
prices_clim.columns

Index(['year_month', 'geography', 'date', 'type', 'avg_price',
       'prices_total_volume', 'units_4046', 'units_4225', 'units_4770',
       'total_bags', 's_bags', 'l_bags', 'xl_bags', 'pcp', 'tavg', 'pdsi',
       'phdi', 'zndx', 'pmdi', 'cdd', 'sp01', 'sp02', 'sp03', 'sp06', 'sp09',
       'sp12', 'sp24', 'tmin', 'tmax', 'year', 'month', 'week'],
      dtype='object')

In [10]:
# drop columns
test1 = prices_clim.drop(columns=["year_month", "date"])
test2 = prices_clim.drop(columns=["year_month", "date", "total_bags"])

test1

Unnamed: 0,geography,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,...,sp03,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week
0,Roanoke,conventional,1.05,137960.99,38481.95,36370.51,73.18,63035.35,57738.32,5284.43,...,-1.22,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6
1,Midsouth,conventional,1.34,3232304.82,740206.13,1308057.07,32390.63,1151650.99,953595.56,193338.02,...,0.14,1.68,1.80,1.77,1.52,49.7,77.2,2017,5,6
2,Spokane,conventional,1.11,121910.14,17230.22,16545.99,113.30,88020.63,46850.74,41117.35,...,-0.36,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6
3,Seattle,conventional,2.02,465059.00,71979.03,116438.62,1155.43,275485.92,134101.42,140683.16,...,1.57,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6
4,GrandRapids,organic,1.63,8953.35,76.77,2459.61,0.00,6416.97,6318.55,98.42,...,0.19,-0.83,-0.80,-0.99,0.00,61.3,92.0,2020,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20405,Houston,conventional,0.70,1258481.11,789020.20,215934.35,58981.08,194545.48,74838.47,119674.39,...,1.43,1.77,1.69,1.29,0.70,34.0,50.3,2017,1,6
20406,Sacramento,conventional,1.36,401513.00,106684.00,169023.00,4611.00,121194.00,87382.00,25473.00,...,-0.93,-1.25,0.04,0.81,-0.15,40.4,65.6,2019,11,6
20407,Houston,organic,1.81,26056.01,5636.21,0.00,0.00,20419.80,20419.80,0.00,...,-1.25,0.74,1.10,0.96,0.01,62.0,91.5,2019,8,6
20408,Louisville,organic,1.60,3328.48,4.53,596.38,0.00,2727.57,2469.61,257.96,...,0.18,0.74,1.40,1.03,0.03,55.3,82.5,2019,9,6


In [11]:
prices_clim.nunique()

year_month                45
geography                 54
date                     189
type                       2
avg_price                255
prices_total_volume    20396
units_4046             19748
units_4225             20011
units_4770             12972
total_bags             20388
s_bags                 20360
l_bags                 18447
xl_bags                 8311
pcp                       41
tavg                      43
pdsi                      44
phdi                      44
zndx                      43
pmdi                      44
cdd                       31
sp01                      43
sp02                      40
sp03                      40
sp06                      42
sp09                      41
sp12                      36
sp24                      36
tmin                      44
tmax                      43
year                       4
month                     12
week                       2
dtype: int64

### prices_clim-test1

In [12]:
# Generate our categorical variable lists
category = test1.dtypes[test1.dtypes == "object"].index.tolist()

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(test1[["geography", "type"]]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(category)
encode_df.head()

Unnamed: 0,geography_Albany,geography_Atlanta,geography_Baltimore/Washington,geography_Boise,geography_Boston,geography_Buffalo/Rochester,geography_California,geography_Charlotte,geography_Chicago,geography_Cincinnati/Dayton,...,geography_Southeast,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_TotalUS,geography_West,geography_WestTex/NewMexico,type_conventional,type_organic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# Merge one-hot encoded features and drop the originals
prices_clim_encoded1 = test1.merge(encode_df,left_index=True, right_index=True)
prices_clim_encoded1 = test1.drop(category,1)
prices_clim_encoded1.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,pcp,...,sp03,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week
0,1.05,137960.99,38481.95,36370.51,73.18,63035.35,57738.32,5284.43,12.6,0.3,...,-1.22,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6
1,1.34,3232304.82,740206.13,1308057.07,32390.63,1151650.99,953595.56,193338.02,4717.41,0.43,...,0.14,1.68,1.8,1.77,1.52,49.7,77.2,2017,5,6
2,1.11,121910.14,17230.22,16545.99,113.3,88020.63,46850.74,41117.35,52.54,2.06,...,-0.36,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6
3,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,134101.42,140683.16,701.34,0.08,...,1.57,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6
4,1.63,8953.35,76.77,2459.61,0.0,6416.97,6318.55,98.42,0.0,0.05,...,0.19,-0.83,-0.8,-0.99,0.0,61.3,92.0,2020,7,6


In [15]:
# Categorize y-values
labels = ["low", "medium", "high"]
y_categories = prices_clim_encoded1['avg_price']

#Categorize prices
prices_clim_encoded1["price_category"] = pd.qcut(y_categories, 3, labels=labels)
prices_clim_encoded1.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,pcp,...,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week,price_category
0,1.05,137960.99,38481.95,36370.51,73.18,63035.35,57738.32,5284.43,12.6,0.3,...,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6,low
1,1.34,3232304.82,740206.13,1308057.07,32390.63,1151650.99,953595.56,193338.02,4717.41,0.43,...,1.68,1.8,1.77,1.52,49.7,77.2,2017,5,6,medium
2,1.11,121910.14,17230.22,16545.99,113.3,88020.63,46850.74,41117.35,52.54,2.06,...,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6,low
3,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,134101.42,140683.16,701.34,0.08,...,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6,high
4,1.63,8953.35,76.77,2459.61,0.0,6416.97,6318.55,98.42,0.0,0.05,...,-0.83,-0.8,-0.99,0.0,61.3,92.0,2020,7,6,high


In [16]:
# Check bin balance
prices_clim_encoded1["price_category"].value_counts()

low       6906
high      6782
medium    6722
Name: price_category, dtype: int64

In [17]:
# Split our preprocessed data into our features and target arrays
y = prices_clim_encoded1.price_category
X = prices_clim_encoded1.drop(columns=["avg_price", "price_category"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=128, random_state=42)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=42)

In [20]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8296393727651991

In [21]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1520,   14,  162],
       [  19, 1480,  228],
       [ 241,  203, 1236]], dtype=int64)

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.85      0.90      0.92      0.87      0.91      0.83      1696
        low       0.87      0.86      0.94      0.86      0.90      0.80      1727
     medium       0.76      0.74      0.89      0.75      0.81      0.64      1680

avg / total       0.83      0.83      0.92      0.83      0.87      0.76      5103



In [23]:
# List the features sorted in descending order by feature importance
sorted_importances = sorted(zip(brf.feature_importances_, X.columns), reverse=True)

for i in sorted_importances:
    print("{}: ({})".format(i[1],i[0]))

units_4046: (0.14608416218279888)
units_4225: (0.1233046333874485)
prices_total_volume: (0.10685141244946653)
total_bags: (0.09859190648037154)
s_bags: (0.09239380513095395)
l_bags: (0.08942954245752546)
units_4770: (0.06780750588084215)
xl_bags: (0.06488521140202835)
sp09: (0.018306443678371694)
sp24: (0.01802101695626619)
sp12: (0.015494380214109774)
tmin: (0.015316908874969977)
phdi: (0.01360232854370257)
pdsi: (0.012968952585218949)
cdd: (0.012936869567386912)
pmdi: (0.012253499501702266)
tavg: (0.011722873902348422)
tmax: (0.010574343679332188)
sp06: (0.010420674524945108)
pcp: (0.009130240172975843)
zndx: (0.008921337368962504)
sp03: (0.008727022928528437)
sp02: (0.008704979352762912)
month: (0.00866659192101468)
sp01: (0.007669247084665013)
year: (0.0059849118267586085)
week: (0.0012291979445425327)


### prices_clim-test2

In [24]:
# Generate our categorical variable lists
category = test2.dtypes[test2.dtypes == "object"].index.tolist()

In [25]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(test2[["geography", "type"]]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(category)
encode_df.head()

Unnamed: 0,geography_Albany,geography_Atlanta,geography_Baltimore/Washington,geography_Boise,geography_Boston,geography_Buffalo/Rochester,geography_California,geography_Charlotte,geography_Chicago,geography_Cincinnati/Dayton,...,geography_Southeast,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_TotalUS,geography_West,geography_WestTex/NewMexico,type_conventional,type_organic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
# Merge one-hot encoded features and drop the originals
prices_clim_encoded2 = test2.merge(encode_df,left_index=True, right_index=True)
prices_clim_encoded2 = test2.drop(category,1)
prices_clim_encoded2.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,pcp,tavg,...,sp03,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week
0,1.05,137960.99,38481.95,36370.51,73.18,57738.32,5284.43,12.6,0.3,47.5,...,-1.22,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6
1,1.34,3232304.82,740206.13,1308057.07,32390.63,953595.56,193338.02,4717.41,0.43,63.4,...,0.14,1.68,1.8,1.77,1.52,49.7,77.2,2017,5,6
2,1.11,121910.14,17230.22,16545.99,113.3,46850.74,41117.35,52.54,2.06,45.4,...,-0.36,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6
3,2.02,465059.0,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,0.08,75.6,...,1.57,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6
4,1.63,8953.35,76.77,2459.61,0.0,6318.55,98.42,0.0,0.05,76.7,...,0.19,-0.83,-0.8,-0.99,0.0,61.3,92.0,2020,7,6


In [27]:
# Categorize y-values
labels = ["low", "medium", "high"]
y_categories = prices_clim_encoded2['avg_price']

#Categorize prices
prices_clim_encoded2["price_category"] = pd.qcut(y_categories, 3, labels=labels)
prices_clim_encoded2.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,pcp,tavg,...,sp06,sp09,sp12,sp24,tmin,tmax,year,month,week,price_category
0,1.05,137960.99,38481.95,36370.51,73.18,57738.32,5284.43,12.6,0.3,47.5,...,-1.27,-0.95,1.02,1.12,35.2,59.7,2017,12,6,low
1,1.34,3232304.82,740206.13,1308057.07,32390.63,953595.56,193338.02,4717.41,0.43,63.4,...,1.68,1.8,1.77,1.52,49.7,77.2,2017,5,6,medium
2,1.11,121910.14,17230.22,16545.99,113.3,46850.74,41117.35,52.54,2.06,45.4,...,-0.59,-0.27,0.74,0.18,35.6,55.1,2020,1,6,low
3,2.02,465059.0,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,0.08,75.6,...,1.57,1.11,0.95,0.02,61.1,90.0,2019,7,6,high
4,1.63,8953.35,76.77,2459.61,0.0,6318.55,98.42,0.0,0.05,76.7,...,-0.83,-0.8,-0.99,0.0,61.3,92.0,2020,7,6,high


In [28]:
# Check bin balance
prices_clim_encoded2["price_category"].value_counts()

low       6906
high      6782
medium    6722
Name: price_category, dtype: int64

In [29]:
# Split our preprocessed data into our features and target arrays
y = prices_clim_encoded2.price_category
X = prices_clim_encoded2.drop(columns=["avg_price", "price_category"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [30]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [31]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=128, random_state=42)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=42)

In [32]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8280257194067132

In [33]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1511,   15,  170],
       [  19, 1488,  220],
       [ 251,  200, 1229]], dtype=int64)

In [34]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.85      0.89      0.92      0.87      0.91      0.82      1696
        low       0.87      0.86      0.94      0.87      0.90      0.80      1727
     medium       0.76      0.73      0.89      0.75      0.81      0.64      1680

avg / total       0.83      0.83      0.91      0.83      0.87      0.75      5103



In [35]:
# List the features sorted in descending order by feature importance
sorted_importances = sorted(zip(brf.feature_importances_, X.columns), reverse=True)

for i in sorted_importances:
    print("{}: ({})".format(i[1],i[0]))

units_4046: (0.1536398684319572)
units_4225: (0.13704817913542)
prices_total_volume: (0.12234818147852781)
l_bags: (0.11263752059000544)
s_bags: (0.11113087877409254)
xl_bags: (0.07999180986370708)
units_4770: (0.07945303411234551)
sp09: (0.01948008184261914)
sp24: (0.018146843293573748)
sp12: (0.014831409869735497)
phdi: (0.013406340602672201)
tmin: (0.01303328650868137)
pmdi: (0.012068665534840614)
cdd: (0.011753438943027881)
tavg: (0.011225262039669883)
pdsi: (0.011188644706570196)
tmax: (0.010983423504462497)
sp06: (0.009977455466839447)
pcp: (0.009200045251359739)
month: (0.008919668524123426)
sp02: (0.008213724043402234)
sp03: (0.008170042825561002)
zndx: (0.007956806493999807)
sp01: (0.007384457214980312)
year: (0.006353980957525108)
week: (0.0014569499903003204)


### preprocessing prices_prod

In [36]:
# Load the avocado dataset from pgAdmin
prices_prod = pd.read_sql_table('prices_prod', engine)
prices_prod.head()

Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,s_bags,l_bags,xl_bags,status,prod_total_volume,california,chile,mexico,peru,colombia
0,2020-01-01,Spokane,2020-01-12,conventional,1.11,121910.14,17230.22,16545.99,113.3,88020.63,46850.74,41117.35,52.54,actual,65307572,319866,177780,64809925,0,0
1,2019-07-01,Seattle,2019-07-21,conventional,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,134101.42,140683.16,701.34,actual,51585051,9892498,0,27328014,14364539,0
2,2020-07-01,GrandRapids,2020-07-12,organic,1.63,8953.35,76.77,2459.61,0.0,6416.97,6318.55,98.42,0.0,actual,52258875,11391217,0,25273369,15360100,234188
3,2019-05-01,Buffalo/Rochester,2019-05-26,organic,1.51,7998.01,451.21,199.51,0.0,7347.29,2283.06,5064.23,0.0,actual,53448347,11609804,0,35628132,6210411,0
4,2018-10-01,NorthernNewEngland,2018-10-14,organic,1.44,33042.22,0.0,3381.87,0.0,29660.35,23200.12,6460.23,0.0,actual,43609892,798732,3241640,39427297,142223,0


In [37]:
# remove spaces
prices_prod.type = prices_prod.type.str.replace(' ', '')
prices_prod["type"].value_counts()

conventional    7236
organic         7236
Name: type, dtype: int64

In [38]:
# check for any nulls
prices_prod.isnull().values.any()

False

In [39]:
# shape of df
prices_prod.shape

(14472, 20)

In [40]:
# check datatypes
prices_prod.dtypes

year_month             datetime64[ns]
geography                      object
date                   datetime64[ns]
type                           object
avg_price                     float64
prices_total_volume           float64
units_4046                    float64
units_4225                    float64
units_4770                    float64
total_bags                    float64
s_bags                        float64
l_bags                        float64
xl_bags                       float64
status                         object
prod_total_volume               int64
california                      int64
chile                           int64
mexico                          int64
peru                            int64
colombia                        int64
dtype: object

In [41]:
# split dates
prices_prod["year"] = pd.DatetimeIndex(prices_prod['date']).year
prices_prod["month"] = pd.DatetimeIndex(prices_prod['date']).month
prices_prod["week"] = pd.DatetimeIndex(prices_prod['date']).weekday
prices_prod.head()

Unnamed: 0,year_month,geography,date,type,avg_price,prices_total_volume,units_4046,units_4225,units_4770,total_bags,...,status,prod_total_volume,california,chile,mexico,peru,colombia,year,month,week
0,2020-01-01,Spokane,2020-01-12,conventional,1.11,121910.14,17230.22,16545.99,113.3,88020.63,...,actual,65307572,319866,177780,64809925,0,0,2020,1,6
1,2019-07-01,Seattle,2019-07-21,conventional,2.02,465059.0,71979.03,116438.62,1155.43,275485.92,...,actual,51585051,9892498,0,27328014,14364539,0,2019,7,6
2,2020-07-01,GrandRapids,2020-07-12,organic,1.63,8953.35,76.77,2459.61,0.0,6416.97,...,actual,52258875,11391217,0,25273369,15360100,234188,2020,7,6
3,2019-05-01,Buffalo/Rochester,2019-05-26,organic,1.51,7998.01,451.21,199.51,0.0,7347.29,...,actual,53448347,11609804,0,35628132,6210411,0,2019,5,6
4,2018-10-01,NorthernNewEngland,2018-10-14,organic,1.44,33042.22,0.0,3381.87,0.0,29660.35,...,actual,43609892,798732,3241640,39427297,142223,0,2018,10,6


In [42]:
# view all columns
prices_prod.columns

Index(['year_month', 'geography', 'date', 'type', 'avg_price',
       'prices_total_volume', 'units_4046', 'units_4225', 'units_4770',
       'total_bags', 's_bags', 'l_bags', 'xl_bags', 'status',
       'prod_total_volume', 'california', 'chile', 'mexico', 'peru',
       'colombia', 'year', 'month', 'week'],
      dtype='object')

In [43]:
prices_prod.nunique()

year_month                33
geography                 54
date                     134
type                       2
avg_price                216
prices_total_volume    14463
units_4046             14051
units_4225             14132
units_4770              9256
total_bags             14460
s_bags                 14456
l_bags                 13243
xl_bags                 6310
status                     1
prod_total_volume        134
california               119
chile                     52
mexico                   134
peru                      66
colombia                  12
year                       3
month                     12
week                       1
dtype: int64

In [44]:
# drop columns
test1 = prices_prod.drop(columns=["year_month", "date", "status", "total_bags"])
test2 = prices_prod.drop(columns=["year_month", "date", "status", "prices_total_volume", "total_bags",
                                 "prod_total_volume"])

test2

Unnamed: 0,geography,type,avg_price,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,california,chile,mexico,peru,colombia,year,month,week
0,Spokane,conventional,1.11,17230.22,16545.99,113.30,46850.74,41117.35,52.54,319866,177780,64809925,0,0,2020,1,6
1,Seattle,conventional,2.02,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,9892498,0,27328014,14364539,0,2019,7,6
2,GrandRapids,organic,1.63,76.77,2459.61,0.00,6318.55,98.42,0.00,11391217,0,25273369,15360100,234188,2020,7,6
3,Buffalo/Rochester,organic,1.51,451.21,199.51,0.00,2283.06,5064.23,0.00,11609804,0,35628132,6210411,0,2019,5,6
4,NorthernNewEngland,organic,1.44,0.00,3381.87,0.00,23200.12,6460.23,0.00,798732,3241640,39427297,142223,0,2018,10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14467,SanFrancisco,organic,1.87,8196.29,7722.20,0.00,11999.59,6.60,0.00,11103220,0,60654534,0,0,2019,4,6
14468,Sacramento,conventional,1.36,106684.00,169023.00,4611.00,87382.00,25473.00,8339.00,0,234344,39363186,0,0,2019,11,6
14469,Houston,organic,1.81,5636.21,0.00,0.00,20419.80,0.00,0.00,4205974,235947,18955232,7490420,0,2019,8,6
14470,Louisville,organic,1.60,4.53,596.38,0.00,2469.61,257.96,0.00,2233704,1684129,29806714,3365948,0,2019,9,6


### prices_prod-test1

In [45]:
# Generate our categorical variable lists
category = test1.dtypes[test1.dtypes == "object"].index.tolist()

In [46]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(test1[["geography", "type"]]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(category)
encode_df.head()

Unnamed: 0,geography_Albany,geography_Atlanta,geography_Baltimore/Washington,geography_Boise,geography_Boston,geography_Buffalo/Rochester,geography_California,geography_Charlotte,geography_Chicago,geography_Cincinnati/Dayton,...,geography_Southeast,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_TotalUS,geography_West,geography_WestTex/NewMexico,type_conventional,type_organic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [47]:
# Merge one-hot encoded features and drop the originals
prices_prod_encoded1 = test1.merge(encode_df,left_index=True, right_index=True)
prices_prod_encoded1 = test1.drop(category,1)
prices_prod_encoded1.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,prod_total_volume,california,chile,mexico,peru,colombia,year,month,week
0,1.11,121910.14,17230.22,16545.99,113.3,46850.74,41117.35,52.54,65307572,319866,177780,64809925,0,0,2020,1,6
1,2.02,465059.0,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,51585051,9892498,0,27328014,14364539,0,2019,7,6
2,1.63,8953.35,76.77,2459.61,0.0,6318.55,98.42,0.0,52258875,11391217,0,25273369,15360100,234188,2020,7,6
3,1.51,7998.01,451.21,199.51,0.0,2283.06,5064.23,0.0,53448347,11609804,0,35628132,6210411,0,2019,5,6
4,1.44,33042.22,0.0,3381.87,0.0,23200.12,6460.23,0.0,43609892,798732,3241640,39427297,142223,0,2018,10,6


In [48]:
# Categorize y-values
labels = ["low", "medium", "high"]
y_categories = prices_prod_encoded1['avg_price']

#Categorize prices
prices_prod_encoded1["price_category"] = pd.qcut(y_categories, 3, labels=labels)
prices_prod_encoded1.head()

Unnamed: 0,avg_price,prices_total_volume,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,prod_total_volume,california,chile,mexico,peru,colombia,year,month,week,price_category
0,1.11,121910.14,17230.22,16545.99,113.3,46850.74,41117.35,52.54,65307572,319866,177780,64809925,0,0,2020,1,6,low
1,2.02,465059.0,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,51585051,9892498,0,27328014,14364539,0,2019,7,6,high
2,1.63,8953.35,76.77,2459.61,0.0,6318.55,98.42,0.0,52258875,11391217,0,25273369,15360100,234188,2020,7,6,high
3,1.51,7998.01,451.21,199.51,0.0,2283.06,5064.23,0.0,53448347,11609804,0,35628132,6210411,0,2019,5,6,high
4,1.44,33042.22,0.0,3381.87,0.0,23200.12,6460.23,0.0,43609892,798732,3241640,39427297,142223,0,2018,10,6,medium


In [49]:
# Check bin balance
prices_prod_encoded1["price_category"].value_counts()

low       5007
medium    4771
high      4694
Name: price_category, dtype: int64

In [50]:
# Split our preprocessed data into our features and target arrays
y = prices_prod_encoded1.price_category
X = prices_prod_encoded1.drop(columns=["avg_price", "price_category"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [51]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [52]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=128, random_state=42)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=42)

In [53]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8209776775840428

In [54]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1010,    7,  156],
       [  20, 1081,  151],
       [ 172,  140,  881]], dtype=int64)

In [55]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.84      0.86      0.92      0.85      0.89      0.79      1173
        low       0.88      0.86      0.94      0.87      0.90      0.80      1252
     medium       0.74      0.74      0.87      0.74      0.80      0.64      1193

avg / total       0.82      0.82      0.91      0.82      0.87      0.74      3618



In [56]:
# List the features sorted in descending order by feature importance
sorted_importances = sorted(zip(brf.feature_importances_, X.columns), reverse=True)

for i in sorted_importances:
    print("{}: ({})".format(i[1],i[0]))

units_4046: (0.1630985410663391)
units_4225: (0.13443771258514484)
prices_total_volume: (0.11465330813562169)
s_bags: (0.10331487066984672)
l_bags: (0.08883528198675615)
xl_bags: (0.08222034322480484)
units_4770: (0.08145239618162678)
mexico: (0.04245587438645399)
california: (0.039532893044661045)
prod_total_volume: (0.038493388228668164)
peru: (0.0333884407231358)
year: (0.025712159391468713)
month: (0.02468593561050238)
chile: (0.020371647568429782)
colombia: (0.0073472071965399845)
week: (0.0)


### prices_prod-test2

In [57]:
# Generate our categorical variable lists
category = test2.dtypes[test2.dtypes == "object"].index.tolist()

In [58]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(test2[["geography", "type"]]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(category)
encode_df.head()

Unnamed: 0,geography_Albany,geography_Atlanta,geography_Baltimore/Washington,geography_Boise,geography_Boston,geography_Buffalo/Rochester,geography_California,geography_Charlotte,geography_Chicago,geography_Cincinnati/Dayton,...,geography_Southeast,geography_Spokane,geography_StLouis,geography_Syracuse,geography_Tampa,geography_TotalUS,geography_West,geography_WestTex/NewMexico,type_conventional,type_organic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [59]:
# Merge one-hot encoded features and drop the originals
prices_prod_encoded2 = test2.merge(encode_df,left_index=True, right_index=True)
prices_prod_encoded2 = test2.drop(category,1)
prices_prod_encoded2.head()

Unnamed: 0,avg_price,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,california,chile,mexico,peru,colombia,year,month,week
0,1.11,17230.22,16545.99,113.3,46850.74,41117.35,52.54,319866,177780,64809925,0,0,2020,1,6
1,2.02,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,9892498,0,27328014,14364539,0,2019,7,6
2,1.63,76.77,2459.61,0.0,6318.55,98.42,0.0,11391217,0,25273369,15360100,234188,2020,7,6
3,1.51,451.21,199.51,0.0,2283.06,5064.23,0.0,11609804,0,35628132,6210411,0,2019,5,6
4,1.44,0.0,3381.87,0.0,23200.12,6460.23,0.0,798732,3241640,39427297,142223,0,2018,10,6


In [60]:
# Categorize y-values
labels = ["low", "medium", "high"]
y_categories = prices_prod_encoded2['avg_price']

#Categorize prices
prices_prod_encoded2["price_category"] = pd.qcut(y_categories, 3, labels=labels)
prices_prod_encoded2.head()

Unnamed: 0,avg_price,units_4046,units_4225,units_4770,s_bags,l_bags,xl_bags,california,chile,mexico,peru,colombia,year,month,week,price_category
0,1.11,17230.22,16545.99,113.3,46850.74,41117.35,52.54,319866,177780,64809925,0,0,2020,1,6,low
1,2.02,71979.03,116438.62,1155.43,134101.42,140683.16,701.34,9892498,0,27328014,14364539,0,2019,7,6,high
2,1.63,76.77,2459.61,0.0,6318.55,98.42,0.0,11391217,0,25273369,15360100,234188,2020,7,6,high
3,1.51,451.21,199.51,0.0,2283.06,5064.23,0.0,11609804,0,35628132,6210411,0,2019,5,6,high
4,1.44,0.0,3381.87,0.0,23200.12,6460.23,0.0,798732,3241640,39427297,142223,0,2018,10,6,medium


In [61]:
# Check bin balance
prices_prod_encoded2["price_category"].value_counts()

low       5007
medium    4771
high      4694
Name: price_category, dtype: int64

In [62]:
# Split our preprocessed data into our features and target arrays
y = prices_prod_encoded2.price_category
X = prices_prod_encoded2.drop(columns=["avg_price", "price_category"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [63]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [64]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=128, random_state=42)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=42)

In [65]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8203205404178711

In [66]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1017,    9,  147],
       [  21, 1091,  140],
       [ 191,  140,  862]], dtype=int64)

In [67]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.83      0.87      0.91      0.85      0.89      0.79      1173
        low       0.88      0.87      0.94      0.88      0.90      0.81      1252
     medium       0.75      0.72      0.88      0.74      0.80      0.63      1193

avg / total       0.82      0.82      0.91      0.82      0.86      0.74      3618



In [68]:
# List the features sorted in descending order by feature importance
sorted_importances = sorted(zip(brf.feature_importances_, X.columns), reverse=True)

for i in sorted_importances:
    print("{}: ({})".format(i[1],i[0]))

units_4046: (0.18222710930858851)
units_4225: (0.15638532531757732)
s_bags: (0.1258015923339114)
l_bags: (0.11214471376357296)
units_4770: (0.1001247222323826)
xl_bags: (0.09116251956274915)
mexico: (0.05327457510517305)
california: (0.046931738153075685)
peru: (0.03772168758331447)
month: (0.030698861363960557)
year: (0.028570990951584406)
chile: (0.025450884285599964)
colombia: (0.00950528003850999)
week: (0.0)
