In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

from sqlalchemy import create_engine
from config import db_password

In [None]:
# get db server connection string
db_string = f'postgres://postgres:{db_password}@127.0.0.1:5432/avocados'

# Create db engine
engine = create_engine(db_string)

# Load prices data set
cleaned_df = pd.read_sql_table('combined_cleaned', engine) 

In [45]:
cleaned_df = pd.read_csv(Path('../resources/combined_cleaned.csv'))
cleaned_df.head()

Unnamed: 0.1,Unnamed: 0,index,type,avg_price,total_volume_price,4046_units,4225_units,4770_units,total_bags,s_bags,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,0,5616,1,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,5617,0,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5618,1,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,5619,0,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5620,1,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
cleaned_df = cleaned_df.drop(columns=['Unnamed: 0', 'index'], axis=1)
cleaned_df.head()

Unnamed: 0,type,avg_price,total_volume_price,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,xl_bags,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,1,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
price_bins = [0.44, 1.14, 1.37, 1.63, 3.17]
group_names = ["0", "1", "2", "3" ]

cleaned_cat_df = cleaned_df.copy()

# Categorize prices for NB.
cleaned_cat_df["price_cat"] = pd.cut(cleaned_cat_df['avg_price'], price_bins, labels=group_names)

cleaned_cat_df.head()

Unnamed: 0,type,avg_price,total_volume_price,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,xl_bags,...,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico,price_cat
0,1,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,1,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [48]:
cleaned_cat_df['price_cat'].value_counts()

0    4282
1    3838
2    3681
3    3039
Name: price_cat, dtype: int64

In [49]:
cleaned_cat_df.columns

Index(['type', 'avg_price', 'total_volume_price', '4046_units', '4225_units',
       '4770_units', 'total_bags', 's_bags', 'l_bags', 'xl_bags',
       ...
       'geography_South Central', 'geography_Southeast', 'geography_Spokane',
       'geography_St. Louis', 'geography_Syracuse', 'geography_Tampa',
       'geography_Total U.S.', 'geography_West',
       'geography_West Tex/New Mexico', 'price_cat'],
      dtype='object', length=267)

In [50]:
cleaned_cat_df.shape

(14840, 267)

In [51]:
def RFSSModel4(X, y):
    # Initial imports.
    import pandas as pd
    from path import Path
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    
    # Splitting into Train and Test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    
    # Creating a StandardScaler instance.
    scaler = StandardScaler()
    # Fitting the Standard Scaler with the training data.
    X_scaler = scaler.fit(X_train)
    
    # Scaling the data.
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    # Create a random forest classifier.
    #rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 
    rf_model = RandomForestClassifier(n_estimators=1024, random_state=78, max_depth=5) 
    
    # Fitting the model
    rf_model = rf_model.fit(X_train_scaled, y_train)
    
    # Making predictions using the testing data.
    predictions = rf_model.predict(X_test_scaled)
    
    print(f"Predictions: {predictions}")
    
    # Calculating the confusion matrix.
    cm = confusion_matrix(y_test, predictions)
    
    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3"], 
                         columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"])
    cm_df
    
    # Calculating the accuracy score.
    acc_score = accuracy_score(y_test, predictions)
    
    # Displaying results
    print("Confusion Matrix")
    display(cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report(y_test, predictions))
    
    # Calculate feature importance in the Random Forest model.
    importances = rf_model.feature_importances_
    print(f"Importances: {importances}")
    
    # We can sort the features by their importance.
    print(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
    

In [52]:
y = cleaned_cat_df['price_cat']
X = cleaned_cat_df.drop(columns=['price_cat', 'total_volume_prod', 'total_volume_price'], axis=1)

RFSSModel4(X, y)

Predictions: ['2' '0' '3' ... '2' '0' '2']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,974,15,39,5
Actual 1,68,642,227,13
Actual 2,14,178,740,39
Actual 3,4,22,36,694


Accuracy Score : 0.8221024258760108
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1033
           1       0.75      0.68      0.71       950
           2       0.71      0.76      0.74       971
           3       0.92      0.92      0.92       756

    accuracy                           0.82      3710
   macro avg       0.83      0.82      0.82      3710
weighted avg       0.82      0.82      0.82      3710

Importances: [4.36197080e-02 3.06348645e-01 7.96838929e-02 4.17820227e-02
 5.89422673e-02 5.45278732e-02 4.34861178e-02 3.74548750e-02
 7.47235749e-02 2.48165256e-03 3.19582550e-03 4.03326885e-03
 7.21207756e-03 1.23984693e-03 6.53251148e-03 3.92172990e-03
 3.32797299e-03 9.36167737e-04 1.40289302e-03 3.11889083e-03
 5.31057351e-03 7.62008491e-03 4.39179922e-03 1.84783654e-03
 4.16125387e-03 3.66403604e-03 0.00000000e+00 2.33014474e-03
 1.08006047e-03 2.88168721e-03 4.08171474e-03 4.69385336e-04
 0.000

In [53]:
y = cleaned_cat_df['price_cat']
X = cleaned_cat_df.drop(columns=['price_cat', 'avg_price', 'total_volume_prod', 'total_volume_price'], axis=1)

RFSSModel4(X, y)

Predictions: ['3' '0' '3' ... '2' '0' '2']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,877,82,57,17
Actual 1,350,283,260,57
Actual 2,101,147,551,172
Actual 3,16,39,195,506


Accuracy Score : 0.5975741239892183
Classification Report
              precision    recall  f1-score   support

           0       0.65      0.85      0.74      1033
           1       0.51      0.30      0.38       950
           2       0.52      0.57      0.54       971
           3       0.67      0.67      0.67       756

    accuracy                           0.60      3710
   macro avg       0.59      0.60      0.58      3710
weighted avg       0.59      0.60      0.58      3710

Importances: [6.06437409e-02 1.08313855e-01 5.80785601e-02 7.38218975e-02
 8.17983179e-02 6.85955246e-02 5.87508358e-02 1.02358736e-01
 3.68180670e-03 4.75631422e-03 5.99983972e-03 9.67685534e-03
 1.96918677e-03 8.29046411e-03 5.98247336e-03 4.67655329e-03
 1.22022080e-03 2.41216432e-03 4.43469889e-03 7.67913619e-03
 1.16746497e-02 6.56429479e-03 2.85111424e-03 6.40571022e-03
 4.98865101e-03 0.00000000e+00 3.09184261e-03 1.74164274e-03
 4.16107856e-03 6.12210913e-03 9.88367903e-04 0.00000000e+00
 0.000

In [54]:
prices_df = pd.read_csv(Path('../resources/prices.csv'))
prices_df.head()

Unnamed: 0,year_month,geography,timeframe,date,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,xl_bags,price_inc
0,1/1/2017,Albany,Weekly,1/2/2017,conventional,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,7.83,0.0,1
1,1/1/2017,Albany,Weekly,1/8/2017,conventional,1.55,91728.18,3355.47,75641.23,56.91,12674.57,12606.67,67.9,0.0,1
2,1/1/2017,Albany,Weekly,1/15/2017,conventional,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,60.09,0.0,0
3,1/1/2017,Albany,Weekly,1/22/2017,conventional,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,125.5,8.82,1
4,1/1/2017,Albany,Weekly,1/29/2017,conventional,1.31,95424.59,3844.62,78315.15,484.56,12780.26,12393.84,382.06,4.36,0


In [55]:
prices_df['year_month'] = pd.to_datetime(prices_df['year_month'])
prices_df['date'] = pd.to_datetime(prices_df['date'])
#prices_df["year"] = prices_df['date'].apply(lambda x: x.year)
#prices_df["month"] = prices_df['date'].apply(lambda x: x.month)
#prices_df["day"] = prices_df['date'].apply(lambda x: x.day)

In [56]:
for col in prices_df.columns:
    print(f"Column {col} has {prices_df[col].isnull().sum()} null values")

Column year_month has 0 null values
Column geography has 0 null values
Column timeframe has 0 null values
Column date has 0 null values
Column type has 0 null values
Column avg_price has 0 null values
Column total_volume has 0 null values
Column 4046_units has 0 null values
Column 4225_units has 0 null values
Column 4770_units has 0 null values
Column total_bags has 0 null values
Column s_bags has 0 null values
Column l_bags has 0 null values
Column xl_bags has 0 null values
Column price_inc has 0 null values


In [57]:
# type con organic
def change_type_string(str):
        
    if str == "actual":
        return 1
    
    if str == "avocados":
        return 1
    
    if str == "hass":
        return 1
    
    if str == "conventional":
        return 1
    else:
        return 0
    
    
prices_df['type'] = prices_df['type'].apply(change_type_string)
#combined_cleaned_df['status'] = combined_cleaned_df['status'].apply(change_type_string)
#combined_cleaned_df['segment'] = combined_cleaned_df['segment'].apply(change_type_string)
#combined_cleaned_df['variety'] = combined_cleaned_df['variety'].apply(change_type_string)

#combined_cleaned_df.head()

In [58]:
price_bins = [0.44, 1.14, 1.37, 1.63, 3.17]
group_names = ["0", "1", "2", "3" ]

#cleaned_cat_df = prices_df.copy()

# Categorize prices for NB.
prices_df["price_cat"] = pd.cut(prices_df['avg_price'], price_bins, labels=group_names)

prices_df.head()

Unnamed: 0,year_month,geography,timeframe,date,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,xl_bags,price_inc,price_cat
0,2017-01-01,Albany,Weekly,2017-01-02,1,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,7.83,0.0,1,2
1,2017-01-01,Albany,Weekly,2017-01-08,1,1.55,91728.18,3355.47,75641.23,56.91,12674.57,12606.67,67.9,0.0,1,2
2,2017-01-01,Albany,Weekly,2017-01-15,1,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,60.09,0.0,0,2
3,2017-01-01,Albany,Weekly,2017-01-22,1,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,125.5,8.82,1,2
4,2017-01-01,Albany,Weekly,2017-01-29,1,1.31,95424.59,3844.62,78315.15,484.56,12780.26,12393.84,382.06,4.36,0,1


In [59]:
prices_df = pd.get_dummies(data=prices_df, columns=['year_month', 'date'])
prices_df.head()

Unnamed: 0,geography,timeframe,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,...,date_2020-07-05 00:00:00,date_2020-07-12 00:00:00,date_2020-07-19 00:00:00,date_2020-07-26 00:00:00,date_2020-08-02 00:00:00,date_2020-08-09 00:00:00,date_2020-08-16 00:00:00,date_2020-08-23 00:00:00,date_2020-08-30 00:00:00,date_2020-09-06 00:00:00
0,Albany,Weekly,1,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,...,0,0,0,0,0,0,0,0,0,0
1,Albany,Weekly,1,1.55,91728.18,3355.47,75641.23,56.91,12674.57,12606.67,...,0,0,0,0,0,0,0,0,0,0
2,Albany,Weekly,1,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,...,0,0,0,0,0,0,0,0,0,0
3,Albany,Weekly,1,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,...,0,0,0,0,0,0,0,0,0,0
4,Albany,Weekly,1,1.31,95424.59,3844.62,78315.15,484.56,12780.26,12393.84,...,0,0,0,0,0,0,0,0,0,0


In [60]:
prices_df = prices_df.reset_index()

In [61]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(prices_df.geography.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['geography'])
encode_df.head()

Unnamed: 0,geography_Albany,geography_Atlanta,geography_Baltimore/Washington,geography_Boise,geography_Boston,geography_Buffalo/Rochester,geography_California,geography_Charlotte,geography_Chicago,geography_Cincinnati/Dayton,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Merge the two DataFrames together and drop the geography column
prices_df = prices_df.merge(encode_df,left_index=True,right_index=True).drop("geography",1)
prices_df.head()

Unnamed: 0,index,timeframe,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,0,Weekly,1,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Weekly,1,1.55,91728.18,3355.47,75641.23,56.91,12674.57,12606.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Weekly,1,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Weekly,1,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Weekly,1,1.31,95424.59,3844.62,78315.15,484.56,12780.26,12393.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
prices_df = prices_df.drop(columns=['timeframe'], axis=1)
prices_df.head()

Unnamed: 0,index,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,0,1,1.47,129948.23,4845.77,117027.41,200.36,7874.69,7866.86,7.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,1.55,91728.18,3355.47,75641.23,56.91,12674.57,12606.67,67.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1,1.55,88526.26,3327.65,71956.77,607.03,12634.81,12574.72,60.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,1.59,128679.24,4119.94,111173.08,2191.71,11194.51,11060.19,125.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1,1.31,95424.59,3844.62,78315.15,484.56,12780.26,12393.84,382.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
prices_df.shape

(20410, 301)

In [65]:
print(prices_df.columns)

Index(['index', 'type', 'avg_price', 'total_volume', '4046_units',
       '4225_units', '4770_units', 'total_bags', 's_bags', 'l_bags',
       ...
       'geography_South Carolina', 'geography_South Central',
       'geography_Southeast', 'geography_Spokane', 'geography_St. Louis',
       'geography_Syracuse', 'geography_Tampa', 'geography_Total U.S.',
       'geography_West', 'geography_West Tex/New Mexico'],
      dtype='object', length=301)


In [66]:
prices_df = prices_df.dropna()

In [67]:
prices_df["4046_units"] = prices_df["4046_units"] / 100
prices_df["4225_units"] = prices_df["4225_units"] / 1000
prices_df["4770_units"] = prices_df["4770_units"] / 10
prices_df["total_bags"] = prices_df["total_bags"] / 100
prices_df["s_bags"] = prices_df["s_bags"] / 100
prices_df["l_bags"] = prices_df["l_bags"] / 100

In [68]:
y = prices_df['price_cat']
X = prices_df.drop(columns=['index', 'price_cat', 'avg_price', 'total_volume'], axis=1)

RFSSModel4(X, y)

Predictions: ['3' '3' '3' ... '0' '3' '3']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,1147,22,45,116
Actual 1,636,72,168,341
Actual 2,339,31,209,712
Actual 3,127,7,89,1042


Accuracy Score : 0.48402900254752107
Classification Report
              precision    recall  f1-score   support

           0       0.51      0.86      0.64      1330
           1       0.55      0.06      0.11      1217
           2       0.41      0.16      0.23      1291
           3       0.47      0.82      0.60      1265

    accuracy                           0.48      5103
   macro avg       0.48      0.48      0.39      5103
weighted avg       0.48      0.48      0.40      5103

Importances: [8.47747813e-02 1.49524193e-01 7.21587054e-02 9.51356396e-02
 1.18458881e-01 8.83919252e-02 8.85176864e-02 1.09615650e-01
 3.95119949e-04 4.63380970e-04 1.65566776e-03 2.74314669e-04
 4.10024755e-04 3.26134496e-04 3.75768572e-04 1.53418362e-03
 5.99550427e-03 1.67528777e-02 1.45377979e-02 3.90577110e-04
 1.15485347e-04 1.14144223e-04 1.44412474e-04 7.13887112e-05
 1.76671022e-04 1.64006473e-04 1.35196045e-04 1.56834091e-04
 1.34189912e-04 4.60502257e-04 1.07592620e-04 9.41397859e-05
 4.30

In [70]:
cleaned_cat_df["4046_units"] = cleaned_cat_df["4046_units"] / 100
cleaned_cat_df["4225_units"] = cleaned_cat_df["4225_units"] / 1000
cleaned_cat_df["4770_units"] = cleaned_cat_df["4770_units"] / 10
cleaned_cat_df["total_bags"] = cleaned_cat_df["total_bags"] / 100
cleaned_cat_df["s_bags"] = cleaned_cat_df["s_bags"] / 100
cleaned_cat_df["l_bags"] = cleaned_cat_df["l_bags"] / 100

In [74]:
y = cleaned_cat_df['price_cat']
X = cleaned_cat_df.drop(columns=['price_cat', 'total_bags', 'avg_price_current_year',
                                 'avg_price_prior_year', 'total_volume_prod', 'avg_price_variance', 
                                 'unit_variance', 'total_volume_price'], axis=1)

RFSSModel4(X, y)

Predictions: ['2' '0' '3' ... '2' '0' '2']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,992,2,27,12
Actual 1,108,578,239,25
Actual 2,34,184,707,46
Actual 3,1,34,24,697


Accuracy Score : 0.8016172506738545
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      1033
           1       0.72      0.61      0.66       950
           2       0.71      0.73      0.72       971
           3       0.89      0.92      0.91       756

    accuracy                           0.80      3710
   macro avg       0.80      0.80      0.80      3710
weighted avg       0.80      0.80      0.80      3710

Importances: [4.83042268e-02 3.30939821e-01 1.05268307e-01 5.05364285e-02
 6.14864439e-02 6.37390852e-02 4.92384685e-02 7.80323219e-02
 3.53995080e-03 4.19033549e-03 7.31316977e-03 1.07926090e-02
 2.13085825e-03 9.86499061e-03 6.09130998e-03 5.58396108e-03
 1.10707622e-03 2.22399273e-03 4.33798368e-03 8.37492200e-03
 1.15222695e-02 7.33669870e-03 2.77538511e-03 6.16443182e-03
 4.49271965e-03 0.00000000e+00 2.44024251e-03 1.71103523e-03
 3.80011491e-03 6.40413772e-03 9.02236371e-04 0.00000000e+00
 0.000

In [97]:
pp_df = pd.read_csv(Path('../resources/prices_prod.csv'))
pp_df.head()

Unnamed: 0,year_month,geography,timeframe,date,type,avg_price,total_volume,4046_units,4225_units,4770_units,...,s_bags,l_bags,xl_bags,status,prod_total_volume,california,chile,mexico,peru,columbia
0,2020-03-01,Phoenix/Tucson,Weekly,2020-03-22,conventional,0.83,1559597.93,490963.83,191059.35,5795.86,...,225725.82,640831.47,5221.6,actual,54765620,8086208,0,46679412,0,0
1,2020-03-01,Buffalo/Rochester,Weekly,2020-03-15,conventional,1.2,255471.62,22625.06,70184.86,2547.78,...,134178.7,23965.22,1970.0,actual,48039847,3868964,0,44120882,50000,0
2,2018-11-01,San Diego,Weekly,2018-11-18,organic,2.07,15809.64,2284.08,9243.37,0.0,...,4263.31,18.88,0.0,actual,44931673,0,3513150,41418523,0,0
3,2018-11-01,Dallas/Ft. Worth,Weekly,2018-11-18,conventional,0.71,1160242.0,655676.32,95361.47,3983.06,...,278437.39,126749.95,33.81,actual,44931673,0,3513150,41418523,0,0
4,2020-02-01,Total U.S.,Weekly,2020-02-23,organic,1.41,1871904.09,105003.23,150837.39,6789.04,...,1298135.07,311012.52,77.88,actual,60175473,6454635,0,53720838,0,0


In [98]:
for col in pp_df.columns:
    print(f"Column {col} has {pp_df[col].isnull().sum()} null values")

Column year_month has 0 null values
Column geography has 0 null values
Column timeframe has 0 null values
Column date has 0 null values
Column type has 0 null values
Column avg_price has 0 null values
Column total_volume has 0 null values
Column 4046_units has 0 null values
Column 4225_units has 0 null values
Column 4770_units has 0 null values
Column total_bags has 0 null values
Column s_bags has 0 null values
Column l_bags has 0 null values
Column xl_bags has 0 null values
Column status has 0 null values
Column prod_total_volume has 0 null values
Column california has 0 null values
Column chile has 0 null values
Column mexico has 0 null values
Column peru has 0 null values
Column columbia has 0 null values


In [99]:
pp_df['type'] = pp_df['type'].apply(change_type_string)

In [100]:
pp_df = pd.get_dummies(data=pp_df, columns=['year_month', 'date', 'geography'])
pp_df.head()

Unnamed: 0,timeframe,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,...,geography_South Carolina,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico
0,Weekly,0,0.83,1559597.93,490963.83,191059.35,5795.86,871778.89,225725.82,640831.47,...,0,0,0,0,0,0,0,0,0,0
1,Weekly,0,1.2,255471.62,22625.06,70184.86,2547.78,160113.92,134178.7,23965.22,...,0,0,0,0,0,0,0,0,0,0
2,Weekly,0,2.07,15809.64,2284.08,9243.37,0.0,4282.19,4263.31,18.88,...,0,0,0,0,0,0,0,0,0,0
3,Weekly,1,0.71,1160242.0,655676.32,95361.47,3983.06,405221.15,278437.39,126749.95,...,0,0,0,0,0,0,0,0,0,0
4,Weekly,0,1.41,1871904.09,105003.23,150837.39,6789.04,1609225.47,1298135.07,311012.52,...,0,0,0,0,0,0,0,1,0,0


In [101]:
print(pp_df.columns)

Index(['timeframe', 'type', 'avg_price', 'total_volume', '4046_units',
       '4225_units', '4770_units', 'total_bags', 's_bags', 'l_bags',
       ...
       'geography_South Carolina', 'geography_South Central',
       'geography_Southeast', 'geography_Spokane', 'geography_St. Louis',
       'geography_Syracuse', 'geography_Tampa', 'geography_Total U.S.',
       'geography_West', 'geography_West Tex/New Mexico'],
      dtype='object', length=239)


In [102]:
price_bins = [0.44, 1.14, 1.37, 1.63, 3.17]
group_names = ["0", "1", "2", "3" ]

#cleaned_cat_df = prices_df.copy()

# Categorize prices for NB.
pp_df["price_cat"] = pd.cut(pp_df['avg_price'], price_bins, labels=group_names)

pp_df.head()

Unnamed: 0,timeframe,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,...,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico,price_cat
0,Weekly,0,0.83,1559597.93,490963.83,191059.35,5795.86,871778.89,225725.82,640831.47,...,0,0,0,0,0,0,0,0,0,0
1,Weekly,0,1.2,255471.62,22625.06,70184.86,2547.78,160113.92,134178.7,23965.22,...,0,0,0,0,0,0,0,0,0,1
2,Weekly,0,2.07,15809.64,2284.08,9243.37,0.0,4282.19,4263.31,18.88,...,0,0,0,0,0,0,0,0,0,3
3,Weekly,1,0.71,1160242.0,655676.32,95361.47,3983.06,405221.15,278437.39,126749.95,...,0,0,0,0,0,0,0,0,0,0
4,Weekly,0,1.41,1871904.09,105003.23,150837.39,6789.04,1609225.47,1298135.07,311012.52,...,0,0,0,0,0,0,1,0,0,2


In [103]:
pp_df = pp_df.drop(columns=['timeframe', 'status'], axis=1)
pp_df.head()

Unnamed: 0,type,avg_price,total_volume,4046_units,4225_units,4770_units,total_bags,s_bags,l_bags,xl_bags,...,geography_South Central,geography_Southeast,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico,price_cat
0,0,0.83,1559597.93,490963.83,191059.35,5795.86,871778.89,225725.82,640831.47,5221.6,...,0,0,0,0,0,0,0,0,0,0
1,0,1.2,255471.62,22625.06,70184.86,2547.78,160113.92,134178.7,23965.22,1970.0,...,0,0,0,0,0,0,0,0,0,1
2,0,2.07,15809.64,2284.08,9243.37,0.0,4282.19,4263.31,18.88,0.0,...,0,0,0,0,0,0,0,0,0,3
3,1,0.71,1160242.0,655676.32,95361.47,3983.06,405221.15,278437.39,126749.95,33.81,...,0,0,0,0,0,0,0,0,0,0
4,0,1.41,1871904.09,105003.23,150837.39,6789.04,1609225.47,1298135.07,311012.52,77.88,...,0,0,0,0,0,0,1,0,0,2


In [104]:
y = pp_df['price_cat']
X = pp_df.drop(columns=['price_cat', 'total_volume', 'prod_total_volume'], axis=1)

RFSSModel4(X, y)

Predictions: ['1' '1' '0' ... '3' '1' '2']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,974,2,85,0
Actual 1,89,557,318,1
Actual 2,52,154,653,0
Actual 3,1,41,20,671


Accuracy Score : 0.789110005527916
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      1061
           1       0.74      0.58      0.65       965
           2       0.61      0.76      0.67       859
           3       1.00      0.92      0.96       733

    accuracy                           0.79      3618
   macro avg       0.80      0.79      0.79      3618
weighted avg       0.80      0.79      0.79      3618

Importances: [6.85363297e-02 3.22124416e-01 1.02296103e-01 5.20452821e-02
 6.52251627e-02 7.37873805e-02 5.80475502e-02 4.20394549e-02
 8.55233762e-02 5.65562328e-03 2.48726280e-03 1.05017266e-02
 1.09955478e-02 1.67564318e-03 1.99183487e-04 1.18318334e-04
 5.53653641e-05 9.43043905e-05 2.50349969e-04 1.06384343e-04
 1.26137275e-04 1.09168507e-04 1.24822941e-03 9.62980803e-05
 1.71599459e-04 3.76378140e-05 1.00303094e-04 7.70937865e-04
 8.58283240e-04 2.49119835e-04 1.12125194e-03 7.59708132e-04
 1.0653

In [105]:
y = pp_df['price_cat']
X = pp_df.drop(columns=['price_cat', 'avg_price', 'total_volume', 'prod_total_volume'], axis=1)

RFSSModel4(X, y)

Predictions: ['0' '0' '0' ... '3' '0' '2']
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,935,29,84,13
Actual 1,500,101,335,29
Actual 2,198,38,523,100
Actual 3,46,4,198,485


Accuracy Score : 0.5649530127142067
Classification Report
              precision    recall  f1-score   support

           0       0.56      0.88      0.68      1061
           1       0.59      0.10      0.18       965
           2       0.46      0.61      0.52       859
           3       0.77      0.66      0.71       733

    accuracy                           0.56      3618
   macro avg       0.59      0.56      0.52      3618
weighted avg       0.59      0.56      0.52      3618

Importances: [1.04947870e-01 1.38929829e-01 8.53006244e-02 9.76447926e-02
 9.75646177e-02 8.19397760e-02 7.10238616e-02 1.20624332e-01
 8.99151419e-03 4.46490329e-03 1.73726984e-02 1.92921836e-02
 3.00738140e-03 3.75435591e-04 1.95039711e-04 9.63124357e-05
 2.53704930e-04 3.63967028e-04 1.92527318e-04 2.06777832e-04
 1.98465729e-04 1.36558293e-03 2.16801182e-04 2.37064329e-04
 1.67833155e-04 1.39906925e-04 1.17468695e-03 1.00055750e-03
 3.56129359e-04 1.18190610e-03 9.97396005e-04 1.40368773e-02
 5.189