In [1]:
# ==============================================================================
# File: Avocado_Sales_Classification.ipynb
# Author: Vraj Shah
# Revisions by: Francisco Cornejo-Garcia
# Description: Classify avocado sales region from average price and character-bigram counts of date and year
# ==============================================================================

# Load the project-local model wrappers used by the modelling cells.
# NOTE(review): the trailing double "r" in MLPRegressorr / MLPClassifierr
# presumably matches the names defined in downstream_models — confirm.
from downstream_models import LogRegClassifier, RandForestClassifier, LinearRegression, RandForestRegressor, MLPRegressorr, MLPClassifierr

# Load general-purpose libraries
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Load scikit-learn libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
# Module-level label encoder instance; it is not used in the cells visible here.
le = preprocessing.LabelEncoder()
from sklearn.svm import LinearSVC,LinearSVR
# NOTE(review): CountVectorizer is already imported above; this repeat is redundant.
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and convergence warnings, so consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw avocado sales CSV (path is relative to the working directory)
avocado = pd.read_csv('dataset/avocado.csv')

# Overwrite the header positionally with the short names used by later cells.
# NOTE(review): assumes the CSV has exactly these 14 columns in this order — confirm.
avocado.columns = [
    'index', 'date', 'avgPrice', 'totalVolume',
    '4046', '4225', '4770',
    'totalBags', 'smallBags', 'largeBags', 'xlargeBags',
    'type', 'year', 'region',
]

In [3]:
# Build the feature table used to predict `region`:
#   * the raw `avgPrice` column, followed by
#   * character-bigram counts of `date` and of `year` (each cast to str).
# Outputs consumed by later cells: `table` (features) and `target` (labels).
numeric_cols = avocado[['avgPrice']]
ngram_cols = avocado[['date', 'year']]
target = avocado['region']

# Character-level bigram counter. fit_transform() below relearns the
# vocabulary for every column, so after the loop `vectorizer` holds only the
# vocabulary of the last column processed.
vectorizer = CountVectorizer(ngram_range = (2,2),analyzer = 'char')

# Collect the feature blocks and concatenate once at the end instead of
# growing `table` inside the loop.
feature_frames = [numeric_cols]

# Iterate through each ngram column
for col in ngram_cols.columns:
    # Transform values in column as strings
    array = ngram_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)

    # vocabulary_ maps each bigram to its column position in X; sorting by
    # that position recovers the bigrams in column order.
    bigrams = sorted(vectorizer.vocabulary_, key = vectorizer.vocabulary_.get)

    # Give every count column a unique string label ("<source column>_<bigram>").
    # With the default integer labels 0..k-1 the `date` and `year` blocks
    # produced duplicate labels, mixed with the string label 'avgPrice'.
    # Reusing the source index keeps rows aligned with `numeric_cols` even if
    # `avocado` does not carry a default RangeIndex.
    dataframe = pd.DataFrame(
        X.toarray(),
        index = ngram_cols.index,
        columns = ['{}_{}'.format(col, bigram) for bigram in bigrams],
    )
    feature_frames.append(dataframe)

# Same column order as before: avgPrice, date bigrams, year bigrams.
table = pd.concat(feature_frames, axis = 1, sort = False)

In [4]:
# Logistic-regression classifier on the engineered feature table.
# LogRegClassifier comes from downstream_models; per the recorded output it
# prints per-fold shapes and train/validation/test accuracies for 5 folds.
# NOTE(review): the first three results are presumably the per-fold train,
# validation and hold-out score lists, and the fourth a confusion matrix —
# confirm against downstream_models.
logreg_results = LogRegClassifier(table, target)
(avgsc_train_lst,
 avgsc_lst,
 avgsc_hld_lst,
 cnf_matrix) = logreg_results

(8759, 52)
(2920, 52)
0.033478893740902474
0.01643835616438356
0.017534246575342465
(8759, 52)
(2920, 52)
0.034934497816593885
0.018150684931506848
0.021369863013698632
(8759, 52)
(2920, 52)
0.03998630019693467
0.01780821917808219
0.016712328767123287
(8759, 52)
(2920, 52)
0.036646973199760256
0.015753424657534248
0.01863013698630137
(8760, 52)
(2920, 52)
0.037842465753424655
0.020212401507365536
0.02219178082191781
5 fold Train, Validation, and Test Accuracies:
[0.033478893740902474, 0.034934497816593885, 0.03998630019693467, 0.036646973199760256, 0.037842465753424655]
[0.01643835616438356, 0.018150684931506848, 0.01780821917808219, 0.015753424657534248, 0.020212401507365536]
[0.017534246575342465, 0.021369863013698632, 0.016712328767123287, 0.01863013698630137, 0.02219178082191781]
Avg Train, Validation, and Test Accuracies:
0.03657782614152319
0.01767261728777448
0.019287671232876714


In [5]:
# Random-forest classifier on the same feature table and target.
# RandForestClassifier comes from downstream_models; per the recorded output
# it prints per-fold shapes and train/validation/test accuracies for 5 folds.
# NOTE(review): this rebinds the same result names as the previous model
# cell, so that cell's results are no longer reachable afterwards.
forest_results = RandForestClassifier(table, target)
(avgsc_train_lst,
 avgsc_lst,
 avgsc_hld_lst,
 cnf_matrix) = forest_results

(8759, 52)
(2920, 52)
0.04615121157633359
0.021232876712328767
0.020821917808219178
(8759, 52)
(2920, 52)
0.06807089648086309
0.017123287671232876
0.023013698630136987
(8759, 52)
(2920, 52)
0.08048634300881925
0.01678082191780822
0.011506849315068493
(8759, 52)
(2920, 52)
0.044267488654850584
0.02089041095890411
0.019452054794520546
(8760, 52)
(2920, 52)
0.052311643835616435
0.02603631380609798
0.027123287671232878
5 fold Train, Validation, and Test Accuracies:
[0.04615121157633359, 0.06807089648086309, 0.08048634300881925, 0.044267488654850584, 0.052311643835616435]
[0.021232876712328767, 0.017123287671232876, 0.01678082191780822, 0.02089041095890411, 0.02603631380609798]
[0.020821917808219178, 0.023013698630136987, 0.011506849315068493, 0.019452054794520546, 0.027123287671232878]
Avg Train, Validation, and Test Accuracies:
0.058257516711296586
0.02041274221327439
0.020383561643835618


In [6]:
# MLP classifier on the same feature table and target (only the classifier
# is run here; no regressor is called in this cell).
# MLPClassifierr comes from downstream_models and yields three results here,
# with no confusion matrix, unlike the two classifier cells above.
# NOTE(review): in the recorded output the third ("Test") accuracy is
# identical across all five folds — worth checking in downstream_models.
mlp_results = MLPClassifierr(table, target)
(avgsc_train_lst,
 avgsc_lst,
 avgsc_hld_lst) = mlp_results

(8759, 52)
(2920, 52)
0.12963438650569398
0.1267123287671233
0.02684931506849315
(8759, 52)
(2920, 52)
0.12852127750663583
0.13116438356164384
0.02684931506849315
(8759, 52)
(2920, 52)
0.13134686188886036
0.11986301369863013
0.02684931506849315
(8759, 52)
(2920, 52)
0.12638068327767787
0.13972602739726028
0.02684931506849315
(8760, 52)
(2920, 52)
0.12936643835616438
0.12778348749571772
0.02684931506849315
5-fold Train, Validation, and Test Accuracies:
[0.12963438650569398, 0.12852127750663583, 0.13134686188886036, 0.12638068327767787, 0.12936643835616438]
[0.1267123287671233, 0.13116438356164384, 0.11986301369863013, 0.13972602739726028, 0.12778348749571772]
[0.02684931506849315, 0.02684931506849315, 0.02684931506849315, 0.02684931506849315, 0.02684931506849315]
Avg Train, Validation, and Test Accuracies:
0.1290499295070065
0.12904984818407503
0.02684931506849315
