In [1]:
# ==============================================================================
# File: Avocado_Classification.ipynb
# Author: Vraj Shah
# Revisions by: Francisco Cornejo-Garcia
# Description: Classify date and time
# ==============================================================================

# Load libraries
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Load additional libraries
from datetime import datetime
from dateutil.parser import parse

# Load scikit-learn libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.svm import LinearSVC,LinearSVR
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

# Load models
from downstream_models import LogRegClassifier, RandForestClassifier, LinearRegression, RandForestRegressor, MLPRegressorr, MLPClassifierr
from custom_functions import validate, findSeason

In [2]:
# Load data
avocado = pd.read_csv('dataset/avocado.csv')

In [3]:
# Create datetime dataframe
org_datetime_df = pd.DataFrame()
org_datetime_df['datetime'] = avocado['Date']
datetime_col = avocado['Date']

# Create arrays to be added to dataframe
season_arr = []
date_arr = []
year_arr = []
month_arr = []
day_arr = []
dayofyear_arr = []
time_arr = []
hour_arr = []
minute_arr = []

# Extract features from datetime values in column
for value in datetime_col:
    datetime = validate(value)
    date = datetime.date()
    time = datetime.time()
    
    season_arr.append(findSeason(date.month, date.day))
    date_arr.append(date)
    year_arr.append(date.year)
    month_arr.append(date.month)
    day_arr.append(date.day)
    dayofyear_arr.append(date.timetuple().tm_yday)
    time_arr.append(time)
    hour_arr.append(time.hour)
    minute_arr.append(time.minute)

# Combine arrays as dataframes into one dataframe
season_df = pd.DataFrame(season_arr)
date_df = pd.DataFrame(date_arr)
year_df = pd.DataFrame(year_arr)
month_df = pd.DataFrame(month_arr)
day_df = pd.DataFrame(day_arr)
dayofyear_df = pd.DataFrame(dayofyear_arr)
time_df = pd.DataFrame(time_arr)
hour_df = pd.DataFrame(hour_arr)
minute_df = pd.DataFrame(minute_arr)
datetime_df = pd.concat([season_df, date_df, year_df, month_df, day_df, dayofyear_df, time_df, hour_df, minute_df], axis = 1, sort = False)
datetime_df.columns = ['season', 'date', 'year', 'month', 'day', 'day_of_year', 'time', 'hour', 'minute']
datetime_df = datetime_df.fillna('0')

In [4]:
# Benchmark 1
# Create separate columns
target = avocado['type']
numeric_cols = avocado[['AveragePrice', 'Total Volume', '4046', '4225', '4770']]
ngram1_cols = avocado[['Date']]

# Create ngram vectorizer
vectorizer = CountVectorizer(ngram_range = (2,2),analyzer = 'char')

# Create dataframe
benchmark1_df = pd.DataFrame()
benchmark1_df = pd.concat([benchmark1_df, numeric_cols], axis = 1, sort = False)

# Iterate through each ngram column
for col in ngram1_cols.columns:
    array = ngram1_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark1_df = pd.concat([benchmark1_df, vectorizer_df], axis = 1, sort = False)

In [5]:
# Logistic Regression Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark1_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.7267745526158061, 0.7270314239232811, 0.9247367069098382, 0.9283329052144875, 0.9202054794520548]
[0.7273972602739726, 0.7188356164383561, 0.9366438356164384, 0.923972602739726, 0.9277149708804385]
[0.7254794520547945, 0.7254794520547945, 0.9224657534246575, 0.9232876712328767, 0.9178082191780822]
Avg Train, Validation, and Test Accuracies:
0.8454162136230936
0.8469128571897864
0.842904109589041


In [6]:
# Random Forest Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark1_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.9939207123897594, 0.9945200787738676, 0.9934069697748095, 0.9934069697748095, 0.9940924657534247]
[0.9756849315068493, 0.9736301369863014, 0.9811643835616438, 0.977054794520548, 0.9746488523466941]
[0.9758904109589042, 0.9767123287671233, 0.9789041095890411, 0.9758904109589042, 0.9789041095890411]
Avg Train, Validation, and Test Accuracies:
0.993869439293334
0.9764366197844072
0.9772602739726027


In [7]:
# MLP Classifier & Regressor Benchmark 1
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark1_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.9194280332220224, 0.9213973799126638, 0.9184861717612809, 0.9214830036818221, 0.9184931506849315]
[0.9215753424657535, 0.9136986301369863, 0.9253424657534246, 0.9133561643835616, 0.9253168893456664]
[0.9183561643835616, 0.9183561643835616, 0.9183561643835616, 0.9183561643835616, 0.9183561643835616]
Avg Train, Validation, and Test Accuracies:
0.9198575478525441
0.9198578984170785
0.9183561643835617


In [8]:
# Benchmark 2
# Create separate columns
numeric_cols = avocado[['AveragePrice', 'Total Volume', '4046', '4225', '4770']] + datetime_df[['year', 'day', 'day_of_year', 'hour', 'minute']]
ngram2_cols = avocado[['Date']]

# Create dataframe
benchmark2_df = pd.DataFrame()
benchmark2_df = pd.concat([benchmark2_df, numeric_cols], axis = 1, sort = False)

# Iterate through each ngram column
for col in ngram2_cols.columns:
    array = ngram2_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark2_df = pd.concat([benchmark2_df, vectorizer_df], axis = 1, sort = False)
benchmark2_df = benchmark2_df.fillna('0')

In [9]:
# Log Regression Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark2_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5001284356537374, 0.5014984159602706, 0.504837742957445, 0.5088620601078859, 0.5040239726027397]
[0.5092465753424658, 0.501027397260274, 0.4948630136986301, 0.47363013698630135, 0.49194929770469337]
[0.4917808219178082, 0.49397260273972604, 0.4882191780821918, 0.4928767123287671, 0.4931506849315068]
Avg Train, Validation, and Test Accuracies:
0.5038701254564157
0.494143284198473
0.492


In [10]:
# Random Forest Classifier Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark2_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5157975854097098, 0.5068927134172446, 0.5093758027228359, 0.5175956845620344, 0.5068493150684932]
[0.45787671232876714, 0.4828767123287671, 0.4794520547945205, 0.4547945205479452, 0.49640287769784175]
[0.4832876712328767, 0.4917808219178082, 0.4865753424657534, 0.4797260273972603, 0.48054794520547944]
Avg Train, Validation, and Test Accuracies:
0.5113022202360635
0.47428057553956837
0.48438356164383556


In [11]:
# MLP Classifier Benchmark 2
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark2_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.512372634643377, 0.5184519222536176, 0.5150269714872848, 0.5202500214059423, 0.5166095890410959]
[0.5332191780821918, 0.5089041095890411, 0.5226027397260274, 0.5017123287671232, 0.5162726961288112]
[0.43397260273972604, 0.43397260273972604, 0.43397260273972604, 0.43397260273972604, 0.43397260273972604]
Avg Train, Validation, and Test Accuracies:
0.5165422277662636
0.516542210458639
0.43397260273972604


In [13]:
# Benchmark 3
# Create separate columns
numeric_cols = avocado[['AveragePrice', 'Total Volume', '4046', '4225', '4770']]
categ_cols = datetime_df[['year', 'day', 'day_of_year', 'hour', 'minute']]
ngram3_cols = avocado[['Date']]

# Create dataframe
benchmark3_df = pd.DataFrame()
benchmark3_df = pd.concat([benchmark3_df, numeric_cols], axis = 1, sort = False)

# Create encoder
enc = OneHotEncoder(handle_unknown = 'ignore')
X = enc.fit_transform(categ_cols)
enc_df = pd.DataFrame(X.toarray())
benchmark3_df = pd.concat([benchmark3_df, enc_df], axis = 1, sort = False)

# Iterate through each ngram column
for col in ngram3_cols.columns:
    array = ngram3_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark3_df = pd.concat([benchmark3_df, vectorizer_df], axis = 1, sort = False)
benchmark3_df = benchmark3_df.fillna('0')

In [14]:
# Log Regression Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark3_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.7267745526158061, 0.7270314239232811, 0.9243942118332049, 0.9289322715985958, 0.9268835616438356]
[0.7273972602739726, 0.7188356164383561, 0.936986301369863, 0.9208904109589041, 0.934566632408359]
[0.7254794520547945, 0.7254794520547945, 0.9221917808219178, 0.9213698630136986, 0.9213698630136986]
Avg Train, Validation, and Test Accuracies:
0.8468032043229448
0.847735244289891
0.8431780821917808


In [15]:
# Random Forest Classifier Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark3_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.9928932271598596, 0.992208237006593, 0.9913519993150098, 0.9926363558523846, 0.9924657534246575]
[0.9674657534246576, 0.9667808219178082, 0.9746575342465753, 0.9739726027397261, 0.9681397738951696]
[0.9679452054794521, 0.9663013698630137, 0.9682191780821918, 0.9679452054794521, 0.9698630136986301]
Avg Train, Validation, and Test Accuracies:
0.9923111145517008
0.9702032972447874
0.9680547945205479


In [16]:
# MLP Classifier Benchmark 3
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark3_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.9440876787396181, 0.9458001541227845, 0.9423752033564518, 0.9463138967377344, 0.9440068493150685]
[0.9462328767123288, 0.9393835616438356, 0.9530821917808219, 0.9373287671232877, 0.94655704008222]
[0.9394520547945205, 0.9394520547945205, 0.9394520547945205, 0.9394520547945205, 0.9394520547945205]
Avg Train, Validation, and Test Accuracies:
0.9445167564543315
0.9445168874684986
0.9394520547945205


In [17]:
# Benchmark 4
# Create separate columns
numeric_cols = avocado[['AveragePrice', 'Total Volume', '4046', '4225', '4770']] + datetime_df[['year', 'day', 'day_of_year', 'hour', 'minute']]
categ_cols = datetime_df[['year', 'day', 'day_of_year', 'hour', 'minute']]
ngram4_cols = avocado[['Date']]

# Create dataframe
benchmark4_df = pd.DataFrame()
benchmark4_df = pd.concat([benchmark4_df, numeric_cols], axis = 1, sort = False)

# Create encoder
enc = OneHotEncoder(handle_unknown = 'ignore')
X = enc.fit_transform(categ_cols)
enc_df = pd.DataFrame(X.toarray())
benchmark4_df = pd.concat([benchmark4_df, enc_df], axis = 1, sort = False)

# Iterate through each ngram column
for col in ngram4_cols.columns:
    array = ngram4_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    vectorizer_df = pd.DataFrame(X.toarray())
    benchmark4_df = pd.concat([benchmark4_df, vectorizer_df], axis = 1, sort = False)
benchmark4_df = benchmark4_df.fillna('0')

In [18]:
# Log Regression Benchmark 4
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(benchmark4_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5001284356537374, 0.5032108913434369, 0.5056939806490282, 0.511858892028427, 0.5040239726027397]
[0.5092465753424658, 0.49726027397260275, 0.49246575342465754, 0.4726027397260274, 0.49194929770469337]
[0.4917808219178082, 0.4915068493150685, 0.4873972602739726, 0.4841095890410959, 0.4931506849315068]
Avg Train, Validation, and Test Accuracies:
0.5049832344554739
0.4927049280340893
0.4895890410958904


In [19]:
# Random Forest Classifier Benchmark 4
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(benchmark4_df, target)

5 fold Train, Validation, and Test Accuracies:
[0.5078345748779861, 0.5060364757256615, 0.5044096241116534, 0.5162257042555013, 0.5054794520547945]
[0.4876712328767123, 0.4863013698630137, 0.5020547945205479, 0.4613013698630137, 0.49023638232271327]
[0.4852054794520548, 0.4917808219178082, 0.4846575342465753, 0.4791780821917808, 0.4898630136986301]
Avg Train, Validation, and Test Accuracies:
0.5079971662051193
0.4855130298892002
0.48613698630136987


In [20]:
# MLP Classifier Benchmark 4 
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(benchmark4_df, target)

5-fold Train, Validation, and Test Accuracies:
[0.5151982190256015, 0.5183662984844593, 0.5163113280246596, 0.5175100607928761, 0.5197773972602739]
[0.5263698630136986, 0.5136986301369864, 0.5219178082191781, 0.5171232876712328, 0.5080507022953066]
[0.4301369863013699, 0.4301369863013699, 0.4301369863013699, 0.4301369863013699, 0.4301369863013699]
Avg Train, Validation, and Test Accuracies:
0.5174326607175741
0.5174320582672804
0.4301369863013699
