In [1]:
#use scikit-learn and pandas to build our pipeline
import pandas as pd
import numpy as np
from pprint import pprint 

In [2]:
#The first task is to pre-process the data so that the machine learning algorithms
#implemented by scikit-learn can be applied to it. To do this, we first read our
#CSV file into a pandas DataFrame.
#The file is opened in a `with` block so the handle is closed as soon as the read
#finishes instead of staying open for the lifetime of the kernel.
with open('avocado.csv') as f:
    avoFrame = pd.read_csv(f)

In [3]:
#transform our features and make them numerical for scikit-learn’s methods to work. 
#For this, we take the columns of the textual attributes from the DataFrame and 
#encode them into numbers using scikit-learn’s LabelEncoder class

from sklearn.preprocessing import LabelEncoder

In [4]:
#Pull the whole 'AveragePrice' column out of the DataFrame as a plain Python list;
#this is the feature that gets encoded in the next cell.
AvgPrice = avoFrame.loc[:, 'AveragePrice'].tolist()

In [5]:
#Create a LabelEncoder fitted to the price column, use it to turn the column
#into an array of integer codes, and write those codes back into the DataFrame
#in place of the original prices.
#NOTE(review): scikit-learn documents LabelEncoder as a transformer for target
#values (y), not input features — confirm that encoding a feature column this way is intended.
labelEncoderAveragePrice = LabelEncoder().fit(AvgPrice)
labelsAveragePrice = labelEncoderAveragePrice.transform(AvgPrice)
avoFrame['AveragePrice'] = pd.Series(labelsAveragePrice)

In [22]:
#Prepare the training and test splits for the data.

#Keep track of the features (the column names, in DataFrame order)
headers = list(avoFrame)

#Column position of the regression target: the total number of avocados sold
YIndex = headers.index('Total Volume')
datasetMatrix = avoFrame.values

#Make the train and test splits: the first 1500 rows are used for training and
#every remaining row is held out for testing.
#NOTE(review): this is a positional split, not a shuffled one; the rows printed
#below look grouped by type/region, so the two splits may cover different
#regions — confirm this is intended.
datasetTrain = datasetMatrix[0:1500]
datasetTrainWithoutLabels = np.delete(datasetTrain,YIndex,1)
labels = datasetTrain[:,YIndex]

datasetTest = datasetMatrix[1500:datasetMatrix.shape[0]]
#Drop the target column from the test rows as well. This assignment was missing,
#so the pprint below and the later predict cell raised NameError on a fresh kernel.
datasetTestWithoutLabels = np.delete(datasetTest,YIndex,1)
trueLabels = datasetTest[:,YIndex]

pprint((headers,datasetMatrix,datasetTrainWithoutLabels,labels,datasetTest,
datasetTestWithoutLabels,trueLabels))

(['Unnamed: 0',
  'Date',
  'AveragePrice',
  'Total Volume',
  '4046',
  '4225',
  '4770',
  'Total Bags',
  'Small Bags',
  'Large Bags',
  'XLarge Bags',
  'type',
  'year',
  'region'],
 array([[0, '2015-12-27', 86, ..., 'conventional', 2015, 'Albany'],
       [1, '2015-12-20', 88, ..., 'conventional', 2015, 'Albany'],
       [2, '2015-12-13', 46, ..., 'conventional', 2015, 'Albany'],
       ...,
       [9, '2018-01-21', 140, ..., 'organic', 2018, 'WestTexNewMexico'],
       [10, '2018-01-14', 146, ..., 'organic', 2018, 'WestTexNewMexico'],
       [11, '2018-01-07', 115, ..., 'organic', 2018, 'WestTexNewMexico']],
      dtype=object),
 array([[0, '2015-12-27', 86, ..., 'conventional', 2015, 'Albany'],
       [1, '2015-12-20', 88, ..., 'conventional', 2015, 'Albany'],
       [2, '2015-12-13', 46, ..., 'conventional', 2015, 'Albany'],
       ...,
       [41, '2015-03-15', 98, ..., 'conventional', 2015, 'NewYork'],
       [42, '2015-03-08', 89, ..., 'conventional', 2015, 'NewYork'],
 

In [23]:
#Show the training rows, with the 'Total Volume' target column removed, as a DataFrame
pd.DataFrame(datasetTrainWithoutLabels)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,2015-12-27,86,1036.74,54454.8,48.16,8696.87,8603.62,93.25,0,conventional,2015,Albany
1,1,2015-12-20,88,674.28,44638.8,58.33,9505.56,9408.07,97.49,0,conventional,2015,Albany
2,2,2015-12-13,46,794.7,109150,130.5,8145.35,8042.21,103.14,0,conventional,2015,Albany
3,3,2015-12-06,61,1132,71976.4,72.58,5811.16,5677.4,133.76,0,conventional,2015,Albany
4,4,2015-11-29,81,941.48,43838.4,75.78,6183.95,5986.26,197.69,0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,39,2015-03-29,89,21780.5,882072,1812.92,278150,216859,61290.6,0,conventional,2015,NewYork
1496,40,2015-03-22,86,22713.3,884993,1650.73,283375,223370,60005.8,0,conventional,2015,NewYork
1497,41,2015-03-15,98,22591.7,749461,1923.63,323762,258642,65119.1,0,conventional,2015,NewYork
1498,42,2015-03-08,89,31886.4,755538,2109.98,339800,271636,68163.6,0,conventional,2015,NewYork


In [7]:
#The split cell above produces the training and testing splits of the data (without the labels),
#along with the labels for each of those splits.

In [8]:
#Preview the first 10 rows of the full data matrix (all 14 columns, target included)
pd.DataFrame(datasetMatrix).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,2015-12-27,86,64236.6,1036.74,54454.8,48.16,8696.87,8603.62,93.25,0,conventional,2015,Albany
1,1,2015-12-20,88,54877.0,674.28,44638.8,58.33,9505.56,9408.07,97.49,0,conventional,2015,Albany
2,2,2015-12-13,46,118220.0,794.7,109150.0,130.5,8145.35,8042.21,103.14,0,conventional,2015,Albany
3,3,2015-12-06,61,78992.1,1132.0,71976.4,72.58,5811.16,5677.4,133.76,0,conventional,2015,Albany
4,4,2015-11-29,81,51039.6,941.48,43838.4,75.78,6183.95,5986.26,197.69,0,conventional,2015,Albany
5,5,2015-11-22,79,55979.8,1184.27,48068.0,43.61,6683.91,6556.47,127.44,0,conventional,2015,Albany
6,6,2015-11-15,52,83453.8,1368.92,73672.7,93.26,8318.86,8196.81,122.05,0,conventional,2015,Albany
7,7,2015-11-08,51,109428.0,703.75,101815.0,80.0,6829.22,6266.85,562.37,0,conventional,2015,Albany
8,8,2015-11-01,55,99811.4,1022.15,87315.6,85.34,11388.4,11104.5,283.83,0,conventional,2015,Albany
9,9,2015-10-25,60,74338.8,842.4,64757.4,113.0,8625.92,8061.47,564.45,0,conventional,2015,Albany


In [9]:
#build a 1-step pipeline, which consists of directly applying a regressor on the training data 
#and making predictions. using Lasso regression (it has one hyperparameter to be tuned, alpha)

from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [10]:
#Set up the candidate values for Lasso's alpha hyperparameter (the integers 1
#through 49) and the Lasso regressor itself. The grid search defined further
#down selects the alpha with the best cross-validated score.
alphas = np.arange(1, 50)
regressor = Lasso()

In [11]:
#Build the pipeline. It has a single step, named 'regressor', which makes
#predictions using the regression model.
steps = [('regressor', regressor)]
pipeline = Pipeline(steps=steps)

In [12]:
#Create a GridSearchCV object that trains one model per candidate alpha and
#determines the one with the best cross-validation score.
#The 'regressor__alpha' key addresses the alpha parameter of the pipeline step named 'regressor'.
parameterGrid = {'regressor__alpha': alphas}
GridSearchResult = GridSearchCV(estimator=pipeline, param_grid=parameterGrid)

In [13]:
#A GridSearchCV object requires two arguments: a pipeline object, and a parameter grid.
#A parameter grid is a dictionary. It contains the details of the range of values to be
#searched over for each step of the pipeline. In this case, we just have a regressor,
#so we just need the range of hyperparameters to be searched over, which we created above.
#
#Display the training labels (the 'Total Volume' values of the training rows)
#that the search will be fitted against.
labels

array([64236.62, 54876.98, 118220.22, ..., 1097737.98, 1129333.95,
       1338129.89], dtype=object)

In [21]:
#Inspect the feature columns used for fitting: positions 2 through 8 of the label-free training matrix.
#With 'Total Volume' removed these are AveragePrice, 4046, 4225, 4770, Total Bags, Small Bags and Large Bags.
datasetTrainWithoutLabels[:,2:9]

array([[86, 1036.74, 54454.85, ..., 8696.87, 8603.62, 93.25],
       [88, 674.28, 44638.81, ..., 9505.56, 9408.07, 97.49],
       [46, 794.7, 109149.67, ..., 8145.35, 8042.21, 103.14],
       ...,
       [98, 22591.72, 749461.13, ..., 323761.5, 258642.39, 65119.11],
       [89, 31886.43, 755537.72, ..., 339799.82, 271636.2, 68163.62],
       [71, 19919.91, 1025372.06, ..., 289988.49, 228539.44, 61449.05]],
      dtype=object)

In [15]:
#Let GridSearchCV fit and cross validate one model per candidate alpha on the
#selected training feature columns, and pick the best one.

GridSearchResult.fit(datasetTrainWithoutLabels[:,2:9],labels)

GridSearchCV(estimator=Pipeline(steps=[('regressor', Lasso())]),
             param_grid={'regressor__alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [16]:
#The fitted search also exposes the hyperparameters that gave the best cross validation score.

print (GridSearchResult.best_params_)

{'regressor__alpha': 21}


In [17]:
#Make predictions on the test data using the best model found by the grid search,
#then score them against the true labels with R^2 (r2_score was imported
#alongside the other scikit-learn modules in an earlier cell).
#NOTE(review): in the rows displayed earlier, 'Total Volume' equals
#4046 + 4225 + 4770 + Total Bags, all of which are among the features, so a
#score this close to 1 is expected — confirm this is the intended experiment.
predictions = GridSearchResult.predict(datasetTestWithoutLabels[:, 2:9])
testScore = r2_score(trueLabels, predictions)
print(testScore)
predictions

0.9999999971911436


array([1088947.17319847,  997532.01419093, 1658679.00335329, ...,
         13749.93265583,   16186.80753917,   17477.76988215])

In [18]:
#Done building machine learning pipeline that takes in a csv file, 
#and performs a supervised machine learning task

In [19]:
#Option 1: Visualization of results
#Plot the scores obtained during cross validation (for example the mean absolute error,
#if GridSearchCV is given scoring='neg_mean_absolute_error').
#GridSearchCV stores the cross-validation score obtained for each hyperparameter value in cv_results_,
#so the scores can be plotted to observe how the error varies over the hyperparameter range.

In [20]:
#Option 2: Try Different Regressors
#A lot of regression methods are available in scikit-learn.
#Some examples of available regressors are Ridge regressors, Support Vector Machines, Decision Trees, etc.
#We can also try ensemble methods (like AdaBoost) to build potentially better regressors.
#We could compare these models' performance in terms of accuracy, training time and testing time.

In [None]:
#Predict whether the total of 3 is a good indicator of the actual total volume when data is missing