In [184]:
import importlib
import data_transform
import Cross_validation
importlib.reload(data_transform)
importlib.reload(Cross_validation)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Visualization.Feature_importance import FeatImp
from data_transform import apply_transformations
from Cross_validation import cross_validation,BlockingTimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns

#Models for predicting
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor,VotingRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

#Grid Search
from sklearn.model_selection import GridSearchCV

# preparing and cleaning data
data = pd.read_csv('data/avocado.csv')
data = data.sort_values(by=['Date'])
target = data['AveragePrice']

data = data.drop(['Unnamed: 0','AveragePrice'],axis = 1)

# applied all transformations
data_2 = apply_transformations(data)

# seperating into train, test
train_size = int(0.2*len(data_2))

#train data
X_train = data_2[:][0:train_size]
y_train = target[0:train_size]

# test data
X_test = data_2[:][train_size:]
y_test = target[train_size:]


In [185]:
#scaling data
SC = StandardScaler()
X_train = SC.fit_transform(X_train) # important to fit data to X_train and applying same statistical properties to X_test!!!
X_test = SC.transform(X_test)

y_train = pd.Series(SC.fit_transform(np.array(y_train).reshape(-1,1)).ravel()) # same goes for target values!!!
y_test = pd.Series(SC.transform(np.array(y_test).reshape(-1,1)).ravel())


In [153]:
cross_validation(X_train,y_train)

Unnamed: 0,SVR,Random Forest Regressor,Extra Trees Regressor,AdaBoost,LinearRegresion,SGD
means,0.749667,0.786094,0.803752,0.798321,-3.907501e+25,0.69534
std,0.114276,0.099278,0.087795,0.084768,4.723836e+25,0.12486


In [None]:
'''I used R2 scoring function for my performance measure. Generally, higher R2 values mean better model fitting. We see that the best 
models are RFR,ETR,AdaBoost'''

In [50]:
# Random Forest Regressor hyperparameter Tuning

RFR = RandomForestRegressor()
btss = BlockingTimeSeriesSplit(n_splits=5)

rfr_params = {"n_estimators":[10,50,100,200],
              "criterion":['mse','mae'],
              "max_depth":[4,6,8,10,20],
             }
gsRFC = GridSearchCV(RFR,param_grid = rfr_params, cv=btss, scoring="r2", n_jobs= -1, verbose = 1)

gsRFC.fit(X_train,y_train)

RFC_best = gsRFC.best_estimator_


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 22.4min finished


In [59]:
gsRFC.best_params_

{'criterion': 'mse', 'max_depth': 20, 'n_estimators': 100}

In [60]:
# Extra Trees Regressor hyperparameter Tuning

ETR = ExtraTreesRegressor()
btss = BlockingTimeSeriesSplit(n_splits=5)

ETR_params = {"n_estimators":[10,50,100,200],
              "criterion":['mse','mae'],
              "max_depth":[4,6,8,10,20],
             }
gsETR = GridSearchCV(ETR,param_grid = ETR_params, cv=btss, scoring="r2", n_jobs= -1, verbose = 1)

gsETR.fit(X_train,y_train)

ETR_best = gsETR.best_estimator_


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 37.8min finished


In [61]:
ETR_best

ExtraTreesRegressor(max_depth=20, n_estimators=200)

In [62]:
gsETR.best_params_

{'criterion': 'mse', 'max_depth': 20, 'n_estimators': 200}

In [64]:
AB = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

AB_params = {"n_estimators":[10,50,100,200],
              "learning_rate":[0.1,0.5,1,2],
              "loss":["linear","square","exponential"],
             }
gsAB = GridSearchCV(AB,param_grid = AB_params, cv=btss, scoring="r2", n_jobs= -1, verbose = 1)

gsAB.fit(X_train,y_train)

AB_best = gsAB.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.3min finished


In [66]:
gsAB.best_params_
AB_best

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.1,
                  loss='exponential', n_estimators=100)

In [67]:
svr = SVR()
svr_params = {"kernel":['linear','poly','rbf','sigmoid'],
              "degree":[2,3,4,5],
              "gamma":['scale','auto'],
              "C":[0.1,0.5,1,2,3,4]}

gsSVR = GridSearchCV(svr,param_grid = svr_params, cv=btss, scoring="r2", n_jobs= -1, verbose = 1)

gsSVR.fit(X_train,y_train)

svr_best = gsSVR.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 13.6min finished


In [68]:
gsSVR.best_params_

{'C': 4, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}

In [186]:
votingR = VotingRegressor(estimators=[('rfc', RFC_best),
('ETR', ETR_best),('AB',AB_best),('svr',svr_best)],  n_jobs=-1)

votingC = votingR.fit(X_train, y_train)

In [187]:
from sklearn.metrics import r2_score
predict = votingC.predict(X_test)
predict = pd.DataFrame(predict)
y_test= y_test.reset_index().drop(["index"],axis=1)
results = pd.concat([predict,y_test],axis = 1)
print(r2_score(y_test,predict))
results

0.47519591900164915


Unnamed: 0,0,0.1
0,-1.580209,-1.581211
1,-0.590269,-0.570099
2,-0.539448,-0.679409
3,1.084176,0.659630
4,1.360003,1.670742
...,...,...
14595,-0.756655,-0.050880
14596,-1.692899,-1.854484
14597,-0.163383,0.113084
14598,0.171344,0.878249


In [None]:
'''I get verj bad results, with my models. My best guess is tha i should not have applied my log-transform or not have done OneHot
Encoding for months and year features.'''