In [1]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Read 'train_cluster3.csv'; header=0 takes the column names from the
# first line of the file. The last expression previews the first 5 rows.
data = pd.read_csv(
    'train_cluster3.csv',
    low_memory=False,
    header=0,
)
data.head(n=5)

Unnamed: 0,unit_sales,onpromotion,transactions,dcoilwtico,day_0,day_1,day_2,day_3,day_4,day_5,...,localholiday,regionalholiday,nationalother,nholidayspike,goodfriday,blackfriday,worldcupspike,worldcupdrop,earthquakespike,earthquakedrop
0,3.0,0,1317.0,92.47,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,0,932.0,88.73,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0,1784.0,104.33,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14.0,0,1200.0,94.573333,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1.0,0,1753.0,95.35,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [4]:
# Flatten the frame to one NumPy array and slice it by position:
# column 0 becomes the target, every remaining column is a feature.
# NOTE(review): this relies on the target (unit_sales) being the first
# column of the CSV -- confirm, or select the target by name instead.
array = data.values

Y = array[:, 0]
X = array[:, 1:]

# 50/50 hold-out split. random_state pins the shuffle so a
# Restart & Run All reproduces the same partition; without it every run
# scores the models on a different split and the printed metrics cannot
# be reproduced or compared between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=42)

# Linear Regression

In [5]:
# Ordinary least squares baseline, fitted on the training half only.
lm = LinearRegression()
lm.fit(X_train, Y_train)

# Predict both halves so a train/test gap would reveal over-fitting.
ptrain, ptest = lm.predict(X_train), lm.predict(X_test)

# MAE is in target units; MAPE is the mean absolute error relative to
# the true value, expressed as a percentage.
mae_train, mae_test = (mean_absolute_error(Y_train, ptrain),
                       mean_absolute_error(Y_test, ptest))
mape_train = 100 * np.mean(np.abs((Y_train - ptrain) / Y_train))
mape_test = 100 * np.mean(np.abs((Y_test - ptest) / Y_test))

# Text table: header line, then one line per metric.
print('|metric |train            |test             | \n'
      '|mae    |%s    |%s    |\n'
      '|mape   |%s    |%s    |'
      % (str(mae_train), str(mae_test), str(mape_train), str(mape_test)))

|metric |train            |test             | 
|mae    |3.69877857453    |3.69564205273    |
|mape   |118.355929414    |118.107319756    |


In [6]:
# Cross Validation: swap the two halves -- refit the same estimator on
# the former test half and evaluate it on the former training half.
lm.fit(X_test, Y_test)

ptrain = lm.predict(X_test)    # predictions on the half used for fitting
ptest = lm.predict(X_train)    # predictions on the held-out half

mae_train = mean_absolute_error(Y_test, ptrain)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mae_test = mean_absolute_error(Y_train, ptest)
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

# Assemble the table line by line, then print it in one call.
rows = [
    '|metric |train            |test             | ',
    '|mae    |' + str(mae_train) + '    |' + str(mae_test) + '    |',
    '|mape   |' + str(mape_train) + '    |' + str(mape_test) + '    |',
]
print('\n'.join(rows))

|metric |train            |test             | 
|mae    |3.69377823031    |3.6993573669    |
|mape   |118.037847503    |118.388289814    |


# Random Forest

In [7]:
# Random forest with 100 trees, fitted on the training half.
forest = RandomForestRegressor(n_estimators=100)
forest.fit(X_train, Y_train)

ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

# Relative errors first, then the two summary metrics per half.
rel_err_train = (Y_train - ptrain) / Y_train
rel_err_test = (Y_test - ptest) / Y_test

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs(rel_err_train)) * 100
mape_test = np.mean(np.abs(rel_err_test)) * 100

print('|metric |train            |test             | \n'
      '|mae    |{!s}    |{!s}    |\n'
      '|mape   |{!s}    |{!s}    |'.format(mae_train, mae_test,
                                           mape_train, mape_test))

|metric |train            |test             | 
|mae    |2.39717807681    |4.17500340351    |
|mape   |70.4129640538    |124.592471277    |


In [8]:
# Cross Validation: a fresh 100-tree forest is fitted on the former test
# half and evaluated on the former training half.
forest = RandomForestRegressor(n_estimators=100)
forest.fit(X_test, Y_test)

ptrain, ptest = forest.predict(X_test), forest.predict(X_train)

# "train" metrics describe the half used for fitting (X_test here),
# "test" metrics the held-out half (X_train here).
mae_train, mae_test = (mean_absolute_error(Y_test, ptrain),
                       mean_absolute_error(Y_train, ptest))
mape_train = 100 * np.mean(np.absolute((Y_test - ptrain) / Y_test))
mape_test = 100 * np.mean(np.absolute((Y_train - ptest) / Y_train))

print('|metric |train            |test             | \n'
      '|mae    |%s    |%s    |\n'
      '|mape   |%s    |%s    |'
      % (str(mae_train), str(mae_test), str(mape_train), str(mape_test)))

|metric |train            |test             | 
|mae    |2.39898309697    |4.17459238559    |
|mape   |70.3133271393    |125.2354524    |


# KNN

In [9]:
# Neighbour count for KNN: integer part of the square root of the number
# of rows currently in `data`.
n_rows = len(data.index)
k = int(sqrt(n_rows))
k

456

In [10]:
# k-nearest-neighbours regressor using the k computed above, fitted on
# the training half.
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, Y_train)

ptrain = knn.predict(X_train)
ptest = knn.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mae_test = mean_absolute_error(Y_test, ptest)
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

# Assemble the table line by line, then print it in one call.
rows = [
    '|metric |train            |test             | ',
    '|mae    |' + str(mae_train) + '    |' + str(mae_test) + '    |',
    '|mape   |' + str(mape_train) + '    |' + str(mape_test) + '    |',
]
print('\n'.join(rows))

|metric |train            |test             | 
|mae    |3.69478561805    |3.70250792262    |
|mape   |118.955741999    |118.898165942    |


In [11]:
# Cross Validation: a fresh KNN regressor (same k) is fitted on the
# former test half and evaluated on the former training half.
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_test, Y_test)

ptrain, ptest = knn.predict(X_test), knn.predict(X_train)

# Relative errors of each half against its own true values.
rel_err_train = (Y_test - ptrain) / Y_test
rel_err_test = (Y_train - ptest) / Y_train

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs(rel_err_train)) * 100
mape_test = np.mean(np.abs(rel_err_test)) * 100

print('|metric |train            |test             | \n'
      '|mae    |{!s}    |{!s}    |\n'
      '|mape   |{!s}    |{!s}    |'.format(mae_train, mae_test,
                                           mape_train, mape_test))

|metric |train            |test             | 
|mae    |3.6938135909    |3.70269309369    |
|mape   |118.525019214    |119.082422987    |


# Neural Network

In [13]:
# Standardise the inputs for the MLP. The scaler is fitted on the
# training half only and then applied to both halves.
scaler = StandardScaler()
scaler.fit(X_train)
X_train2, X_test2 = scaler.transform(X_train), scaler.transform(X_test)

# Multi-layer perceptron with three hidden layers of 50, 20 and 10 units.
mlp = MLPRegressor(hidden_layer_sizes=(50, 20, 10))
mlp.fit(X_train2, Y_train)

ptrain, ptest = mlp.predict(X_train2), mlp.predict(X_test2)

mae_train, mae_test = (mean_absolute_error(Y_train, ptrain),
                       mean_absolute_error(Y_test, ptest))
mape_train = 100 * np.mean(np.abs((Y_train - ptrain) / Y_train))
mape_test = 100 * np.mean(np.abs((Y_test - ptest) / Y_test))

print('|metric |train            |test             | \n'
      '|mae    |%s    |%s    |\n'
      '|mape   |%s    |%s    |'
      % (str(mae_train), str(mae_test), str(mape_train), str(mape_test)))

|metric |train            |test             | 
|mae    |4.9311429106    |5.03001580521    |
|mape   |129.722023831    |131.833158508    |


In [14]:
# Cross Validation: swap the two halves -- fit on the former test half,
# evaluate on the former training half.
#
# The inputs are re-standardised with a scaler fitted on X_test only.
# Reusing X_train2 / X_test2 here would standardise the fitting data with
# statistics computed from the half that is now held out, leaking
# information about the evaluation set into preprocessing. Separate names
# are used so the earlier X_train2 / X_test2 arrays are left untouched.
swap_scaler = StandardScaler()
swap_scaler.fit(X_test)
X_fit_scaled = swap_scaler.transform(X_test)
X_holdout_scaled = swap_scaler.transform(X_train)

# Same architecture as before: hidden layers of 50, 20 and 10 units.
mlp = MLPRegressor(hidden_layer_sizes=(50, 20, 10))
mlp.fit(X_fit_scaled, Y_test)
ptrain = mlp.predict(X_fit_scaled)       # half used for fitting
ptest = mlp.predict(X_holdout_scaled)    # held-out half

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |4.86377244088    |4.95278559711    |
|mape   |125.038205439    |127.790042866    |


# Improving the Best Model

In [13]:
# Recursive feature elimination wrapped around a linear regression,
# asked to keep 20 features; fitted on the training half.
# (RFE is imported in the notebook's import cell at the top, so a fresh
# kernel shows every dependency up front.)
model = LinearRegression()
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X_train, Y_train)

# Predictions come from the fitted selector itself.
ptrain = rfe.predict(X_train)
ptest = rfe.predict(X_test)

# MAE is in target units; MAPE is the mean absolute error relative to
# the true value, expressed as a percentage.
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |3.73024403713    |3.73013902322    |
|mape   |122.301253719    |121.993220591    |


In [5]:
# Columns kept for the reduced model, one group per line.
# 'unit_sales' must stay first: the following cell slices column 0 of
# this frame as the target.
selected = [
    'unit_sales',
    'onpromotion', 'transactions', 'dcoilwtico',
    # day_* columns
    'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5',
    # month_* columns (month_5 and month_12 are not in the list)
    'month_1', 'month_2', 'month_3', 'month_4', 'month_6',
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
    # place-name columns
    'Azuay', 'Bolivar', 'Chimborazo', 'Cotopaxi', 'El Oro',
    'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios',
    'Manabi', 'Pastaza', 'Pichincha', 'Santa Elena',
    'Santo Domingo de los Tsachilas',
    # holiday / event flag columns
    'regionalholiday', 'nationalother', 'nholidayspike',
    'goodfriday', 'blackfriday', 'worldcupspike',
]
features = data[selected]

In [6]:
# Flatten the reduced frame and slice by position: column 0 is
# 'unit_sales' (the first entry of `selected`), the rest are features.
array = features.values

Y = array[:, 0]
X = array[:, 1:]

# 50/50 hold-out split. random_state pins the shuffle so a
# Restart & Run All reproduces the same partition; without it every run
# scores the model on a different split and the printed metrics cannot
# be reproduced or compared between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=42)

In [7]:
# Linear regression refitted on the reduced feature set (training half).
lm = LinearRegression()
lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

# Relative errors of each half against its own true values.
rel_err_train = (Y_train - ptrain) / Y_train
rel_err_test = (Y_test - ptest) / Y_test

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs(rel_err_train)) * 100
mape_test = np.mean(np.abs(rel_err_test)) * 100

print('|metric |train            |test             | \n'
      '|mae    |{!s}    |{!s}    |\n'
      '|mape   |{!s}    |{!s}    |'.format(mae_train, mae_test,
                                           mape_train, mape_test))

|metric |train            |test             | 
|mae    |3.6988468892    |3.69777795748    |
|mape   |118.102237856    |118.606646727    |


In [8]:
# Cross Validation: swap the two halves -- refit the same estimator on
# the former test half and evaluate it on the former training half.
lm.fit(X_test, Y_test)

ptrain, ptest = lm.predict(X_test), lm.predict(X_train)

# "train" metrics describe the half used for fitting (X_test here),
# "test" metrics the held-out half (X_train here).
mae_train, mae_test = (mean_absolute_error(Y_test, ptrain),
                       mean_absolute_error(Y_train, ptest))
mape_train = 100 * np.mean(np.abs((Y_test - ptrain) / Y_test))
mape_test = 100 * np.mean(np.abs((Y_train - ptest) / Y_train))

print('|metric |train            |test             | \n'
      '|mae    |%s    |%s    |\n'
      '|mape   |%s    |%s    |'
      % (str(mae_train), str(mae_test), str(mape_train), str(mape_test)))

|metric |train            |test             | 
|mae    |3.69341279574    |3.69702355808    |
|mape   |118.275451152    |117.856044895    |


In [9]:
# Write the reduced frame to disk; index=False leaves the row index out
# of the file.
features.to_csv(path_or_buf='train_cluster3_azure.csv', index=False)