In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [2]:
data = pd.read_csv('train_cluster0.csv', header=0, low_memory=False)
data.dropna(inplace=True)

In [3]:
array=data.values

Y=array[:,0]
X=array[:,1:]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5)

# Linear Regression

In [8]:
lm=LinearRegression()

lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.09045943211    |1.09007295475    |
|mape   |60.472444309    |60.5225811096    |


In [9]:
# Cross Validation
lm.fit(X_test, Y_test)

ptrain = lm.predict(X_test)
ptest = lm.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.08928277246    |1.09021973345    |
|mape   |60.4365528978    |60.419177672    |


# Random Forest

In [10]:
forest=RandomForestRegressor(n_estimators=100)
forest.fit(X_train,Y_train)

ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.586509085112    |1.17282732538    |
|mape   |31.6844407907    |64.3759440324    |


In [11]:
# Cross Validation
forest=RandomForestRegressor(n_estimators=100)

forest.fit(X_test, Y_test)

ptrain = forest.predict(X_test)
ptest = forest.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.586136685444    |1.17318312479    |
|mape   |31.6833512307    |64.3324235297    |


# KNN

In [12]:
k = int(sqrt(len(data.index)))
k

708

In [13]:
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, Y_train)

ptrain = knn.predict(X_train)
ptest = knn.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.09303648359    |1.09404819035    |
|mape   |60.6597187895    |60.7878175643    |


In [14]:
# Cross Validation
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_test, Y_test)

ptrain = knn.predict(X_test)
ptest = knn.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.09206700736    |1.09377428158    |
|mape   |60.6341074444    |60.6580746906    |


# Neural Network

In [15]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train2 = scaler.transform(X_train)
X_test2 = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.06695448418    |1.08480978783    |
|mape   |58.6629946381    |59.7490761679    |


In [16]:
# Cross Validation
mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_test2,Y_test)
ptrain = mlp.predict(X_test2)
ptest = mlp.predict(X_train2)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.05207421332    |1.07357050058    |
|mape   |56.7924347653    |57.9313085831    |


# Improving the Best Model

In [17]:
mlp = MLPRegressor(hidden_layer_sizes=(200,20,10))
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.02206984239    |1.06347305377    |
|mape   |53.4843860067    |55.8957477593    |


In [20]:
mlp = MLPRegressor(hidden_layer_sizes=(200,20,10), max_iter=100)
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')



|metric |train            |test             | 
|mae    |1.03576491823    |1.08407829405    |
|mape   |55.7739316089    |58.5521850069    |


In [None]:
mlp = MLPRegressor(hidden_layer_sizes=(200,20,10), max_iter=400)
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.05519022857    |1.10782914081    |
|mape   |59.2817875064    |62.2384289756    |


In [4]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train2 = scaler.transform(X_train)
X_test2 = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(200,20,10), learning_rate='adaptive')
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.04625779031    |1.10390372501    |
|mape   |58.0700223066    |61.2616818396    |


In [5]:
mlp = MLPRegressor(max_iter=100, learning_rate='adaptive')
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.07166244391    |1.09048731546    |
|mape   |59.2293555194    |60.2417032174    |


In [6]:
mlp = MLPRegressor()
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.06854420354    |1.08347349993    |
|mape   |58.7415517042    |59.5344322539    |


In [4]:
selected = ['unit_sales','onpromotion','transactions','dcoilwtico',
            'day_0','day_1','day_2','day_3','day_4','day_5','month_1',
            'month_2','month_3','month_7','month_8','month_10','1002',
            '1006','1030','1035','1050','1052','1062','1064',
            '1068','1075','1079','1082','1087','Azuay','Bolivar',
            'Chimborazo','Cotopaxi','El Oro','Esmeraldas','Guayas',
            'Loja','Los Rios','Manabi','Santa Elena']
features = data[selected]

In [5]:
array=features.values

Y=array[:,0]
X=array[:,1:]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5)

In [7]:
lm=LinearRegression()

lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.09098407143    |1.08988917219    |
|mape   |60.5112969248    |60.5157934601    |


In [8]:
# Cross Validation
lm.fit(X_test, Y_test)

ptrain = lm.predict(X_test)
ptest = lm.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.08903641482    |1.09025388393    |
|mape   |60.4164162016    |60.4172589014    |


In [6]:
features.to_csv('train_cluster0_azure.csv', index=False)