In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the training extract; the first row of the file supplies the column
# names. low_memory=False makes pandas parse the file in one pass instead of
# in chunks, so each column gets a single inferred dtype.
data = pd.read_csv(
    'train_cluster4.csv',
    low_memory=False,
    header=0,
)

# Show the first five rows as a quick sanity check of the columns.
data.head()

Unnamed: 0,unit_sales,onpromotion,transactions,dcoilwtico,day_0,day_1,day_2,day_3,day_4,day_5,...,localholiday,regionalholiday,nationalother,nholidayspike,goodfriday,blackfriday,worldcupspike,worldcupdrop,earthquakespike,earthquakedrop
0,6.0,0,952.0,44.39,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,702.0,94.75,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,7.0,0,2332.0,35.91,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,7.0,0,1620.0,45.303333,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,0,4310.0,50.05,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Discard every row that contains at least one missing value. The name is
# rebound to the filtered frame instead of mutating with inplace=True, which
# keeps the step free of in-place side effects on the frame that was loaded
# and previewed earlier.
data = data.dropna()

In [4]:
# Flatten the frame to a plain 2-D array: column 0 is used as the regression
# target and every remaining column as a predictor.
# NOTE(review): column 0 is presumably unit_sales (first data column in the
# preview) -- confirm the CSV does not carry a leading index column.
array = data.values

Y = array[:, 0]
X = array[:, 1:]

# 50/50 hold-out split. random_state pins the shuffle so that the models
# below are fitted and scored on the same rows every time the notebook is
# re-run; without it every metric changes from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=42)

# Linear Regression

In [5]:
# Baseline: ordinary least squares fitted on the training half.
lm = LinearRegression()
lm.fit(X_train, Y_train)

fit_pred = lm.predict(X_train)
holdout_pred = lm.predict(X_test)

# Mean absolute error on the fitted half and on the held-out half.
mae_fit = mean_absolute_error(Y_train, fit_pred)
mae_holdout = mean_absolute_error(Y_test, holdout_pred)

# Mean absolute percentage error, computed by hand. The target is the
# denominator, so a zero target value would yield inf/nan here.
rel_err_fit = (Y_train - fit_pred) / Y_train
rel_err_holdout = (Y_test - holdout_pred) / Y_test
mape_fit = np.mean(np.abs(rel_err_fit)) * 100
mape_holdout = np.mean(np.abs(rel_err_holdout)) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
report = ''.join([
    '|metric |train            |test             | \n|mae    |', str(mae_fit),
    '    |', str(mae_holdout),
    '    |\n|mape   |', str(mape_fit),
    '    |', str(mape_holdout),
    '    |',
])
print(report)

|metric |train            |test             | 
|mae    |1.60452993812    |1.60753634475    |
|mape   |75.6795277545    |75.8338208977    |


In [6]:
# Cross Validation
# Second fold of a manual two-fold swap: the existing estimator is re-fitted
# on the held-out half and scored against the original training half. The
# "train" column therefore reports the half the model was fitted on (X_test).
lm.fit(X_test, Y_test)

fit_pred = lm.predict(X_test)
holdout_pred = lm.predict(X_train)

mae_fit = mean_absolute_error(Y_test, fit_pred)
mae_holdout = mean_absolute_error(Y_train, holdout_pred)

# MAPE by hand: mean of |error / actual|, expressed in percent.
abs_pct_fit = np.abs((Y_test - fit_pred) / Y_test)
abs_pct_holdout = np.abs((Y_train - holdout_pred) / Y_train)
mape_fit = np.mean(abs_pct_fit) * 100
mape_holdout = np.mean(abs_pct_holdout) * 100

cell_end = '    |'
print('|metric |train            |test             | \n|mae    |'
      + str(mae_fit) + cell_end + str(mae_holdout)
      + '    |\n|mape   |'
      + str(mape_fit) + cell_end + str(mape_holdout) + cell_end)

|metric |train            |test             | 
|mae    |1.60777786903    |1.60563785684    |
|mape   |75.8335080017    |75.722762408    |


# Random Forest

In [7]:
# Random forest of 100 trees fitted on the training half. random_state pins
# the bootstrap sampling and feature sub-sampling so the reported scores can
# be reproduced on a re-run.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
forest.fit(X_train, Y_train)

ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

# MAE from scikit-learn; MAPE computed by hand as mean(|error / actual|) in
# percent (a zero target value would give inf/nan).
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.738743157124    |1.70469996145    |
|mape   |33.7362902051    |78.8860623088    |


In [8]:
# Cross Validation
# Second fold of a manual two-fold swap: a fresh forest is fitted on the
# held-out half and scored against the original training half. random_state
# pins the forest's internal randomness so the scores are reproducible.
forest = RandomForestRegressor(n_estimators=100, random_state=42)

forest.fit(X_test, Y_test)

# "train" below means the half the model was fitted on in this fold (X_test).
ptrain = forest.predict(X_test)
ptest = forest.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
# MAPE by hand: mean(|error / actual|) in percent.
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.7391441518    |1.70487898669    |
|mape   |33.7852952417    |78.8446059127    |


# KNN

In [9]:
# Neighbour count for KNN: integer part of the square root of the number of
# rows in `data` (the whole frame, not only the training half).
n_rows = data.shape[0]
k = int(sqrt(n_rows))
k

708

In [10]:
# k-nearest-neighbours regressor using the k computed above, fitted on the
# training half. fit() returns the estimator itself, so it can be chained.
knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, Y_train)

seen_pred = knn.predict(X_train)
unseen_pred = knn.predict(X_test)

# Mean absolute error on both halves.
mae_seen = mean_absolute_error(Y_train, seen_pred)
mae_unseen = mean_absolute_error(Y_test, unseen_pred)

# Mean absolute percentage error by hand; the target is the denominator.
pct_seen = np.abs((Y_train - seen_pred) / Y_train)
pct_unseen = np.abs((Y_test - unseen_pred) / Y_test)
mape_seen = np.mean(pct_seen) * 100
mape_unseen = np.mean(pct_unseen) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
table = ''.join([
    '|metric |train            |test             | \n|mae    |', str(mae_seen),
    '    |', str(mae_unseen),
    '    |\n|mape   |', str(mape_seen),
    '    |', str(mape_unseen),
    '    |',
])
print(table)

|metric |train            |test             | 
|mae    |1.60229017465    |1.60715162962    |
|mape   |75.6890523276    |75.9029048664    |


In [11]:
# Cross Validation
# Second fold of a manual two-fold swap: a new KNN model is fitted on the
# held-out half and evaluated on the original training half.
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_test, Y_test)

# "seen" = the half the model was fitted on in this fold (X_test).
seen_pred = knn.predict(X_test)
unseen_pred = knn.predict(X_train)

mae_seen = mean_absolute_error(Y_test, seen_pred)
mae_unseen = mean_absolute_error(Y_train, unseen_pred)

# MAPE by hand: mean of |error / actual|, in percent.
mape_seen = np.mean(np.abs((Y_test - seen_pred) / Y_test)) * 100
mape_unseen = np.mean(np.abs((Y_train - unseen_pred) / Y_train)) * 100

cell_end = '    |'
print('|metric |train            |test             | \n|mae    |'
      + str(mae_seen) + cell_end + str(mae_unseen)
      + '    |\n|mape   |'
      + str(mape_seen) + cell_end + str(mape_unseen) + cell_end)

|metric |train            |test             | 
|mae    |1.60567745642    |1.60544399243    |
|mape   |75.7771465241    |75.7772003433    |


# Neural Network

In [12]:
# Standardise the predictors before the MLP. The scaler learns its statistics
# from the training half only and is then applied unchanged to the held-out
# half. The names scaler / X_train2 / X_test2 are kept because the following
# cell reads them.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)

# Three hidden layers of 50, 20 and 10 units. random_state pins the weight
# initialisation and batch sampling so the scores can be reproduced.
mlp = MLPRegressor(hidden_layer_sizes=(50,20,10), random_state=42)
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

# MAE from scikit-learn; MAPE computed by hand as mean(|error / actual|) in
# percent (a zero target value would give inf/nan).
mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.59689036649    |1.63903255609    |
|mape   |76.6660479889    |78.6896245272    |


In [13]:
# Cross Validation
# Second fold of a manual two-fold swap: fit on the held-out half, evaluate
# on the original training half.
#
# The scaler is re-fitted on the half used for fitting in THIS fold (X_test).
# Reusing a scaler fitted on X_train would let statistics from this fold's
# evaluation data leak into preprocessing. The scaled matrices get their own
# names so this cell does not depend on X_train2 / X_test2, which are rebound
# to different data further down the notebook.
fold_scaler = StandardScaler()
X_fit_scaled = fold_scaler.fit_transform(X_test)
X_eval_scaled = fold_scaler.transform(X_train)

# Same architecture as the first fold; random_state pins the weight
# initialisation and batch sampling so the scores can be reproduced.
mlp = MLPRegressor(hidden_layer_sizes=(50,20,10), random_state=42)
mlp.fit(X_fit_scaled,Y_test)

# "train" below means the half the model was fitted on in this fold (X_test).
ptrain = mlp.predict(X_fit_scaled)
ptest = mlp.predict(X_eval_scaled)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
# MAPE by hand: mean(|error / actual|) in percent.
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.57124687876    |1.60884851009    |
|mape   |73.3567182632    |75.1654819134    |


# Improving the Best Model

In [14]:
from sklearn.feature_selection import RFE

# Recursive feature elimination wrapped around a linear model, keeping 20
# predictors. The fitted selector is then used directly for prediction.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=20)
rfe.fit(X_train, Y_train)

fit_pred = rfe.predict(X_train)
holdout_pred = rfe.predict(X_test)

# Mean absolute error on the fitted half and on the held-out half.
mae_fit = mean_absolute_error(Y_train, fit_pred)
mae_holdout = mean_absolute_error(Y_test, holdout_pred)

# Mean absolute percentage error by hand; the target is the denominator.
rel_err_fit = (Y_train - fit_pred) / Y_train
rel_err_holdout = (Y_test - holdout_pred) / Y_test
mape_fit = np.mean(np.abs(rel_err_fit)) * 100
mape_holdout = np.mean(np.abs(rel_err_holdout)) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
report = ''.join([
    '|metric |train            |test             | \n|mae    |', str(mae_fit),
    '    |', str(mae_holdout),
    '    |\n|mape   |', str(mape_fit),
    '    |', str(mape_holdout),
    '    |',
])
print(report)

|metric |train            |test             | 
|mae    |1.61660527626    |1.62091060611    |
|mape   |78.1119092572    |78.272355584    |


In [17]:
# Hand-picked column subset. The target comes first so that positional
# slicing of the resulting array (column 0 = target) keeps working.
target_col = ['unit_sales']
driver_cols = ['onpromotion', 'transactions', 'dcoilwtico']
weekday_cols = ['day_0', 'day_1', 'day_2', 'day_3', 'day_4']
month_cols = ['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
              'month_7', 'month_8', 'month_9', 'month_10', 'month_11']
# Numeric column labels -- presumably item/category code indicators; TODO confirm.
code_cols = ['1025', '1028', '1032', '1036', '1038', '1039', '1044',
             '1048', '1054', '1056', '1078', '1086', '1092']
# Presumably one-hot location indicators; TODO confirm against the data source.
place_cols = ['Azuay', 'Bolivar', 'Cotopaxi', 'El Oro',
              'Esmeraldas', 'Guayas', 'Imbabura', 'Los Rios', 'Manabi', 'Pastaza',
              'Santa Elena', 'Santo Domingo de los Tsachilas']
event_cols = ['nationalother', 'nholidayspike', 'earthquakespike']

# Concatenation order fixes the column order of the reduced frame.
selected = (target_col + driver_cols + weekday_cols + month_cols
            + code_cols + place_cols + event_cols)

features = data[selected]

In [18]:
# Same target/predictor layout as the full-feature run, applied to the
# reduced frame: column 0 ('unit_sales', the first entry of `selected`) is
# the target and the remaining columns are the predictors.
array2 = features.values

Y2 = array2[:, 0]
X2 = array2[:, 1:]

# 50/50 hold-out split with a pinned random_state so the reduced-feature
# scores can be reproduced on a re-run.
# NOTE: this rebinds X_train2 / X_test2, which the neural-network section
# used for its scaled matrices; cells from that section must not be re-run
# after this point without re-running the scaling cell first.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X2, Y2, test_size=0.5, random_state=42)

In [19]:
# Linear regression on the reduced feature set, fitted on its training half.
# fit() returns the estimator itself, so construction and fitting are chained.
lm = LinearRegression().fit(X_train2, Y_train2)

seen_pred = lm.predict(X_train2)
unseen_pred = lm.predict(X_test2)

# Mean absolute error on both halves.
mae_seen = mean_absolute_error(Y_train2, seen_pred)
mae_unseen = mean_absolute_error(Y_test2, unseen_pred)

# Mean absolute percentage error by hand; the target is the denominator, so
# a zero target value would yield inf/nan.
pct_seen = np.abs((Y_train2 - seen_pred) / Y_train2)
pct_unseen = np.abs((Y_test2 - unseen_pred) / Y_test2)
mape_seen = np.mean(pct_seen) * 100
mape_unseen = np.mean(pct_unseen) * 100

# Two-row text table: MAE then MAPE, train column followed by test column.
cell_end = '    |'
print('|metric |train            |test             | \n|mae    |'
      + str(mae_seen) + cell_end + str(mae_unseen)
      + '    |\n|mape   |'
      + str(mape_seen) + cell_end + str(mape_unseen) + cell_end)

|metric |train            |test             | 
|mae    |1.60600070638    |1.60631672702    |
|mape   |75.7493212752    |75.8171441317    |


In [20]:
# Cross Validation
# Second fold of a manual two-fold swap on the reduced feature set: the
# existing estimator is re-fitted on the held-out half and scored against the
# half it was originally fitted on.
lm.fit(X_test2, Y_test2)

# "fit" = the half used for fitting in this fold (X_test2).
fit_pred = lm.predict(X_test2)
holdout_pred = lm.predict(X_train2)

mae_fit = mean_absolute_error(Y_test2, fit_pred)
mae_holdout = mean_absolute_error(Y_train2, holdout_pred)

# MAPE by hand: mean of |error / actual|, expressed in percent.
rel_err_fit = (Y_test2 - fit_pred) / Y_test2
rel_err_holdout = (Y_train2 - holdout_pred) / Y_train2
mape_fit = np.mean(np.abs(rel_err_fit)) * 100
mape_holdout = np.mean(np.abs(rel_err_holdout)) * 100

report = ''.join([
    '|metric |train            |test             | \n|mae    |', str(mae_fit),
    '    |', str(mae_holdout),
    '    |\n|mape   |', str(mape_fit),
    '    |', str(mape_holdout),
    '    |',
])
print(report)

|metric |train            |test             | 
|mae    |1.60631141386    |1.60650195221    |
|mape   |75.7705966149    |75.7257286507    |


In [21]:
# Write the reduced frame (target plus selected predictors) to disk.
features.to_csv(
    'train_cluster4_azure.csv',
    index=False,  # leave the row index out of the file
)