In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [2]:
data = pd.read_csv('train_cluster6.csv', header=0, low_memory=False)
data.dropna(inplace=True)
data.head()

Unnamed: 0,unit_sales,onpromotion,transactions,dcoilwtico,day_0,day_1,day_2,day_3,day_4,day_5,...,localholiday,regionalholiday,nationalother,nholidayspike,goodfriday,blackfriday,worldcupspike,worldcupdrop,earthquakespike,earthquakedrop
0,4.0,0,883.0,38.963333,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,0,1959.0,59.75,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,6.0,0,1696.0,53.61,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,0,716.0,67.3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,0,3155.0,40.4,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
array=data.values

Y=array[:,0]
X=array[:,1:]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5)

# Linear Regression

In [4]:
lm=LinearRegression()

lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.6238135117    |2.61366682971    |
|mape   |99.1629727511    |98.8520829999    |


In [5]:
# Cross Validation
lm.fit(X_test, Y_test)

ptrain = lm.predict(X_test)
ptest = lm.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.61300538458    |2.62379137033    |
|mape   |98.6884940295    |99.0253976105    |


# Random Forest

In [6]:
forest=RandomForestRegressor(n_estimators=100)
forest.fit(X_train,Y_train)

ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.51473382888    |2.88523676887    |
|mape   |53.5266282479    |104.118826662    |


In [7]:
# Cross Validation
forest=RandomForestRegressor(n_estimators=100)

forest.fit(X_test, Y_test)

ptrain = forest.predict(X_test)
ptest = forest.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |1.5087844376    |2.88624247841    |
|mape   |53.2006070685    |103.775197894    |


# KNN

In [8]:
k = int(sqrt(len(data.index)))
k

708

In [9]:
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, Y_train)

ptrain = knn.predict(X_train)
ptest = knn.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.61994037669    |2.61468086213    |
|mape   |99.2721267787    |99.1319560356    |


In [10]:
# Cross Validation
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_test, Y_test)

ptrain = knn.predict(X_test)
ptest = knn.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.61027084165    |2.62322264787    |
|mape   |98.8083711721    |99.2381282489    |


# Neural Network

In [11]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train2 = scaler.transform(X_train)
X_test2 = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.61291117092    |2.63434320948    |
|mape   |99.264999157    |100.178377634    |


In [12]:
# Cross Validation
mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_test2,Y_test)
ptrain = mlp.predict(X_test2)
ptest = mlp.predict(X_train2)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.55348329338    |2.60630369112    |
|mape   |94.1767868821    |96.0971562942    |


# Improving the Best Model

In [13]:
from sklearn.feature_selection import RFE
model=LinearRegression()
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X_train,Y_train)

ptrain = rfe.predict(X_train)
ptest = rfe.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.64748683283    |2.63767062677    |
|mape   |102.40855769    |102.089844997    |


In [14]:
test = data.drop('day_6', axis=1)
test.drop('month_12',axis=1)
test.drop('1088',axis=1)
test.drop('Guayas', axis=1)
test.drop('Santo Domingo de los Tsachilas', axis=1)
test.drop('Tungurahua', axis=1)
test.drop('localholiday', axis=1)
test.drop('blackfriday', axis=1)
test.drop('worldcupspike', axis=1)
test.drop('worldcupdrop', axis=1)

Unnamed: 0,unit_sales,onpromotion,transactions,dcoilwtico,day_0,day_1,day_2,day_3,day_4,day_5,...,Tungurahua,localholiday,regionalholiday,nationalother,nholidayspike,goodfriday,blackfriday,worldcupspike,earthquakespike,earthquakedrop
0,4.0,0,883.0,38.963333,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,0,1959.0,59.750000,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,6.0,0,1696.0,53.610000,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,0,716.0,67.300000,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,0,3155.0,40.400000,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,13.0,0,3705.0,92.300000,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6.0,0,1412.0,96.660000,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,13.0,0,655.0,97.180000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,13.0,1,2694.0,49.370000,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,7.0,0,1837.0,32.980000,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
array2=test.values

Y2=array2[:,0]
X2=array2[:,1:]

X_train2,X_test2,Y_train2,Y_test2=train_test_split(X2,Y2,test_size=0.5)

In [17]:
lm=LinearRegression()

lm.fit(X_train2, Y_train2)

ptrain = lm.predict(X_train2)
ptest = lm.predict(X_test2)

mae_train = mean_absolute_error(Y_train2, ptrain)
mae_test = mean_absolute_error(Y_test2, ptest)
mape_train = np.mean(np.abs((Y_train2 - ptrain) / Y_train2)) * 100
mape_test = np.mean(np.abs((Y_test2 - ptest) / Y_test2)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.61517596679    |2.6199364697    |
|mape   |98.8856303933    |98.8909117119    |


In [18]:
# Cross Validation
lm.fit(X_test2, Y_test2)

ptrain = lm.predict(X_test2)
ptest = lm.predict(X_train2)

mae_train = mean_absolute_error(Y_test2, ptrain)
mae_test = mean_absolute_error(Y_train2, ptest)
mape_train = np.mean(np.abs((Y_test2 - ptrain) / Y_test2)) * 100
mape_test = np.mean(np.abs((Y_train2 - ptest) / Y_train2)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |2.62144212892    |2.61773556085    |
|mape   |98.9595136051    |98.9895980603    |


In [19]:
test.to_csv('train_cluster6_azure.csv', index=False)