In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [2]:
data = pd.read_csv('train_cluster5.csv', header=0, low_memory=False)
data.head()

Unnamed: 0,unit_sales,onpromotion,transactions,dcoilwtico,day_0,day_1,day_2,day_3,day_4,day_5,...,localholiday,regionalholiday,nationalother,nholidayspike,goodfriday,blackfriday,worldcupspike,worldcupdrop,earthquakespike,earthquakedrop
0,1.0,0,3005.0,40.895,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,5523.0,82.786667,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0,1631.0,48.83,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0,2650.0,99.28,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0,2068.0,106.83,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data.dropna(inplace=True)

In [4]:
array=data.values

Y=array[:,0]
X=array[:,1:]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5)

# Linear Regression

In [5]:
lm=LinearRegression()

lm.fit(X_train, Y_train)

ptrain = lm.predict(X_train)
ptest = lm.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.522646305249    |0.534347661827    |
|mape   |36.9559092456    |37.3638001872    |


In [6]:
# Cross Validation
lm.fit(X_test, Y_test)

ptrain = lm.predict(X_test)
ptest = lm.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.537838464764    |0.535620299674    |
|mape   |38.05241166    |38.3393356977    |


# Random Forest

In [7]:
forest=RandomForestRegressor(n_estimators=100)
forest.fit(X_train,Y_train)

ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.195884302837    |0.532138591231    |
|mape   |14.0633641847    |37.7087817476    |


In [8]:
# Cross Validation
forest=RandomForestRegressor(n_estimators=100)

forest.fit(X_test, Y_test)

ptrain = forest.predict(X_test)
ptest = forest.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.203279958544    |0.539006897589    |
|mape   |14.6364186778    |39.355409783    |


# KNN

In [9]:
k = int(sqrt(len(data.index)))
k

101

In [10]:
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, Y_train)

ptrain = knn.predict(X_train)
ptest = knn.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.529245927508    |0.538909390356    |
|mape   |37.2900409574    |37.5124580083    |


In [11]:
# Cross Validation
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_test, Y_test)

ptrain = knn.predict(X_test)
ptest = knn.predict(X_train)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.541372131772    |0.540372754477    |
|mape   |38.184260517    |38.5354424655    |


# Neural Network

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train2 = scaler.transform(X_train)
X_test2 = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_train2,Y_train)
ptrain = mlp.predict(X_train2)
ptest = mlp.predict(X_test2)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.359604722829    |0.586569655799    |
|mape   |25.7856794848    |41.88039886    |


In [13]:
# Cross Validation
mlp = MLPRegressor(hidden_layer_sizes=(50,20,10))
mlp.fit(X_test2,Y_test)
ptrain = mlp.predict(X_test2)
ptest = mlp.predict(X_train2)

mae_train = mean_absolute_error(Y_test, ptrain)
mae_test = mean_absolute_error(Y_train, ptest)
mape_train = np.mean(np.abs((Y_test - ptrain) / Y_test)) * 100
mape_test = np.mean(np.abs((Y_train - ptest) / Y_train)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.408985196117    |0.577574435157    |
|mape   |29.7887270033    |42.4545084772    |


# Improving the Best Model

In [20]:
from sklearn.feature_selection import RFE
model=LinearRegression()
rfe = RFE(model, n_features_to_select=25)
rfe.fit(X_train,Y_train)

ptrain = rfe.predict(X_train)
ptest = rfe.predict(X_test)

mae_train = mean_absolute_error(Y_train, ptrain)
mae_test = mean_absolute_error(Y_test, ptest)
mape_train = np.mean(np.abs((Y_train - ptrain) / Y_train)) * 100
mape_test = np.mean(np.abs((Y_test - ptest) / Y_test)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.528494366497    |0.536626217572    |
|mape   |37.308028122    |37.4647658173    |


In [22]:
selected=['unit_sales','transactions','dcoilwtico','day_4','day_5','month_7',
          '1018','1029','1033','1041','Pichincha',
          'Santo Domingo de los Tsachilas','nationalother',
          'nholidayspike','worldcupdrop']

features = data[selected]

In [26]:
array2=features.values

Y2=array2[:,0]
X2=array2[:,1:]

X_train2,X_test2,Y_train2,Y_test2=train_test_split(X2,Y2,test_size=0.5)

In [27]:
lm=LinearRegression()

lm.fit(X_train2, Y_train2)

ptrain = lm.predict(X_train2)
ptest = lm.predict(X_test2)

mae_train = mean_absolute_error(Y_train2, ptrain)
mae_test = mean_absolute_error(Y_test2, ptest)
mape_train = np.mean(np.abs((Y_train2 - ptrain) / Y_train2)) * 100
mape_test = np.mean(np.abs((Y_test2 - ptest) / Y_test2)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.538622531977    |0.532172068532    |
|mape   |38.0694741325    |37.9295483283    |


In [28]:
# Cross Validation
lm.fit(X_test2, Y_test2)

ptrain = lm.predict(X_test2)
ptest = lm.predict(X_train2)

mae_train = mean_absolute_error(Y_test2, ptrain)
mae_test = mean_absolute_error(Y_train2, ptest)
mape_train = np.mean(np.abs((Y_test2 - ptrain) / Y_test2)) * 100
mape_test = np.mean(np.abs((Y_train2 - ptest) / Y_train2)) * 100

print('|metric |train            |test             | \n|mae    |' +str(mae_train)
      +'    |'+str(mae_test)+'    |\n|mape   |'+str(mape_train)+'    |'+str(mape_test)
      +'    |')

|metric |train            |test             | 
|mae    |0.526229836991    |0.535527237801    |
|mape   |37.2447422718    |37.5322677938    |


In [29]:
features.to_csv('train_cluster5_azure.csv', index=False)