In [1]:
import pandas as pd
import numpy as np

def loadData():
    train_raw = pd.read_csv("data/train.csv", header=0).sample(frac=1) # Eduardo
    test_raw = pd.read_csv("data/test.csv", header=0) # Eduardo
    store_raw = pd.read_csv("data/store.csv", header=0) # Eduardo

    train_raw.drop("Customers", inplace=True, axis=1) # Drop customers colum because it is not present on test set.
    test_raw.drop("Id", inplace=True, axis=1) # Not relevant

    test_raw["Sales"] = [0] * len(test_raw)

    return train_raw, test_raw, store_raw
    
train_raw, test_raw, store_raw = loadData()

  if (await self.run_code(code, result,  async_=asy)):


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class VoidTransformer(BaseEstimator, TransformerMixin):
  def __init__(self):
    return
  def fit(self, X, y=None):
    return self
  def transform(self, X, y=None):
    return X

class DateSplitter(BaseEstimator, TransformerMixin):
  def __init__(self):
    return
  def fit(self, X, y=None):
    return self
  def transform(self, X, y=None):
    days = []
    for date in X["Date"]: # NOTE: Se podra paralelizar?
      year, month, day = date.split("-")
      total = int(year) * 365 + int(month) * 30 + int(day)
      days.append(total)
    return np.c_[days]

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [4]:
features_pipeline = ColumnTransformer([
                                  ("nothing1", VoidTransformer(),  ["Store"]),
                                  ("Categorical1", OneHotEncoder(), ["DayOfWeek"]),
                                  ("date", DateSplitter(), ["Date"]),
                                  ("nothing2", VoidTransformer(), ["Open", "Promo"]),
                                  # ("Categorical2", OneHotEncoder(), ["StateHoliday"]), # NOTE: Should one hot
                                  ("nothing3", VoidTransformer(), ["SchoolHoliday", "Sales"]),

])
train_prepared = features_pipeline.fit_transform(train_raw)
train_prepared = pd.DataFrame(train_prepared)
train_prepared.rename(columns = {0:'Store', 12:'Sales', 8:'Days'}, inplace = True)

test_prepared = features_pipeline.fit_transform(test_raw)
test_prepared = pd.DataFrame(test_prepared)
test_prepared.rename(columns = {0:'Store', 12:'Sales', 8:'Days'}, inplace = True)

In [5]:
nan_onehot_pipeline = Pipeline([
            ("nan_to_0", SimpleImputer(strategy="constant")),
            ("one_hot", OneHotEncoder())                    
])

store_pipeline = ColumnTransformer([
                                  ("nothing1", SimpleImputer(strategy='constant', fill_value=1), ["Store"]),
                                  ("Categorial1", nan_onehot_pipeline, ["StoreType", "Assortment"]),
                                  ("CompetitionDistance", SimpleImputer(strategy='mean'), ["CompetitionDistance"]),
                                  ("CompetitionSinceMonth", SimpleImputer(strategy='constant'), ["CompetitionOpenSinceMonth"]),
                                  ("CompetitionSinceYear", SimpleImputer(strategy='constant', fill_value=2016), ["CompetitionOpenSinceYear"]),
                                  ("nothing2", VoidTransformer(), ["Promo2"]),
                                  ("Promo2SinceWeek", SimpleImputer(strategy='constant'), ["Promo2SinceWeek"]),
                                  ("Promo2SinceYear", SimpleImputer(strategy='constant', fill_value=2016), ["Promo2SinceYear"]),
                                  ("Categorical2", nan_onehot_pipeline, ["PromoInterval"]),
])

stores_prepared = store_pipeline.fit_transform(store_raw)
stores_prepared = pd.DataFrame(stores_prepared)
stores_prepared.rename(columns = {0:'Store'}, inplace = True)

In [6]:
train = train_prepared.merge(stores_prepared, on='Store', sort=False)
test = test_prepared.merge(stores_prepared, on='Store', sort=False)

In [7]:
test_labels = test["Sales"]
test_features = test.drop("Sales", axis=1)
train_labels = train["Sales"]
train_features = train.drop("Sales", axis=1)

In [8]:
import pandas as pd
from sklearn import preprocessing

x_scaled =  preprocessing.MinMaxScaler().fit_transform(train_features.values)
train_features = pd.DataFrame(x_scaled)

In [9]:
from tensorflow.keras import backend as K

def rmspe(y_true, y_pred):
    # return K.sqrt(K.mean(K.square((y_true - y_pred) / K.clip(K.abs(y_true),K.epsilon(),None) ), axis=-1) )
    return K.sqrt(K.mean(K.square((y_true - y_pred) / (y_true+1)), axis=-1) )

def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [18]:
from tensorflow.keras import layers, Sequential, callbacks
from tensorflow.keras.optimizers import Adam
from os import path

train_features = pd.DataFrame(x_scaled)
train_features = np.expand_dims(train_features, axis=-1)
model = Sequential([
              layers.Conv1D(filters=15, kernel_size=8, dilation_rate=2, activation='relu', input_shape=(train_features.shape[1],1)),
              layers.Conv1D(filters=10, kernel_size=3, dilation_rate=2, activation='relu', input_shape=(train_features.shape[1],1)),
              layers.Conv1D(filters=5, kernel_size=3, dilation_rate=2, activation='relu', input_shape=(train_features.shape[1],1)),
              layers.Flatten(),
              layers.BatchNormalization(),
              # layers.MaxPool1D(),  
              layers.Dense(20, activation='relu'),
              layers.Dense(1)
])

model.compile(optimizer=Adam(),
              loss='mse',
              metrics=[rmspe, rmse, 'accuracy'])

checkpoint_path = "checkpoints/cp.ckpt"
checkpoint_dir = path.dirname(checkpoint_path)
cp_callback = callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)

model.fit(train_features, train_labels.values, batch_size=256, validation_split=0.15, epochs=70, callbacks=[cp_callback])      

Train on 864627 samples, validate on 152582 samples
Epoch 1/70
Epoch 00001: saving model to checkpoints/cp.ckpt
Epoch 2/70
Epoch 00002: saving model to checkpoints/cp.ckpt
Epoch 3/70
Epoch 00003: saving model to checkpoints/cp.ckpt
Epoch 4/70
Epoch 00004: saving model to checkpoints/cp.ckpt
Epoch 5/70
Epoch 00005: saving model to checkpoints/cp.ckpt
Epoch 6/70
Epoch 00006: saving model to checkpoints/cp.ckpt
Epoch 7/70
Epoch 00007: saving model to checkpoints/cp.ckpt
Epoch 8/70
Epoch 00008: saving model to checkpoints/cp.ckpt
Epoch 9/70
Epoch 00009: saving model to checkpoints/cp.ckpt
Epoch 10/70
Epoch 00010: saving model to checkpoints/cp.ckpt
Epoch 11/70
Epoch 00011: saving model to checkpoints/cp.ckpt
Epoch 12/70
Epoch 00012: saving model to checkpoints/cp.ckpt
Epoch 13/70
Epoch 00013: saving model to checkpoints/cp.ckpt
Epoch 14/70
Epoch 00014: saving model to checkpoints/cp.ckpt
Epoch 15/70
Epoch 00015: saving model to checkpoints/cp.ckpt
Epoch 16/70
Epoch 00016: saving model to c

KeyboardInterrupt: 

In [11]:
test_features = np.expand_dims(test_features, axis=-1)
predictions = model.predict(test_features)

In [12]:
# standarisar datos
# knn imputer
# RMSPError
# feature creation

In [13]:
predictions[2000:2010]

array([[3.5673318e+08],
       [3.5673190e+08],
       [3.5674810e+08],
       [3.5674115e+08],
       [3.5673952e+08],
       [3.5674134e+08],
       [3.5673840e+08],
       [3.5673021e+08],
       [3.5672883e+08],
       [3.5674365e+08]], dtype=float32)

In [14]:
test_y[2000:2010]

NameError: name 'test_y' is not defined