In [0]:
# virtualenv -p python3 venv
# source venv/bin/activate
# pip install pandas numpy scikit-learn tensorflow jupyter ipykernel
# python -m ipykernel install --user --name venv --display-name "My Env"

import pandas as pd
import numpy as np

# Fixed seed so the row shuffle below gives the same order after every kernel restart.
SHUFFLE_SEED = 42

train_raw = pd.read_csv("train.csv", header=0).sample(frac=1, random_state=SHUFFLE_SEED) # Eduardo
test_raw = pd.read_csv("test.csv", header=0) # Eduardo
store_raw = pd.read_csv("store.csv", header=0) # Eduardo
#data_train= pd.read_csv("/content/drive/My Drive/Kaggle/train.csv",header=0) #David


# Drop the Customers column because it is not present in the test set.
train_raw = train_raw.drop(columns="Customers")
# The Id column is not relevant as a feature.
test_raw = test_raw.drop(columns="Id")

# Add a placeholder Sales column (all zeros) to the test set so that it has the
# same columns as the training set and can go through the same transformations.
test_raw["Sales"] = 0

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class VoidTransformer(BaseEstimator, TransformerMixin):
  """Pass-through transformer: ``transform`` hands back its input untouched.

  It has no parameters and learns nothing during ``fit``.
  """

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """Nothing to learn; return the transformer itself."""
    return self

  def transform(self, X, y=None):
    """Return ``X`` as received (the same object, not a copy)."""
    return X

class DateSplitter(BaseEstimator, TransformerMixin):
  """Convert the ``Date`` column into a single numeric day-count feature.

  Each ``YYYY-MM-DD`` date becomes the number of calendar days elapsed since
  1970-01-01. An approximation such as ``year * 365 + month * 30 + day`` is
  deliberately avoided: it maps distinct dates to the same value (January 31st
  and February 1st collide) and is not evenly spaced across month boundaries.
  """

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """Nothing to learn; return the transformer itself."""
    return self

  def transform(self, X, y=None):
    """Return an ``(n_samples, 1)`` integer array of days since 1970-01-01.

    ``X`` must expose a ``"Date"`` column holding ``YYYY-MM-DD`` values.
    """
    # Vectorised parsing: no Python-level loop over the rows is needed.
    dates = pd.to_datetime(X["Date"], format="%Y-%m-%d")
    days = (dates - pd.Timestamp("1970-01-01")).dt.days
    return days.to_numpy().reshape(-1, 1)

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [0]:
# Positions of the named columns in the array produced by features_pipeline:
# 0 is Store, 8 is the day count from DateSplitter and 12 is Sales. This layout
# assumes DayOfWeek one-hot encodes into seven columns (positions 1-7).
PREPARED_COLUMN_NAMES = {0:'Store', 12:'Sales', 8:'Days'}

features_pipeline = ColumnTransformer([
                                  ("nothing1", VoidTransformer(),  ["Store"]),
                                  ("Categorical1", OneHotEncoder(), ["DayOfWeek"]),
                                  ("date", DateSplitter(), ["Date"]),
                                  ("nothing2", VoidTransformer(), ["Open", "Promo"]),
                                  # ("Categorical2", OneHotEncoder(), ["StateHoliday"]), # NOTE: Should one hot
                                  ("nothing3", VoidTransformer(), ["SchoolHoliday", "Sales"]),

], sparse_threshold=0)  # always return a dense array so it can be wrapped in a DataFrame

# Fit the transformers on the training data only.
train_prepared = pd.DataFrame(features_pipeline.fit_transform(train_raw))
train_prepared = train_prepared.rename(columns=PREPARED_COLUMN_NAMES)

# Re-use the transformers fitted on the training data. Re-fitting on the test
# set could yield a different one-hot layout (e.g. if a category is absent),
# silently shifting every column the model was trained on.
test_prepared = pd.DataFrame(features_pipeline.transform(test_raw))
test_prepared = test_prepared.rename(columns=PREPARED_COLUMN_NAMES)

In [0]:
# Categorical helper: fill missing entries with SimpleImputer's constant
# strategy (default fill value), then one-hot encode the result.
nan_onehot_pipeline = Pipeline(steps=[
    ("nan_to_0", SimpleImputer(strategy="constant")),
    ("one_hot", OneHotEncoder()),
])

# Per-column preparation of the store table. The output column order follows
# the order of the entries below, so "Store" ends up in position 0.
store_pipeline = ColumnTransformer(transformers=[
    ("nothing1", SimpleImputer(strategy='constant', fill_value=1), ["Store"]),
    ("Categorial1", nan_onehot_pipeline, ["StoreType", "Assortment"]),
    ("CompetitionDistance", SimpleImputer(strategy='mean'), ["CompetitionDistance"]),
    ("CompetitionSinceMonth", SimpleImputer(strategy='constant'), ["CompetitionOpenSinceMonth"]),
    ("CompetitionSinceYear", SimpleImputer(strategy='constant', fill_value=2016), ["CompetitionOpenSinceYear"]),
    ("nothing2", VoidTransformer(), ["Promo2"]),
    ("Promo2SinceWeek", SimpleImputer(strategy='constant'), ["Promo2SinceWeek"]),
    ("Promo2SinceYear", SimpleImputer(strategy='constant', fill_value=2016), ["Promo2SinceYear"]),
    ("Categorical2", nan_onehot_pipeline, ["PromoInterval"]),
])

# Transform the store table and label the key column used for merging later.
stores_prepared = pd.DataFrame(store_pipeline.fit_transform(store_raw))
stores_prepared = stores_prepared.rename(columns={0: 'Store'})

In [0]:
# Attach the prepared store attributes to every train/test row via the shared
# Store key (inner join, which is also the pandas default).
train = pd.merge(train_prepared, stores_prepared, how='inner', on='Store', sort=False)
test = pd.merge(test_prepared, stores_prepared, how='inner', on='Store', sort=False)

In [0]:
# Separate the Sales target from the model inputs for both data sets.
train_labels, test_labels = train["Sales"], test["Sales"]
train_features = train.drop(columns="Sales")
test_features = test.drop(columns="Sales")

In [0]:
import pandas as pd
from sklearn import preprocessing

# Keep a handle on the fitted scaler: the min/max ranges learned from the
# training features must be re-applied to any other data fed to the model.
feature_scaler = preprocessing.MinMaxScaler()
x_scaled = feature_scaler.fit_transform(train_features.values)
# train_features = pd.DataFrame(x_scaled)

In [0]:
from tensorflow.keras import backend as K

def rmspe(y_true, y_pred):
    """Root mean squared percentage error over the whole batch.

    The denominator is ``y_true + 1`` rather than ``y_true`` so that targets
    equal to zero do not cause a division by zero.

    The mean is taken over every element of the batch, as in ``rmse``.
    Reducing only along the last axis would take the square root per sample,
    which turns the metric into a mean absolute percentage error.
    """
    # Alternative denominator that clips instead of shifting:
    # K.clip(K.abs(y_true), K.epsilon(), None)
    relative_error = (y_true - y_pred) / (y_true + 1)
    return K.sqrt(K.mean(K.square(relative_error)))

def rmse(y_true, y_pred):
    """Root mean squared error between predictions and targets."""
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [0]:
from tensorflow.keras import layers, Sequential
# Add a trailing axis of size 1 so each sample has shape (n_features, 1),
# matching the input_shape given to the Conv1D layer below.
train_features = np.asarray(x_scaled)[..., np.newaxis]
input_shape = train_features.shape[1]

# Small network: one 1-D convolution over the feature axis, then two dense layers.
model = Sequential()
model.add(layers.Conv1D(filters=20, kernel_size=3, input_shape=(train_features.shape[1], 1)))
model.add(layers.Flatten())
# model.add(layers.MaxPool1D())
model.add(layers.Dense(20, activation='sigmoid'))
model.add(layers.Dense(1))

In [0]:
from tensorflow.keras.optimizers import Adam
# Only regression metrics are tracked. Classification accuracy is meaningless
# for a continuous target such as Sales, so it is not part of the metric list.
model.compile(optimizer=Adam(),
              loss='mse',
              metrics=[rmspe, rmse])

In [0]:
# Train for 10 epochs in batches of 32, holding out 10% of the samples for validation.
model.fit(
    x=train_features,
    y=train_labels.to_numpy(),
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

In [0]:
# The model was trained on min-max scaled inputs with a trailing channel axis,
# so the test features need the same preparation before prediction.
# The scaler is fitted on the unscaled training features taken from `train`,
# because `train_features` has already been replaced by the scaled 3-D array.
test_scaler = preprocessing.MinMaxScaler().fit(train.drop("Sales", axis=1).values)
test_inputs = np.expand_dims(test_scaler.transform(test_features.values), axis=-1)
predictions = model.predict(test_inputs)

In [0]:
# TODO: standardize the data
# TODO: try a KNN imputer
# TODO: use RMSPE as the error metric
# TODO: feature creation

In [0]:
# Show ten of the model's predictions (positions 2000-2009) as a spot check.
predictions[2000:2010]

In [0]:
# `test_y` is never defined in this notebook; the test targets are held in
# `test_labels`. Note that these are the placeholder zeros added to the test
# set at load time, not real sales figures.
test_labels[2000:2010]