In [4]:
import csv
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
import sys

def csv2dicts(csvfile):
    """Turn an iterable of CSV rows into a list of per-row dictionaries.

    The first row is treated as the header: it is printed and supplies the
    dictionary keys. Every later row is zipped with the header, so a row
    that is shorter or longer than the header is truncated to the shorter
    of the two.

    Args:
        csvfile: iterable yielding rows as sequences (e.g. a ``csv.reader``).

    Returns:
        A list with one dict per data row, in input order. Empty when the
        input has no data rows.
    """
    rows = iter(csvfile)
    try:
        header = next(rows)
    except StopIteration:
        # No rows at all: nothing to print and nothing to return.
        return []
    print(header)
    return [dict(zip(header, fields)) for fields in rows]


def set_nan_as_string(data, replace_str='0'):
    """Replace empty-string values with ``replace_str``, in place.

    Args:
        data: mutable sequence of dicts; both the dicts and the sequence
            slots are written to.
        replace_str: value stored wherever a dict value equals ``''``.

    Returns:
        None. ``data`` is modified in place.
    """
    for position, record in enumerate(data):
        # Collect the keys first, then overwrite them.
        blank_fields = [field for field, content in record.items() if content == '']
        for field in blank_fields:
            record[field] = replace_str
        data[position] = record

In [5]:
# Input CSV locations. The paths are relative, so they resolve against the
# notebook's working directory.
test_data_filename = "rossmann/test.csv"
train_data_filename = "rossmann/train.csv"
store_data_filename = "rossmann/store.csv"
# Per-store state table; its header is printed as ['Store', 'State'] when loaded.
store_states_filename = 'rossmann/store_states.csv'

In [6]:
# Parse the training CSV into a list of dicts. The context manager closes
# the file handle as soon as parsing is finished; newline='' is the mode the
# csv module expects for files it reads.
with open(train_data_filename, newline='') as csvfile:
    train_data = csv2dicts(csv.reader(csvfile, delimiter=','))

# Reverse the file's row order.
train_data = train_data[::-1]

print(train_data[:3])

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]


In [7]:
# Parse the test CSV into a list of dicts. The context manager closes the
# file handle as soon as parsing is finished; newline='' is the mode the csv
# module expects for files it reads.
with open(test_data_filename, newline='') as csvfile_test:
    test_data = csv2dicts(csv.reader(csvfile_test, delimiter=','))

# Reverse the file's row order. Anything derived from test_data (features,
# predictions) is therefore in reversed file order as well.
test_data = test_data[::-1]

print(test_data[:3])

['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Id': '41088', 'Store': '1115', 'DayOfWeek': '6', 'Date': '2015-08-01', 'Open': '1', 'Promo': '0', 'StateHoliday': '0', 'SchoolHoliday': '1'}, {'Id': '41087', 'Store': '1114', 'DayOfWeek': '6', 'Date': '2015-08-01', 'Open': '1', 'Promo': '0', 'StateHoliday': '0', 'SchoolHoliday': '0'}, {'Id': '41086', 'Store': '1113', 'DayOfWeek': '6', 'Date': '2015-08-01', 'Open': '1', 'Promo': '0', 'StateHoliday': '0', 'SchoolHoliday': '0'}]


In [8]:
# Load the per-store metadata and the store -> state table. Each file is
# closed as soon as it has been parsed.
with open(store_data_filename, newline='') as csvfile:
    store_data = csv2dicts(csv.reader(csvfile, delimiter=','))
with open(store_states_filename, newline='') as csvfile2:
    store_states_data = csv2dicts(csv.reader(csvfile2, delimiter=','))

# Empty CSV fields in the store table become the string '0'.
set_nan_as_string(store_data)

# Attach each store's state by matching on the 'Store' id instead of relying
# on both files listing the stores in the same row order. A store that is
# missing from the state table raises KeyError.
state_by_store = {row['Store']: row['State'] for row in store_states_data}
for val in store_data:
    val['State'] = state_by_store[val['Store']]

print(store_data[:2])

['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}]


# Prepare features

In [9]:
random.seed(42)


def feature_list(record):
    """Build the feature row for one train/test record.

    Args:
        record: dict of strings with at least 'Date' (``%Y-%m-%d``), 'Store',
            'DayOfWeek' and 'Promo'; 'Open' is optional.

    Returns:
        ``[open, store, day_of_week, promo, year, month, day, state]`` where
        every entry is an int except ``state``, which is the 'State' string
        looked up in the module-level ``store_data``.

    Raises:
        ValueError: if 'Date', 'Store', 'DayOfWeek' or 'Promo' cannot be parsed.
        KeyError: if one of those required fields is missing.
        IndexError: if the store number is outside ``store_data``.
    """
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])

    try:
        store_open = int(record['Open'])
    except (KeyError, TypeError, ValueError):
        # A missing or non-numeric 'Open' value is treated as "open". Only
        # these conversion errors are caught, so interrupts and unrelated
        # failures still propagate.
        store_open = 1

    promo = int(record['Promo'])

    # Positional lookup: row (store number - 1) of store_data.
    # NOTE(review): assumes store_data is ordered by Store id starting at 1
    # with no gaps — confirm against store.csv.
    state = store_data[store_index - 1]['State']

    return [store_open,
            store_index,
            day_of_week,
            promo,
            dt.year,
            dt.month,
            dt.day,
            state,
            ]

In [10]:
# Keep only rows with non-zero sales and a non-empty 'Open' field, then build
# the feature rows and the integer sales targets in matching order.
kept_records = [rec for rec in train_data
                if not (rec['Sales'] == '0' or rec['Open'] == '')]
train_data_X = [feature_list(rec) for rec in kept_records]
train_data_y = [int(rec['Sales']) for rec in kept_records]

print("Number of train datapoints: ", len(train_data_y))
print(min(train_data_y), max(train_data_y))

Number of train datapoints:  844338
46 41551


In [11]:
# Preview the first ten training feature rows.
train_data_X[:10]

[[1, 1097, 2, 0, 2013, 1, 1, 'RP'],
 [1, 948, 2, 0, 2013, 1, 1, 'BW'],
 [1, 769, 2, 0, 2013, 1, 1, 'NW'],
 [1, 733, 2, 0, 2013, 1, 1, 'NW'],
 [1, 682, 2, 0, 2013, 1, 1, 'BE'],
 [1, 676, 2, 0, 2013, 1, 1, 'HE'],
 [1, 562, 2, 0, 2013, 1, 1, 'HB,NI'],
 [1, 530, 2, 0, 2013, 1, 1, 'SH'],
 [1, 512, 2, 0, 2013, 1, 1, 'BY'],
 [1, 494, 2, 0, 2013, 1, 1, 'BE']]

In [12]:
# One feature row per test record, in test_data order; no record is filtered out.
test_data_X = [feature_list(test_record) for test_record in test_data]

In [13]:
# Preview the first ten test feature rows.
test_data_X[:10]

[[1, 1115, 6, 0, 2015, 8, 1, 'HE'],
 [1, 1114, 6, 0, 2015, 8, 1, 'HH'],
 [1, 1113, 6, 0, 2015, 8, 1, 'SH'],
 [1, 1112, 6, 0, 2015, 8, 1, 'NW'],
 [1, 1111, 6, 0, 2015, 8, 1, 'NW'],
 [1, 1109, 6, 0, 2015, 8, 1, 'BY'],
 [1, 1107, 6, 0, 2015, 8, 1, 'BY'],
 [1, 1106, 6, 0, 2015, 8, 1, 'SH'],
 [1, 1105, 6, 0, 2015, 8, 1, 'NW'],
 [1, 1104, 6, 0, 2015, 8, 1, 'BY']]

In [14]:
# Two independent arrays built from the same feature rows: `full_X` is what
# the label encoders are fitted on, while `train_data_X` is encoded in place
# by a later cell. The rows mix ints with state strings, so NumPy stores the
# values as strings (column 0 is displayed with dtype '<U11' below).
full_X = np.array(train_data_X)
train_data_X = np.array(train_data_X)
full_X.shape

(844338, 8)

In [15]:
# Convert the test feature rows to an array. The rows mix ints and strings
# exactly like the training rows, so the values are stored as strings.
test_data_X = np.array(test_data_X)
test_data_X.shape

(41088, 8)

In [16]:
# Column 0 holds the 'Open' flag; display it to check the array dtype.
train_data_X[:, 0]

array(['1', '1', '1', ..., '1', '1', '1'], dtype='<U11')

In [17]:
# Fit one LabelEncoder per feature column on the training values (full_X)
# and overwrite each column with its integer codes. The fitted encoders are
# kept in `les`, in column order.
les = []
for i in range(train_data_X.shape[1]):
    le = preprocessing.LabelEncoder()
    le.fit(full_X[:, i])
    les.append(le)
    # Written back into the same array, which is still string-typed here; the
    # cast to int happens in a later cell.
    train_data_X[:, i] = le.transform(train_data_X[:, i])
    # Column 0 of the test matrix is left unencoded; a later cell compares it
    # with the string '0'.
    # NOTE(review): presumably skipped because the encoder, fitted on
    # training rows only, would reject an unseen 'Open' value — confirm.
    if i > 0:
        test_data_X[:, i] = le.transform(test_data_X[:, i])

In [18]:
# Shape check after label encoding.
train_data_X.shape

(844338, 8)

In [19]:
# Cast the encoded training features to ints and turn the targets into an
# array. test_data_X is not cast here and stays string-typed.
train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)
# NOTE(review): leftover note "les y (train_data_X, train_data_y)" —
# presumably these objects were once saved at this point; confirm or delete.

In [20]:
# Show the first encoded feature row next to its sales target.
print(train_data_X[0], train_data_y[0])

[  0 109   1   0   0   0   0   7] 5961


In [21]:
# Seed NumPy's global RNG, which the shuffle and sample() steps below draw from.
np.random.seed(123)

# Raise Python's recursion limit.
# NOTE(review): presumably needed by code in models.py — confirm.
sys.setrecursionlimit(10000)

# Fraction of rows (taken from the front, in current row order) used for training.
train_ratio = 0.9
# Switches read by the next cell.
shuffle_data = False
one_hot_as_input = False
embeddings_as_input = False
# When True, the final cell pickles the learned embedding weights.
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

(X, y) = train_data_X, train_data_y

num_records = len(X)
train_size = int(train_ratio * num_records)

In [22]:
if shuffle_data:
    print("Using shuffled data")
    # Permute rows of X and y with the same index order. Uses the `np` alias,
    # the only name this notebook imports NumPy under.
    sh = np.arange(X.shape[0])
    np.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

if embeddings_as_input:
    print("Using learned embeddings as input")
    # NOTE(review): embed_features is not defined in this notebook;
    # presumably it comes from `from models import *`, which sits in a later
    # cell — confirm that import has run before enabling this flag.
    X = embed_features(X, saved_embeddings_fname)

if one_hot_as_input:
    print("Using one-hot encoding as input")
    # Use the encoder's default sparse output and densify it explicitly. This
    # avoids the `sparse=` keyword, which newer scikit-learn releases renamed
    # to `sparse_output`, so the cell runs on old and new versions alike.
    enc = OneHotEncoder()
    X = enc.fit_transform(X).toarray()

# Positional split: the first train_size rows train, the remainder validates.
X_train = X[:train_size]
X_val = X[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

print(X_train.shape, X_val.shape)

(759904, 8) (84434, 8)


In [23]:
def sample(X, y, n):
    """Draw ``n`` rows of ``X`` and the matching entries of ``y``.

    Row indices come from ``np.random.randint`` (NumPy's global RNG), so the
    same row can be picked more than once.

    Args:
        X: 2-D array of features.
        y: array of targets, indexed the same way as the rows of ``X``.
        n: number of rows to draw.

    Returns:
        Tuple ``(X_rows, y_rows)`` selected with the same indices.
    """
    picked = np.random.randint(X.shape[0], size=n)
    X_rows = X[picked, :]
    y_rows = y[picked]
    return X_rows, y_rows


# Simulate data sparsity: replace the training split with 200000 rows drawn
# by sample(). Re-running this cell draws from the already reduced split.
X_train, y_train = sample(X_train, y_train, 200000)
print(f"Number of samples used for training: {y_train.shape[0]}")

Number of samples used for training: 200000


In [24]:
# Shape check after sub-sampling.
X_train.shape

(200000, 8)

In [30]:
from models import *

In [28]:
models = []

print("Fitting NN_with_EntityEmbedding...")
# Build five NN_with_EntityEmbedding instances from identical inputs and
# collect them in `models`, one at a time.
models.extend(
    NN_with_EntityEmbedding(X_train, y_train, X_val, y_val)
    for _ in range(5)
)

Fitting NN_with_EntityEmbedding...


NameError: name 'NN_with_EntityEmbedding' is not defined

In [None]:
# Shape of the test features without column 0 (the 'Open' flag).
test_data_X[:, 1:].shape

In [None]:
# test_data_X is still a string-typed array (only the training matrix was
# cast with astype(int)), so cast the encoded columns to int before handing
# them to the model. The cast works on a copy: test_data_X itself keeps its
# string values, which the later `== '0'` mask on column 0 relies on.
# Column 0 is dropped and the remaining 7 columns are split into 7 inputs.
predictions_norm = models[0].model.predict(
    np.hsplit(test_data_X[:, 1:].astype(int), 7)
)

In [None]:
# Undo the target scaling: multiply by models[0].max_log_y, then exponentiate.
# NOTE(review): presumably the network is trained on log(sales) / max_log_y —
# confirm in models.py.
y_pred_test = np.exp(predictions_norm * models[0].max_log_y)

In [None]:
# Force the prediction to 0 for rows whose 'Open' flag (column 0) is '0'.
# The comparison is against a string because test_data_X is string-typed and
# column 0 was never label-encoded.
y_pred_test[(test_data_X[:, 0] == '0')] = 0

In [None]:
import pandas as pd
# Load the submission template.
# NOTE(review): this path starts with 'dataset/' while the other input files
# are read from 'rossmann/...' — confirm which directory layout is correct.
sample_csv = pd.read_csv('dataset/rossmann/sample_submission.csv')

In [None]:
# test_data was reversed after loading, so y_pred_test is in reversed file
# order. Align predictions with the submission rows by Id instead of by
# position, so each Id receives its own prediction.
# NOTE(review): assumes sample_csv has an integer 'Id' column matching the
# 'Id' field of test.csv — confirm against the template file.
test_ids = [int(rec['Id']) for rec in test_data]
sales_by_id = pd.Series(np.ravel(y_pred_test), index=test_ids)
sample_csv['Sales'] = sample_csv['Id'].map(sales_by_id)

In [None]:
# Write the submission without the DataFrame index column.
sample_csv.to_csv('submision_original.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sample_csv.head()

In [None]:
# print("Fitting NN...")
# for i in range(5):
#     models.append(NN(X_train, y_train, X_val, y_val))

# print("Fitting RF...")
# models.append(RF(X_train, y_train, X_val, y_val))

# print("Fitting KNN...")
# models.append(KNN(X_train, y_train, X_val, y_val))

# print("Fitting XGBoost...")
# models.append(XGBoost(X_train, y_train, X_val, y_val))


if save_embeddings:
    model = models[0].model
    # Layers whose first weight array is exported, in the order in which the
    # arrays are written to the pickle.
    embedding_layers = ('store_embedding', 'dow_embedding', 'year_embedding',
                        'month_embedding', 'day_embedding', 'state_embedding')
    (store_embedding, dow_embedding, year_embedding,
     month_embedding, day_embedding, german_states_embedding) = [
        model.get_layer(layer_name).get_weights()[0]
        for layer_name in embedding_layers
    ]
    exported = [store_embedding, dow_embedding, year_embedding,
                month_embedding, day_embedding, german_states_embedding]
    # Protocol -1 selects the highest pickle protocol available.
    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump(exported, f, -1)


def evaluate_models(models, X, y):
    """Mean absolute relative error of the averaged model predictions.

    Each model's ``guess(X)`` output is averaged element-wise across models,
    and the result is compared with ``y``.

    Args:
        models: iterable of objects exposing ``guess(X)``; each call must
            return an array with one prediction per entry of ``y``.
        X: features, passed unchanged to every ``guess`` call.
        y: array of true targets. All values must be strictly positive
            because they are used as the divisor.

    Returns:
        ``sum(|y - mean_prediction| / y) / len(y)``.

    Raises:
        AssertionError: if the smallest target is not greater than zero.
    """
    assert min(y) > 0, "targets must be strictly positive"
    # Uses the `np` alias, the only name this notebook imports NumPy under.
    guessed_sales = np.array([model.guess(X) for model in models])
    mean_sales = guessed_sales.mean(axis=0)
    relative_err = np.absolute((y - mean_sales) / y)
    return np.sum(relative_err) / len(y)


# Report the ensemble's error from evaluate_models on both splits.
print("Evaluate combined models...")
print("Training error...")
# X_train / y_train are the sub-sampled training rows at this point.
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

In [1]:
# Display the validation features.
# NOTE(review): the recorded run of this cell has execution count 1 and
# failed with NameError, i.e. it ran before the cells that define X_val —
# re-run the notebook top to bottom or delete this cell.
X_val

NameError: name 'X_val' is not defined