In [2]:
from collections import Counter
import os

import numpy as np
import pandas as pd

In [3]:
def pad_1d(array, max_len):
    """Truncate or right-pad a sequence with zeros to ``max_len`` items.

    Args:
        array: Any iterable of values (list, tuple, ``map`` object, ...).
        max_len: Target length of the returned list (expected non-negative).

    Returns:
        A ``(padded, length)`` tuple. ``padded`` is a new list holding the
        first ``max_len`` input items followed by as many ``0`` entries as
        needed to reach ``max_len``. ``length`` is the number of input items
        that were kept.
    """
    # Materialise first: iterators such as Python 3 ``map`` objects support
    # neither slicing nor ``len``.
    items = list(array)[:max_len]
    length = len(items)
    padded = items + [0] * (max_len - length)
    return padded, length


def make_word_idx(product_names, min_count=10):
    """Build a word -> integer id vocabulary from product names.

    Names are split on whitespace. Every word seen fewer than ``min_count``
    times across all names is mapped to id 0; every other word receives its
    own id, assigned consecutively from 1 in the order the counter yields
    the words.

    Args:
        product_names: Iterable of product name strings.
        min_count: Minimum number of occurrences a word needs to get its own
            id. Defaults to 10.

    Returns:
        Dict mapping each distinct word to its integer id.
    """
    word_counts = Counter(
        word for name in product_names for word in name.split()
    )

    word_idx = {}
    next_id = 1
    for word, count in word_counts.items():
        if count < min_count:
            # Rare words all share the id 0.
            word_idx[word] = 0
        else:
            word_idx[word] = next_id
            next_id += 1

    return word_idx


def encode_text(text, word_idx):
    """Encode a text as a space-separated string of word ids.

    Args:
        text: String to encode; it is split on whitespace.
        word_idx: Dict mapping words to integer ids.

    Returns:
        ``'0'`` when ``text`` is empty or otherwise falsy; else the ids of
        the words in ``text``, in order, joined by single spaces. Words that
        are missing from ``word_idx`` are encoded as ``0`` instead of raising
        ``KeyError``.
    """
    if not text:
        return '0'
    # .get(..., 0): out-of-vocabulary words fall back to id 0.
    return ' '.join(str(word_idx.get(word, 0)) for word in text.split())

First, we merge the raw order and product tables and aggregate them into one row per user, which is written to `user_data.csv`.

In [4]:
def parse_order(x):
    """Collapse the rows of one order into a single summary Series.

    Args:
        x: DataFrame holding the rows of one order, with the columns
            ``product_id``, ``reordered``, ``aisle_id``, ``department_id``,
            ``order_number``, ``order_dow``, ``order_hour_of_day`` and
            ``days_since_prior_order``.

    Returns:
        Series with ``products``, ``reorders``, ``aisles`` and
        ``departments`` (the column values of all rows, joined by ``'_'``)
        followed by ``order_number``, ``order_dow``, ``order_hour`` and
        ``days_since_prior_order`` (taken from the first row).
    """
    def underscore_join(column):
        # String form of every value in the column, in row order.
        return '_'.join(x[column].values.astype(str).tolist())

    joined_fields = (
        ('products', 'product_id'),
        ('reorders', 'reordered'),
        ('aisles', 'aisle_id'),
        ('departments', 'department_id'),
    )
    first_row_fields = (
        ('order_number', 'order_number'),
        ('order_dow', 'order_dow'),
        ('order_hour', 'order_hour_of_day'),
        ('days_since_prior_order', 'days_since_prior_order'),
    )

    series = pd.Series(dtype='float64')
    for key, column in joined_fields:
        series[key] = underscore_join(column)
    for key, column in first_row_fields:
        series[key] = x[column].iloc[0]

    return series


def parse_user(x):
    """Collapse all rows of one user into a single summary Series.

    The rows are grouped by ``order_id`` (keeping first-seen order) and each
    group is summarised with ``parse_order``. Every output field except
    ``eval_set`` is a string with one space-separated token per order.

    Args:
        x: DataFrame holding every order row of one user; must provide
            ``order_id``, ``eval_set`` and the columns ``parse_order`` reads.

    Returns:
        Series with ``order_ids``, ``order_numbers``, ``order_dows``,
        ``order_hours``, ``days_since_prior_orders``, ``product_ids``,
        ``aisle_ids``, ``department_ids``, ``reorders`` and ``eval_set``
        (the ``eval_set`` value of the last row of ``x``).
    """
    per_order = x.groupby('order_id', sort=False).apply(parse_order)

    # Per-order scalar fields, stringified value by value.
    scalar_fields = (
        ('order_numbers', 'order_number'),
        ('order_dows', 'order_dow'),
        ('order_hours', 'order_hour'),
        ('days_since_prior_orders', 'days_since_prior_order'),
    )
    # Per-order '_'-joined fields produced by parse_order.
    joined_fields = (
        ('product_ids', 'products'),
        ('aisle_ids', 'aisles'),
        ('department_ids', 'departments'),
        ('reorders', 'reorders'),
    )

    series = pd.Series(dtype='float64')
    series['order_ids'] = ' '.join(per_order.index.map(str).tolist())
    for key, column in scalar_fields:
        series[key] = ' '.join(per_order[column].map(str).tolist())
    for key, column in joined_fields:
        series[key] = ' '.join(per_order[column].values.astype(str).tolist())

    series['eval_set'] = x['eval_set'].values[-1]

    return series

# --- Load the raw tables -----------------------------------------------------
orders = pd.read_csv('../data/raw/orders.csv')
products = pd.read_csv('../data/raw/products.csv')
prior_products = pd.read_csv('../data/raw/order_products__prior.csv')
train_products = pd.read_csv('../data/raw/order_products__train.csv')

# Prior and train order lines share one schema, so stack them row-wise.
order_products = pd.concat([prior_products, train_products], axis=0)

# --- One row per (order, product), with product metadata attached ------------
# Left joins keep orders that have no matching order lines; their product
# columns come out as NaN and are zero-filled below.
df = (
    orders
    .merge(order_products, how='left', on='order_id')
    .merge(products, how='left', on='product_id')
)

null_cols = ['product_id', 'aisle_id', 'department_id', 'add_to_cart_order', 'reordered']
df[null_cols] = df[null_cols].fillna(0).astype(int)
df['days_since_prior_order'] = df['days_since_prior_order'].fillna(0).astype(int)

# --- Aggregate per user and persist ------------------------------------------
if not os.path.isdir('../data/processed'):
    os.makedirs('../data/processed')

user_data = df.groupby('user_id', sort=False).apply(parse_user).reset_index()
user_data.to_csv('../data/processed/user_data.csv', index=False)

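Then we combine `user_data.csv` with `products.csv` to create `product_data.csv`: one row per (user, previously ordered product) pair, plus one extra "none" row per user with product id 0.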

In [None]:

    # Build product_data.csv from user_data.csv. For every user, one row is
    # emitted per product found in the user's earlier orders, plus one row
    # with product id 0 that describes orders containing no reordered items.
    df = pd.read_csv('../data/processed/user_data.csv')

    products = pd.read_csv('../data/raw/products.csv')
    product_to_aisle = dict(zip(products['product_id'], products['aisle_id']))
    product_to_department = dict(zip(products['product_id'], products['department_id']))
    product_to_name = dict(zip(products['product_id'], products['product_name']))

    user_ids = []
    product_ids = []
    aisle_ids = []
    department_ids = []
    product_names = []
    eval_sets = []

    is_ordered_histories = []
    index_in_order_histories = []
    order_size_histories = []
    reorder_size_histories = []
    order_dow_histories = []
    order_hour_histories = []
    days_since_prior_order_histories = []
    order_number_histories = []

    labels = []

    longest = 0  # not used anywhere in this cell
    for row_number, row in df.iterrows():
        # Lightweight progress indicator for the long-running loop.
        if row_number % 10000 == 0:
            print(row_number)

        user_id = row['user_id']
        eval_set = row['eval_set']

        # 'product_ids' and 'reorders' hold one space-separated token per
        # order; inside a token the items of that order are joined by '_'.
        # The last order is the prediction target, the rest is history.
        order_tokens = row['product_ids'].split()
        reorder_tokens = row['reorders'].split()

        # Real lists (not lazy map objects) because they are searched,
        # measured and iterated several times below.
        prior_orders = [[int(p) for p in token.split('_')] for token in order_tokens[:-1]]
        prior_reorders = [[int(r) for r in token.split('_')] for token in reorder_tokens[:-1]]
        next_product_set = set([int(p) for p in order_tokens[-1].split('_')])
        next_reorders = [int(r) for r in reorder_tokens[-1].split('_')]

        product_set = set([p for order in prior_orders for p in order])

        # Order sizes and the count of items already seen in earlier orders
        # depend only on the user, not on the product, so compute them once
        # per user instead of once per product.
        order_size = ' '.join([str(len(order)) for order in prior_orders])
        seen_products = set()
        reorder_counts = []
        for order in prior_orders:
            order_items = set(order)
            reorder_counts.append(str(len(seen_products & order_items)))
            seen_products |= order_items
        reorder_size = ' '.join(reorder_counts)

        for product_id in product_set:

            user_ids.append(user_id)
            product_ids.append(product_id)
            # Labels are only known for 'train' users; everything else is -1.
            labels.append(int(product_id in next_product_set) if eval_set == 'train' else -1)

            aisle_ids.append(product_to_aisle[product_id])
            department_ids.append(product_to_department[product_id])
            product_names.append(product_to_name[product_id])
            eval_sets.append(eval_set)

            is_ordered = []
            index_in_order = []
            for order in prior_orders:
                if product_id in order:
                    is_ordered.append('1')
                    # 1-based position in the order; 0 means "not in order".
                    index_in_order.append(str(order.index(product_id) + 1))
                else:
                    is_ordered.append('0')
                    index_in_order.append('0')

            is_ordered_histories.append(' '.join(is_ordered))
            index_in_order_histories.append(' '.join(index_in_order))
            order_size_histories.append(order_size)
            reorder_size_histories.append(reorder_size)
            order_dow_histories.append(row['order_dows'])
            order_hour_histories.append(row['order_hours'])
            days_since_prior_order_histories.append(row['days_since_prior_orders'])
            order_number_histories.append(row['order_numbers'])

        # Extra "none" row (product id 0): its flag is 1 for orders in which
        # no item was marked as reordered.
        user_ids.append(user_id)
        product_ids.append(0)
        labels.append(int(max(next_reorders) == 0) if eval_set == 'train' else -1)

        aisle_ids.append(0)
        department_ids.append(0)
        product_names.append(0)
        eval_sets.append(eval_set)

        is_ordered_histories.append(
            ' '.join([str(int(max(reorder) == 0)) for reorder in prior_reorders]))
        index_in_order_histories.append(
            ' '.join(['0' for reorder in prior_reorders]))
        order_size_histories.append(
            ' '.join([str(len(reorder)) for reorder in prior_reorders]))
        reorder_size_histories.append(
            ' '.join([str(sum(reorder)) for reorder in prior_reorders]))
        order_dow_histories.append(row['order_dows'])
        order_hour_histories.append(row['order_hours'])
        days_since_prior_order_histories.append(row['days_since_prior_orders'])
        order_number_histories.append(row['order_numbers'])

    data = [
        user_ids,
        product_ids,
        aisle_ids,
        department_ids,
        product_names,
        is_ordered_histories,
        index_in_order_histories,
        order_size_histories,
        reorder_size_histories,
        order_dow_histories,
        order_hour_histories,
        days_since_prior_order_histories,
        order_number_histories,
        labels,
        eval_sets
    ]
    columns = [
        'user_id',
        'product_id',
        'aisle_id',
        'department_id',
        'product_name',
        'is_ordered_history',
        'index_in_order_history',
        'order_size_history',
        'reorder_size_history',
        'order_dow_history',
        'order_hour_history',
        'days_since_prior_order_history',
        'order_number_history',
        'label',
        'eval_set'
    ]
    if not os.path.isdir('../data/processed'):
        os.makedirs('../data/processed')

    df = pd.DataFrame(dict(zip(columns, data)))
    df.to_csv('../data/processed/product_data.csv', index=False)

In [None]:
# Convert product_data.csv into fixed-width numpy arrays, one .npy file per
# feature. Variable-length histories are truncated / zero-padded by pad_1d.

# Padded widths of the 2-D arrays.
MAX_HISTORY_LEN = 100
MAX_NAME_LEN = 30


def _split_ints(text):
    """Parse a whitespace-separated string of integers into a list of ints."""
    return [int(token) for token in text.split()]


# product_data.csv is written to ../data/processed (not ../data/raw).
product_data = pd.read_csv('../data/processed/product_data.csv')
product_data['product_name'] = product_data['product_name'].map(lambda x: x.lower())

product_df = pd.read_csv('../data/raw/products.csv')
product_df['product_name'] = product_df['product_name'].map(lambda x: x.lower())

word_idx = make_word_idx(product_df['product_name'].tolist())
product_data['product_name_encoded'] = product_data['product_name'].map(lambda x: encode_text(x, word_idx))

num_rows = len(product_data)

user_id = np.zeros(shape=[num_rows], dtype=np.int32)
product_id = np.zeros(shape=[num_rows], dtype=np.int32)
aisle_id = np.zeros(shape=[num_rows], dtype=np.int16)
department_id = np.zeros(shape=[num_rows], dtype=np.int8)
eval_set = np.zeros(shape=[num_rows], dtype='S5')
label = np.zeros(shape=[num_rows], dtype=np.int8)

# NOTE(review): int8 holds at most 127. Order sizes or in-order positions
# above that would not fit -- confirm against the data.
is_ordered_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
index_in_order_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
order_dow_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
order_hour_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
days_since_prior_order_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
order_size_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
reorder_size_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
order_number_history = np.zeros(shape=[num_rows, MAX_HISTORY_LEN], dtype=np.int8)
product_name = np.zeros(shape=[num_rows, MAX_NAME_LEN], dtype=np.int32)
product_name_length = np.zeros(shape=[num_rows], dtype=np.int8)
history_length = np.zeros(shape=[num_rows], dtype=np.int8)

for i, row in product_data.iterrows():
    # Lightweight progress indicator for the long-running loop.
    if i % 10000 == 0:
        print('{} {}'.format(i, num_rows))

    user_id[i] = row['user_id']
    product_id[i] = row['product_id']
    aisle_id[i] = row['aisle_id']
    department_id[i] = row['department_id']
    eval_set[i] = row['eval_set']
    label[i] = row['label']

    # history_length is taken from is_ordered_history; the unpadded lengths of
    # the other histories are discarded.
    is_ordered_history[i, :], history_length[i] = pad_1d(_split_ints(row['is_ordered_history']), MAX_HISTORY_LEN)
    index_in_order_history[i, :], _ = pad_1d(_split_ints(row['index_in_order_history']), MAX_HISTORY_LEN)
    order_dow_history[i, :], _ = pad_1d(_split_ints(row['order_dow_history']), MAX_HISTORY_LEN)
    order_hour_history[i, :], _ = pad_1d(_split_ints(row['order_hour_history']), MAX_HISTORY_LEN)
    days_since_prior_order_history[i, :], _ = pad_1d(_split_ints(row['days_since_prior_order_history']), MAX_HISTORY_LEN)
    order_size_history[i, :], _ = pad_1d(_split_ints(row['order_size_history']), MAX_HISTORY_LEN)
    reorder_size_history[i, :], _ = pad_1d(_split_ints(row['reorder_size_history']), MAX_HISTORY_LEN)
    order_number_history[i, :], _ = pad_1d(_split_ints(row['order_number_history']), MAX_HISTORY_LEN)
    product_name[i, :], product_name_length[i] = pad_1d(_split_ints(row['product_name_encoded']), MAX_NAME_LEN)

# NOTE(review): this path is relative to the working directory, unlike the
# '../data/...' paths used for every other read and write -- confirm intended.
if not os.path.isdir('data/interim'):
    os.makedirs('data/interim')

np.save('data/interim/user_id.npy', user_id)
np.save('data/interim/product_id.npy', product_id)
np.save('data/interim/aisle_id.npy', aisle_id)
np.save('data/interim/department_id.npy', department_id)
np.save('data/interim/eval_set.npy', eval_set)
np.save('data/interim/label.npy', label)

np.save('data/interim/is_ordered_history.npy', is_ordered_history)
np.save('data/interim/index_in_order_history.npy', index_in_order_history)
np.save('data/interim/order_dow_history.npy', order_dow_history)
np.save('data/interim/order_hour_history.npy', order_hour_history)
np.save('data/interim/days_since_prior_order_history.npy', days_since_prior_order_history)
np.save('data/interim/order_size_history.npy', order_size_history)
np.save('data/interim/reorder_size_history.npy', reorder_size_history)
np.save('data/interim/order_number_history.npy', order_number_history)
np.save('data/interim/product_name.npy', product_name)
np.save('data/interim/product_name_length.npy', product_name_length)
np.save('data/interim/history_length.npy', history_length)