In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
import pickle

In [None]:
# Mount Google Drive; the CSV and pickle inputs below are read from paths under
# /content/drive, and the final archive is moved there.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Base table: one row per (user_id, product_id) pair with its label.
# Only these three columns are read from the processed CSV.
product_df = pd.read_csv(
    '/content/drive/MyDrive/recsys_data/csv/processed/product_data.csv',
    usecols=['user_id', 'product_id', 'label'],
)

# Attach every column of the raw products table by product_id.
# Presumably this is where product_name / aisle_id / department_id come from - confirm.
products = pd.read_csv('/content/drive/MyDrive/recsys_data/csv/raw/products.csv')
product_df = pd.merge(product_df, products, how='left', on='product_id')

# Keep only the orders whose eval_set is 'train' or 'test', then attach their
# order_id to each row by user_id.
orders = pd.read_csv('/content/drive/MyDrive/recsys_data/csv/raw/orders.csv')
in_eval_set = orders['eval_set'].isin({'train', 'test'})
orders = orders[in_eval_set]
order_keys = orders[['user_id', 'order_id']]
product_df = pd.merge(product_df, order_keys, how='left', on='user_id').reset_index(drop=True)

# Indicator feature: 1 where product_id is 0, otherwise 0.
product_df['is_none'] = product_df['product_id'].eq(0).astype(int)


In [None]:
# Load the pickled mapping of product -> embedding vector.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('/content/drive/MyDrive/recsys_data/prod2vec_data/prod_to_vec.pkl', 'rb') as fh:
  pickled_prod2vec = pickle.load(fh)

# One row per dictionary key; reset_index moves the keys into the first column.
df = pd.DataFrame.from_dict(pickled_prod2vec, orient='index').reset_index()

# Name all columns positionally: the key column first, then one column per
# embedding component.
# NOTE(review): the merge below needs the keys to have the same dtype as
# product_df['product_id'] - confirm against the pickle's contents.
n_embedding_cols = df.shape[1] - 1
embedding_cols = [f'prod2vec_embedding_{i}' for i in range(n_embedding_cols)]
df.columns = ['product_id'] + embedding_cols

# Left join: rows whose product_id has no embedding get NaN in the new columns.
product_df = pd.merge(product_df, df, how='left', on='product_id')


In [None]:
# DMF product embeddings: each row of the matrix is treated as one product and
# each column becomes a dmf_product_<k> feature.
dmf_p_matrix = np.load('dmf/product_embeddings.npy')
n_product_dims = dmf_p_matrix.shape[1]
product_emb_cols = ['dmf_product_{}'.format(dim) for dim in range(n_product_dims)]
product_emb_df = pd.DataFrame(dmf_p_matrix, columns=product_emb_cols)

# Rows are labelled positionally with product ids 1..49688. The assignment
# raises if the matrix does not have exactly that many rows.
# NOTE(review): assumes row k holds the embedding of product_id k + 1 - confirm
# against the DMF training code.
product_emb_df['product_id'] = np.arange(1, 49689)

product_df = pd.merge(product_df, product_emb_df, how='left', on='product_id')

In [None]:
# DMF user embeddings: each row of the matrix is treated as one user and each
# column becomes a dmf_user_<k> feature.
dmf_u_matrix = np.load('dmf/user_embeddings.npy')
n_user_dims = dmf_u_matrix.shape[1]
user_emb_cols = ['dmf_user_{}'.format(dim) for dim in range(n_user_dims)]
user_emb_df = pd.DataFrame(dmf_u_matrix, columns=user_emb_cols)

# Rows are labelled positionally with user ids 1..206209. The assignment
# raises if the matrix does not have exactly that many rows.
# NOTE(review): assumes row k holds the embedding of user_id k + 1 - confirm
# against the DMF training code.
user_emb_df['user_id'] = np.arange(1, 206210)

product_df = pd.merge(product_df, user_emb_df, how='left', on='user_id')

In [None]:
# Product-level RNN features, joined onto product_df by (user_id, product_id).
prefix = 'rnn_product'
user_ids = np.load('rnn_product/data/user_id.npy')
print(len(user_ids))
final_states = np.load('rnn_product/final_states.npy')
print(len(final_states))

# Reuse the array loaded above instead of reading the file a second time, and
# truncate it to len(user_ids) *before* building the frame. Slicing the frame
# afterwards and then assigning columns to the slice is the pattern that
# triggers pandas' SettingWithCopyWarning; this also matches the other RNN cells.
# Columns are named rnn_product_h0, rnn_product_h1, ...
h_df = pd.DataFrame(final_states[:len(user_ids)]).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = user_ids
h_df['product_id'] = np.load('rnn_product/data/product_id.npy')
h_df['{}_prediction'.format(prefix)] = np.load('rnn_product/final_predictions.npy')[:len(user_ids)]

# Left join: (user_id, product_id) pairs absent from h_df get NaN in the new columns.
product_df = product_df.merge(h_df, how='left', on=['user_id', 'product_id'])

In [None]:
# Aisle-level RNN features, joined onto product_df by (user_id, aisle_id).
prefix = 'rnn_aisle'
user_ids = np.load('rnn_aisle/data/user_id.npy')
n_rows = len(user_ids)

# Final states truncated to the number of user ids; columns are named
# rnn_aisle_h0, rnn_aisle_h1, ...
aisle_states = np.load('rnn_aisle/final_states.npy')[:n_rows]
h_df = pd.DataFrame(aisle_states).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = user_ids
h_df['aisle_id'] = np.load('rnn_aisle/data/aisle_id.npy')
h_df['{}_prediction'.format(prefix)] = np.load('rnn_aisle/final_predictions.npy')[:n_rows]

# Left join, then replace every NaN in the whole merged frame with -1. This
# also fills NaNs left in product_df by earlier merges, not only the new columns.
product_df = pd.merge(product_df, h_df, how='left', on=['user_id', 'aisle_id'])
product_df = product_df.fillna(-1)

In [None]:
# Department-level RNN features, joined onto product_df by (user_id, department_id).
prefix = 'rnn_department'
user_ids = np.load('rnn_department/data/user_id.npy')
n_rows = len(user_ids)

# Final states truncated to the number of user ids; columns are named
# rnn_department_h0, rnn_department_h1, ...
department_states = np.load('rnn_department/final_states.npy')[:n_rows]
h_df = pd.DataFrame(department_states).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = user_ids
h_df['department_id'] = np.load('rnn_department/data/department_id.npy')
h_df['{}_prediction'.format(prefix)] = np.load('rnn_department/final_predictions.npy')[:n_rows]

# Left join, then replace every NaN in the whole merged frame with -1.
product_df = pd.merge(product_df, h_df, how='left', on=['user_id', 'department_id'])
product_df = product_df.fillna(-1)

In [None]:
# Order-level RNN features, joined onto product_df by user_id only.
prefix = 'rnn_order'
user_ids = np.load('rnn_order/data/user_id.npy')
n_rows = len(user_ids)

# Final states truncated to the number of user ids; columns are named
# rnn_order_h0, rnn_order_h1, ...
order_states = np.load('rnn_order/final_states.npy')[:n_rows]
h_df = pd.DataFrame(order_states).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = user_ids
h_df['{}_prediction'.format(prefix)] = np.load('rnn_order/final_predictions.npy')[:n_rows]

# Left join with no fillna afterwards: users missing from h_df keep NaN in the new columns.
product_df = pd.merge(product_df, h_df, how='left', on='user_id')

In [None]:
# GMM final states of the order-level RNN, joined onto product_df by user_id.
# The user ids are read from the same file as the rnn_order cell.
prefix = 'rnn_order_gmm'
user_ids = np.load('rnn_order/data/user_id.npy')

# States truncated to the number of user ids; columns are named
# rnn_order_gmm_h0, rnn_order_gmm_h1, ...
gmm_states = np.load('rnn_order/final_states_gmm.npy')[:len(user_ids)]
h_df = pd.DataFrame(gmm_states).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = user_ids

# Left join with no fillna afterwards: users missing from h_df keep NaN in the new columns.
product_df = pd.merge(product_df, h_df, how='left', on='user_id')

In [None]:
# Columns removed from the feature matrix: the label, the join keys and the
# raw product metadata.
drop_cols = ['label', 'user_id', 'product_id', 'order_id', 'product_name', 'aisle_id', 'department_id']

# Keep the keys and the label as separate Series before they are dropped.
user_id, product_id, order_id, label = (
    product_df[key] for key in ('user_id', 'product_id', 'order_id', 'label')
)

# From here on product_df holds feature columns only (later cells rely on that).
product_df = product_df.drop(columns=drop_cols)

features = product_df.values
feature_names = product_df.columns.values

# Per-column statistics over all rows.
# NOTE(review): any NaN still present in a column makes its max/min/mean NaN.
feature_maxs = np.max(features, axis=0)
feature_mins = np.min(features, axis=0)
feature_means = np.mean(features, axis=0)

In [None]:
# Write every prepared array to prepped_data/ as an individual .npy file.
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs('prepped_data', exist_ok=True)

# Row keys, aligned with the rows of features.npy.
np.save('prepped_data/user_id.npy', user_id)
np.save('prepped_data/product_id.npy', product_id)
np.save('prepped_data/order_id.npy', order_id)

# Feature matrix plus its column names and per-column statistics.
np.save('prepped_data/features.npy', features)
# feature_names was taken from the same frame as features, so save it rather
# than re-reading product_df.columns.
# NOTE(review): the names are likely stored as an object array, in which case
# np.load needs allow_pickle=True to read them back - confirm in the consumer.
np.save('prepped_data/feature_names.npy', feature_names)
np.save('prepped_data/feature_maxs.npy', feature_maxs)
np.save('prepped_data/feature_mins.npy', feature_mins)
np.save('prepped_data/feature_means.npy', feature_means)

# Target column.
np.save('prepped_data/label.npy', label)

In [None]:
# Shell command: recursively pack the prepped_data directory into Archive.zip
# in the current working directory.
!zip Archive.zip -r prepped_data

In [None]:
# Shell command: move the archive onto the mounted Drive.
# NOTE(review): assumes /content/drive/MyDrive/recsys_data/blend_data already
# exists as a directory; otherwise mv would rename the archive to that path.
!mv Archive.zip /content/drive/MyDrive/recsys_data/blend_data

In [None]:
# Final step, run after the archive has been moved to Drive.
# NOTE(review): runtime.unassign() presumably releases/terminates the current
# Colab runtime, so nothing after this cell would run - confirm in the Colab docs.
from google.colab import runtime
runtime.unassign()