In [None]:
import pandas as pd
import numpy as np
import os, sys
from pathlib import Path
import pickle
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.layers import Normalization, Discretization
from tensorflow.keras.layers import CategoryEncoding, Hashing, StringLookup, IntegerLookup

In [None]:
# Show up to 500 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 500

In [None]:
# Input directories, relative to the notebook's working directory.
# DATA_DIR holds the CSV files that are loaded below.
DATA_DIR = '../input/h-and-m-personalized-fashion-recommendations/'
# IMAGE_DIR / TEXT_DIR are only referenced by the commented-out feature-loading cells.
IMAGE_DIR = '../input/hm-image-features-w-resnet50/'
TEXT_DIR = '../input/hm-text-features-w-roberta/'

In [None]:
# Full paths of every CSV in DATA_DIR except the sample submission file.
csv_list = [
    os.path.join(DATA_DIR, entry)
    for entry in os.listdir(DATA_DIR)
    if entry.endswith('.csv') and entry != 'sample_submission.csv'
]

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns become int8/int16/int32 when both their min and max fit
    inside that type (limits inclusive). Float columns become float16/float32
    on the same range test, otherwise float64. Columns whose dtype is not in
    ``numerics`` (strings, int8, unsigned ints, ...) are left untouched.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: when True, print the final memory footprint and the
            percentage saved.

    Returns:
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits -> the column needs int64 and is left as it is.
        else:
            # NOTE: the test only looks at the value range; float16 keeps about
            # three significant decimal digits, so this cast can lose precision.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-NaN column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# reading all csv
# Each file is loaded, downcast with reduce_mem_usage, and stored under its
# base name (e.g. 'articles' for '.../articles.csv').
data = {}
for file_path in csv_list:
    # Path.stem drops the directory and the final extension, independent of the
    # path separator and of any dots elsewhere in the path.
    file_name = Path(file_path).stem.strip(" ")
    print(f"Reading {file_name}.csv")
    data[file_name] = reduce_mem_usage(pd.read_csv(file_path))

In [None]:
# Print the first two rows of every loaded table under a banner with its name.
for k, v in data.items():
    print(f"******** {k} ********")
    print(v.head(2))

In [None]:
# Print the shape and the column names of every loaded table.
for k, v in data.items():
    print(f"******** {k} ********")
    print(v.shape, v.columns.tolist())

In [None]:
def reduce_dimensionality(features, n_components, random_state=None):
    """Project per-row feature vectors onto ``n_components`` principal components.

    Args:
        features: object exposing ``.values.tolist()`` (e.g. a pandas Series)
            whose elements are numeric sequences. They are stacked into one
            2-D array, so they are assumed to share one length -- TODO confirm
            against the caller.
        n_components: forwarded to ``PCA``.
        random_state: forwarded to ``PCA`` so results can be reproduced when a
            randomized solver is used. The default ``None`` keeps the previous
            behaviour.

    Returns:
        A list with one reduced vector (itself a list) per input row.
    """
    feature_matrix = np.array(features.values.tolist())
    pca_ = PCA(n_components=n_components, random_state=random_state)
    return pca_.fit_transform(feature_matrix).tolist()

## Image features

In [None]:
# with open(os.path.join(IMAGE_DIR, 'image_df.pkl'), 'rb') as f:
#     image_df = pickle.load(f)
#     image_df['image_features'] = image_df['image_features'].apply(lambda x: x.tolist())
#     image_df.sample(4)
#     image_df['image_features'] = reduce_dimensionality(image_df['image_features'], 10)

## Text features

In [None]:
# with open(os.path.join(TEXT_DIR, 'text_df.pkl'), 'rb') as f:
#     text_df = pickle.load(f)
#     text_df['detail_desc_features'] = text_df['detail_desc_features'].apply(lambda x: x.tolist())
#     text_df.head(4)
#     text_df['detail_desc_features'] = reduce_dimensionality(text_df['detail_desc_features'], 10)

## Customer Feature Extraction Pipeline

`FN` indicates whether a customer receives the Fashion News newsletter, and `Active` indicates whether the customer is active for communication. For `sales_channel_id`, 2 is online and 1 is store.

Grouping postal codes based on sales and number of customers

In [None]:
# Preview the first rows of the customers table.
data['customers'].head()

In [None]:
# Number of missing values in each customers column.
data['customers'].isna().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the customers table.
data['customers'].info()

In [None]:
# Number of distinct values in each customers column.
data['customers'].nunique()

In [None]:
# Summary statistics of the customers table.
data['customers'].describe()

In [None]:
# Cast age to float32 and lower-case the two text columns so that their
# categories compare consistently.
data['customers']['age'] = data['customers']['age'].astype('float32')
data['customers']['club_member_status'] = data['customers']['club_member_status'].str.lower()
data['customers']['fashion_news_frequency'] = data['customers']['fashion_news_frequency'].str.lower()

# Fill value per column for missing entries. The string fills are lower-case to
# match the lower-cased columns above: an upper-case 'NONE' could never equal a
# value already in the column and would form a category of its own.
missing_value_impute_dict = {
    'FN': 0.0,
    'Active': 0.0,
    'club_member_status': 'not applicable',
    'fashion_news_frequency': 'none',
    'age': np.round(data['customers']['age'].mean())
}

# Assign the filled column back to the frame. The chained form
# `frame[col].loc[mask] = value` writes into a temporary Series, which triggers
# SettingWithCopyWarning and changes nothing when copy-on-write is enabled.
for col, impute_value in missing_value_impute_dict.items():
    data['customers'][col] = data['customers'][col].fillna(impute_value)

In [None]:
# Distinct values of every customers column except the two identifier-like ones.
{col:data['customers'][col].unique() for col in data['customers'] if col not in ['customer_id', 'postal_code']}

## Key observations
1. The transaction data is not at the correct level, so it needs to be aggregated to the `t_dat`, `article_id`, `customer_id`, `sales_channel_id`, `price` level, with a `qty` column created to keep the information that would otherwise be lost (28805603 rows vs 31788324 rows)
2. `article_id` and `product_code` seem to map n-to-1
3. Submission dataset has some customers which are not present in transaction file or customer file
`data['sample_submission']['customer_id'].nunique(), data['customers']['customer_id'].nunique(), data['transactions_train']['customer_id'].nunique()` --> 1371980, 1371980, 1362281
4. Breaking the data into 7-day rolling periods can be a good way to generate data (a lot of it)

## Transactions Feature Extraction Pipeline

In [None]:
# Fix NumPy's global random seed so stochastic steps are repeatable.
seeded_value = 8888
np.random.seed(seeded_value)

# Display settings: cap column width at 50 characters and
# suppress scientific notation (two decimals everywhere).
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
np.set_printoptions(suppress=True)

In [None]:
# (rows, columns) of the transactions table.
data['transactions_train'].shape

In [None]:
# Number of distinct values in each transactions column.
data['transactions_train'].nunique()

In [None]:
# Column dtypes, non-null counts and memory footprint of the transactions table.
data['transactions_train'].info()

In [None]:
# Summary of the transaction date column (not yet converted to datetime at this point).
data['transactions_train']['t_dat'].describe()

In [None]:
# Smallest and largest value of the transaction date column.
data['transactions_train']['t_dat'].min(), data['transactions_train']['t_dat'].max()

In [None]:
# Sorted list of the distinct transaction dates. It is built before t_dat is
# converted with pd.to_datetime below, so it keeps the values as read from the CSV.
dates_set = data['transactions_train']['t_dat'].sort_values().unique().tolist()

In [None]:
# Number of distinct transaction dates.
NUM_DATES = len(dates_set)
print(NUM_DATES)

In [None]:
# Parse t_dat (format YYYY-MM-DD) into datetimes, overwriting the column in place.
data['transactions_train']['t_dat'] = pd.to_datetime(data['transactions_train']['t_dat'], format = ("%Y-%m-%d"))

In [None]:
# First purchase date of every article. The aggregation is named by string:
# passing the builtin `min` to groupby.agg is deprecated in recent pandas.
m = data['transactions_train'].groupby(['article_id'], as_index=False)[['t_dat']].agg({'t_dat': 'min'})
# Number of articles first seen on each date, in chronological order.
m2 = m.groupby('t_dat', as_index=False)['article_id'].count().sort_values('t_dat', ascending=True)
# Cumulative share (in percent) of articles already introduced by each date.
m2['cmltve'] = 100 * m2['article_id'].cumsum() / m2['article_id'].sum()

In [None]:
# Plot the cumulative percentage of articles introduced against the date.
m2[['t_dat', 'cmltve']].plot(x='t_dat', y='cmltve', figsize=(10,6))

In [None]:
PERCENTILE_CUTOFF = 80 # percent 0 -- 100
# Earliest date on which the cumulative share of introduced articles exceeds the
# cutoff, as a 'YYYY-MM-DD' string. The threshold comes from PERCENTILE_CUTOFF
# so that changing the constant actually changes the result.
PERCENTILE_PRODUCT_COVERED_DATE = str(m2['t_dat'].loc[m2['cmltve'] > PERCENTILE_CUTOFF].min())[:10]
# Print the date and how far through dates_set it lies, in percent.
print(PERCENTILE_PRODUCT_COVERED_DATE, np.round(100 * dates_set.index(PERCENTILE_PRODUCT_COVERED_DATE) / len(dates_set)))

In [None]:
def preprocess_dates(data, datecolname):
    """Parse a date column and append calendar features to ``data``.

    A ``date_time`` column is created by parsing ``data[datecolname]`` with the
    ``%Y-%m-%d`` format, then one int16 ``*_dt`` column is appended per
    calendar attribute (year, month, day, ISO week, day of week as 1..7,
    day of year, quarter, the month/quarter/year start and end flags, leap-year
    flag and days in month).

    Args:
        data: DataFrame to extend. It is modified in place.
        datecolname: name of the column holding the dates.

    Returns:
        The same ``data`` object.
    """
    data['date_time'] = pd.to_datetime(data[datecolname], format="%Y-%m-%d")
    parts = data['date_time'].dt

    # (output column, raw calendar attribute) in the order the columns are appended.
    calendar_features = (
        ('year_dt', parts.year),
        ('month_dt', parts.month),
        ('day_dt', parts.day),
        ('weekofyear_dt', parts.isocalendar().week),
        ('dayofweek_dt', parts.dayofweek),
        ('dayofyear_dt', parts.dayofyear),
        ('quarter_dt', parts.quarter),
        ('is_month_start_dt', parts.is_month_start),
        ('is_month_end_dt', parts.is_month_end),
        ('is_quarter_start_dt', parts.is_quarter_start),
        ('is_quarter_end_dt', parts.is_quarter_end),
        ('is_year_start_dt', parts.is_year_start),
        ('is_year_end_dt', parts.is_year_end),
        ('is_leap_year_dt', parts.is_leap_year),
        ('daysinmonth_dt', parts.daysinmonth),
    )
    for column, values in calendar_features:
        data[column] = values.astype('int16')

    # pandas numbers weekdays from 0; shift to 1..7.
    data['dayofweek_dt'] = data['dayofweek_dt'] + 1
    return data

In [None]:
# Calendar feature columns, matching exactly what preprocess_dates creates
# (same names, same order). The earlier list named 'week_dt' and 'weekday_dt',
# which preprocess_dates never produces, and omitted 'weekofyear_dt'.
date_columns = [
    'year_dt', 'month_dt', 'day_dt', 'weekofyear_dt',
    'dayofweek_dt', 'dayofyear_dt', 'quarter_dt',
    'is_month_start_dt', 'is_month_end_dt', 'is_quarter_start_dt',
    'is_quarter_end_dt', 'is_year_start_dt', 'is_year_end_dt',
    'is_leap_year_dt', 'daysinmonth_dt'
]

In [None]:
def merge_additional_info(dataset):
    """Left-join customer, image-feature and text-feature columns onto ``dataset``.

    Customers are joined on ``customer_id``; image and text features on
    ``article_id``. From the text frame every column except ``prod_name`` and
    ``detail_desc`` is kept.

    NOTE(review): this reads the module-level ``image_df`` and ``text_df``; the
    cells that would load them are commented out in this notebook, so calling
    the function as-is raises NameError -- confirm before re-enabling.

    Args:
        dataset: DataFrame with ``customer_id`` and ``article_id`` columns.

    Returns:
        A new DataFrame with the extra columns attached.
    """
    return (
        dataset
        .merge(data['customers'], how='left', on='customer_id')
        .merge(image_df[['article_id', 'image_features']], how='left', on='article_id')
        .merge(
            text_df[[name for name in text_df.columns if name not in ['prod_name', 'detail_desc']]],
            how='left',
            on='article_id',
        )
    )

# Strategy:
1. Most popular baseline (in the training data)
2. Repeat last ordered item baseline (in the training data)
3. Experiments
    1. Users & Products only
    2. Order features
    3. Customer features
    4. Add date parts
    5. Product features
        1. Product metadata
        2. Product Image
        3. Product Description


In [None]:
def slice_and_agg(dates_subset, dates_subset_y):
    """Build one customer-level sample set from a feature window and a target window.

    Transactions dated in ``dates_subset_y`` provide the target: per customer,
    the list of purchased articles (column ``y``). Transactions dated in
    ``dates_subset`` provide the features, restricted to customers that have a
    target. Feature rows are first collapsed to one row per
    (t_dat, customer_id, sales_channel_id, article_id, price) with a ``qty``
    count, extended with calendar columns by ``preprocess_dates``, and finally
    gathered into one list per column per customer.

    Reads the module-level ``data['transactions_train']`` table.

    Args:
        dates_subset: dates that define the feature window.
        dates_subset_y: dates that define the target window.

    Returns:
        DataFrame with one row per customer present in both windows:
        ``customer_id``, one list-valued column per feature, and ``y``.
    """
    transactions = data['transactions_train']

    # Target window: every article bought by each customer, as a list.
    data_slice_y = transactions.loc[transactions['t_dat'].isin(dates_subset_y)].copy()
    data_slice_y = data_slice_y.groupby('customer_id', as_index=False).agg({
        'article_id': lambda x: x.tolist()
    }).rename(columns={'article_id':'y'})[['customer_id', 'y']]

    print(data_slice_y['customer_id'].nunique(), "customers found in validation period..")

    # Feature window, limited to customers that have a target. The customer mask
    # is computed on the slice itself, so it neither scans the full table a
    # second time nor depends on index alignment between two different frames.
    data_slice = transactions.loc[transactions['t_dat'].isin(dates_subset)].copy()
    data_slice = data_slice.loc[data_slice['customer_id'].isin(data_slice_y['customer_id'])].copy()
    data_slice['qty'] = 1

    print(data_slice['customer_id'].nunique(), "customers found in training period..")

    TXN_GROUP_COLS = ['t_dat', 'customer_id', 'sales_channel_id', 'article_id', 'price']

    # Collapse duplicate rows into a quantity, then order chronologically so the
    # per-customer lists built below run from oldest to newest.
    data_slice = (
        data_slice
        .groupby(TXN_GROUP_COLS, as_index=False)
        .agg({'qty': 'sum'})
        .sort_values(TXN_GROUP_COLS)
    )

#     data_slice = merge_additional_info(data_slice)
    data_slice = preprocess_dates(data_slice, 't_dat')
    # One list per remaining column per customer; the raw date columns are dropped.
    data_slice = data_slice.groupby('customer_id', as_index=False).agg({
        col: lambda x: x.tolist()
        for col in data_slice.columns # TXN_GROUP_COLS + ['qty'] + date_columns
        if col not in ['customer_id', 't_dat', 'date_time']
    }).reset_index(drop=True)

    results = data_slice.merge(data_slice_y, how='inner', on='customer_id')
    print(results['customer_id'].nunique(), "customers found in the final dataset..")
    return results

### Preprocessing module

In [None]:
# Registry of feature columns per source, split into categorical and
# continuous names. Every entry starts with its own pair of empty lists.
column_metadata = {
    source: {'categorical': [], 'continuous': []}
    for source in ('products', 'customers', 'date', 'image', 'text')
}

In [None]:
# Columns treated as categorical inputs.
# NOTE(review): 'week_dt' is listed here, but preprocess_dates only creates
# 'weekofyear_dt' -- confirm whether 'week_dt' is produced anywhere.
CAT_FEATURES = [
    # article attributes
    'product_code', 'product_type_no', 'product_type_name', 'product_group_name',
    'graphical_appearance_no', 'graphical_appearance_name',
    'colour_group_code', 'colour_group_name',
    'perceived_colour_value_id', 'perceived_colour_value_name',
    'perceived_colour_master_id', 'perceived_colour_master_name',
    'department_no', 'department_name',
    'index_code', 'index_name', 'index_group_no', 'index_group_name',
    'section_no', 'section_name', 'garment_group_no', 'garment_group_name',
    # customer attributes
    'customer_id', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code',
    # transaction keys and calendar parts
    'sales_channel_id', 'article_id',
    'month_dt', 'day_dt', 'week_dt', 'weekofyear_dt', 'dayofweek_dt', 'dayofyear_dt', 'quarter_dt',
]

# Columns treated as continuous inputs.
CONT_FEATURES = [
    'age', 'price', 'year_dt', 'qty',
    'is_month_start_dt', 'is_month_end_dt', 'is_quarter_start_dt', 'is_quarter_end_dt',
    'is_year_start_dt', 'is_year_end_dt', 'is_leap_year_dt', 'daysinmonth_dt',
]

# Text and image feature columns, and the target column.
TEXT_FEATURES = ['detail_desc_features']
IMAGE_FEATURES = ['image_features']
DEP_FEATURES = ['y']

### Setting up the train validation and CV

In [None]:
# NumPy copy of dates_set so that dates can be selected with lists of positions.
dates_set_array = np.array(dates_set)

In [None]:
# Expanding-window folds over positions in dates_set. Fold i uses
#   train:      positions [0, 7*i)
#   validation: the next 7 positions
#   test:       the 7 positions after that
# Only folds whose test window ends inside dates_set are kept. Otherwise the
# last folds hold positions past the end and raise IndexError when used to
# index dates_set_array.
train_validation_indices = [
    (
        list(range(0, i * 7)),
        list(range(i * 7, (i + 1) * 7)),
        list(range((i + 1) * 7, (i + 2) * 7))
    )
    for i in range(1, (len(dates_set) // 7) + 1)
    if (i + 2) * 7 <= len(dates_set)
]

In [None]:
# for train_i, valid_i, test_i in train_validation_indices:
# Only the first fold is built here.
train_i, valid_i, test_i = train_validation_indices[0]
# Features from the train dates, targets from the validation dates.
train_df = slice_and_agg(dates_set_array[train_i], dates_set_array[valid_i])
# Features from the train + validation dates, targets from the test dates.
valid_df = slice_and_agg(dates_set_array[train_i + valid_i], dates_set_array[test_i])

## Preprocessing

1. Date columns will not have a missing value hence no need for a mask_value

#### Padding and Truncation

In [None]:
# Fixed length that every per-customer sequence is padded or truncated to.
MAX_SEQ_LEN = 10

In [None]:
# P = ['1.1', '2.0']
# # P = ['1', '2']
# # P = [1.2,2.2]
# # P = [1,2]

# print(type(P[0]))

# np.array([0]*5 + P, dtype=type(P[0])).tolist()

In [None]:
def truncate_and_add_padding(x: list, max_seq_len: int, padding_value: int=0):
    """Return ``x`` as a list of exactly ``max_seq_len`` items.

    The last ``max_seq_len`` items of ``x`` are kept and the result is
    left-padded with ``padding_value``. All items, padding included, are
    coerced through NumPy to the type of the first element of ``x``, so a
    padding of 0 becomes 0.0 for floats and '0' for strings.

    Args:
        x: sequence to resize.
        max_seq_len: required output length.
        padding_value: filler placed in front of short sequences.

    Returns:
        A list of length ``max_seq_len``. An empty ``x`` gives a list of
        ``padding_value`` only; a non-positive ``max_seq_len`` gives ``[]``.
    """
    # x[-0:] is the whole list, so a zero length must be handled explicitly.
    if max_seq_len <= 0:
        return []
    # No first element to take the type from: the result is padding only.
    if len(x) == 0:
        return [padding_value] * max_seq_len
    dtype_ = type(x[0])
    tail = x[-max_seq_len:]
    padding = [padding_value] * (max_seq_len - len(tail))
    return np.array(padding + tail, dtype=dtype_).tolist()

In [None]:
# Column name -> list of per-customer values. The customer key and the target
# list `y` are copied as they are; every other column holds sequences that are
# padded / truncated to MAX_SEQ_LEN.
train_data = {
    col: (
        train_df[col].tolist()
        if col in ['customer_id', 'y']
        else train_df[col].apply(lambda seq: truncate_and_add_padding(seq, MAX_SEQ_LEN)).tolist()
    )
    for col in train_df.columns
}

#### https://www.tensorflow.org/guide/keras/preprocessing_layers#preprocessing_data_before_the_model_or_inside_the_model

In [None]:
# Columns available in the aggregated training frame.
train_df.columns

In [None]:
# First five customer ids, to check what will be fed to the hashing layer.
train_data['customer_id'][:5]

In [None]:
# Hashing processing -- flat dimensions
# NOTE(review): input_layer is created but not used in this cell.
input_layer = tf.keras.layers.Input(
    shape=(1,),
    name='customer_id_input',
    dtype=tf.string)
# Use the Hashing layer to hash the values into 64 bins (num_bins=64)
hasher = tf.keras.layers.Hashing(num_bins=64, salt=1337, name='customer_id_hasher')

# Alternative kept for reference: an IntegerLookup applied on top of the hashed values
# encoder = tf.keras.layers.IntegerLookup(max_tokens=64, output_mode="int", name='customer_id_category_encoding')
# encoded_data = encoder(hasher(x['customer_id'].values))
# Apply the hasher directly to the list of customer ids.
encoded_data = hasher(train_data['customer_id'])

In [None]:
# Shape of the hashed customer ids.
encoded_data.numpy().shape

In [None]:
# Distinct sequence lengths in the padded article_id column, with their counts.
np.unique([len(i) for i in train_data['article_id']], return_counts=True)

In [None]:
# Hashing processing --- array dimensions

# NOTE(review): input_layer is created but not used in this cell.
input_layer = tf.keras.layers.Input(
    shape=(None, None),
    name='article_id_input',
    dtype=tf.string)
# mask_value=0 matches the default padding_value of truncate_and_add_padding.
hasher = tf.keras.layers.Hashing(num_bins=64, mask_value=0, name='article_id_hasher')
encoded_data = hasher(train_data['article_id'])

In [None]:
# Small hand-made batch: zeros equal the configured mask_value, ones are regular inputs.
hasher([[0,0,0,0,1,1,1,1], [0,0,0,0,1,1,1,1], [0,0,0,0,1,1,1,1]])

In [None]:
# Display the hashed article_id sequences.
encoded_data

In [None]:
# Distinct product group names and how many there are.
data['articles']['product_group_name'].unique(), data['articles']['product_group_name'].nunique()

In [None]:
# String processing
train_data_sample = data['articles']['product_group_name']

# The shape is given as a tuple, as in the customer_id input above.
# Keras 3 does not accept a bare integer here.
# NOTE(review): input_layer is created but not used in this cell.
input_layer = tf.keras.layers.Input(
    shape=(1,),
    name='product_group_name_input',
    dtype=tf.string)

# Map each product group name to an integer index; names outside the
# vocabulary share the single out-of-vocabulary index.
l2 = tf.keras.layers.StringLookup(max_tokens=None, num_oov_indices=1, output_mode='int', vocabulary=train_data_sample.unique())

In [None]:
# Encode the full product_group_name column with the lookup layer.
l2(data['articles']['product_group_name'].values)

In [None]:
# Hand-made batch to inspect how the lookup layer encodes arbitrary strings such as 'a' and 'b'.
l2([['a', 'b', 'Accessories'], ['a', 'b', 'Accessories']])

#### numerical processing

In [None]:
# Distinct customer ages with their counts.
np.unique(data['customers']['age'], return_counts=True)

In [None]:
# Fit a Normalization layer on customer age; axis=None requests a single
# scalar mean and variance over the whole input.
x = data['customers']['age'].values
n = Normalization(axis=None)
n.adapt(x)

In [None]:
# Apply the fitted normalizer to every integer age from 16 to 99.
n([16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
        29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
        55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67.,
        68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80.,
        81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93.,
        94., 95., 96., 97., 98., 99.])

In [None]:
# Bucket ages with hand-picked boundaries, then apply to every integer age from 16 to 99.
d = Discretization(bin_boundaries=[18, 21, 25, 30, 35, 40 , 45, 50, 55, 60])
d([16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
        29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
        55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67.,
        68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80.,
        81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93.,
        94., 95., 96., 97., 98., 99.])

In [None]:
# Alternative: let the layer derive 10 bins from the observed ages.
# This rebinds `d`, replacing the fixed-boundary layer defined above.
d = Discretization(num_bins=10)
d.adapt(data['customers']['age'].values)

In [None]:
# Apply the adapted discretizer to a small 2-D batch of ages.
d([[16., 17., 18., 19.], [20., 21., 22., 23.], [24., 25., 26., 27.], [96., 97., 98., 99.]])