In [1]:
from collections import Counter
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import cudf
import os

In [2]:
def load_and_preprocess_transactions(path='./transactions_train.csv', parquet_path='transactions.pqt'):
    """Load the transaction log as a cuDF DataFrame, caching it as Parquet.

    If ``parquet_path`` already exists it is read and returned unchanged and
    the CSV is never opened. Otherwise the CSV at ``path`` is parsed with
    pandas, converted on the GPU, written to ``parquet_path`` and returned.

    Conversions applied when the cache is built:
      * ``customer_id``: the last 16 characters are parsed as hexadecimal and
        stored as int64.
      * ``article_id``: cast to int32.
      * ``t_dat``: parsed with ``cudf.to_datetime``.

    Args:
        path: Location of the raw transactions CSV.
        parquet_path: Location of the Parquet cache to read or create.

    Returns:
        cudf.DataFrame with the columns ``t_dat``, ``customer_id`` and
        ``article_id``.
    """
    transaction_columns = [
        't_dat',
        'customer_id',
        'article_id'
    ]
    # NOTE(review): an existing cache is trusted as-is; delete the Parquet file
    # after changing the preprocessing below, otherwise stale data is returned.
    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    print(f"Processing raw CSV file: {path}")
    # Only parse the columns that are kept; the rest of the CSV would be
    # loaded into host memory just to be thrown away.
    df = pd.read_csv(path, usecols=transaction_columns)
    # usecols keeps the file's column order, so re-select to pin the order.
    df = df[transaction_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['article_id'] = gdf['article_id'].astype('int32')
    gdf['t_dat'] = cudf.to_datetime(gdf['t_dat'])

    gdf.to_parquet(parquet_path, index=False)
    print(f"Saved processed data to: {parquet_path}")

    return gdf

In [3]:

# Load the three-column transaction log with the default paths; the function
# reads the Parquet cache when it exists and builds it from the CSV otherwise.
data = load_and_preprocess_transactions()

Loading cached Parquet file: transactions.pqt


In [4]:
# Four consecutive training windows (lower bound inclusive, upper bound
# exclusive), numbered from most recent to oldest. The windows are not equal
# in length: going by the date bounds, train3 covers 9 calendar days and
# train2 covers 7.
train1 = data.loc[(data["t_dat"] >= datetime(2020, 9, 8)) & (data["t_dat"] < datetime(2020, 9, 16))]
train2 = data.loc[(data["t_dat"] >= datetime(2020, 9, 1)) & (data["t_dat"] < datetime(2020, 9, 8))]
train3 = data.loc[(data["t_dat"] >= datetime(2020, 8, 23)) & (data["t_dat"] < datetime(2020, 9, 1))]
train4 = data.loc[(data["t_dat"] >= datetime(2020, 8, 15)) & (data["t_dat"] < datetime(2020, 8, 23))]

# Hold-out set: every transaction dated 2020-09-16 or later.
val = data.loc[data["t_dat"] >= datetime(2020, 9, 16)]

In [5]:
def group_articles(df):
    """Collect each customer's purchased article ids into one entry per customer.

    Args:
        df: cuDF DataFrame with ``customer_id`` and ``article_id`` columns.

    Returns:
        pandas.Series indexed by ``customer_id``; each value holds the article
        ids that customer has in ``df``.

    The grouping runs on the GPU, but the result is moved to the host as a
    Series because the recommendation loops look customers up with
    ``user in result.keys()`` and ``result[user]``. On the one-column
    DataFrame that ``agg`` produces, ``keys()`` yields the column labels, so
    no customer would ever be found.
    """
    collected = df.groupby('customer_id').agg({'article_id': 'collect'})
    return collected['article_id'].to_pandas()

In [6]:
# Per-customer purchase collections for each training window
# (1 = most recent window, 4 = oldest).
positive_items_per_user1 = group_articles(train1)
positive_items_per_user2 = group_articles(train2)
positive_items_per_user3 = group_articles(train3)
positive_items_per_user4 = group_articles(train4)


In [7]:
# Stack the four windows row-wise (most recent window first) into one frame.
train = cudf.concat([train1, train2, train3, train4], axis=0)



In [8]:
# Recency weight per transaction: 1 / (days before ref_date)^2, so recent
# purchases dominate the popularity score.
ref_date = datetime(2020, 9, 16)
train['days_diff'] = (ref_date - train['t_dat']).dt.days
# Every training row is dated before ref_date; assuming t_dat carries no
# time-of-day component, days_diff is >= 1 and the division is safe.
train['pop_factor'] = 1 / (train['days_diff'] ** 2)

In [9]:
# Recency-weighted popularity per article: the sum of its transactions' pop_factor.
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

In [10]:
# Number of transactions per article across the four windows.
# NOTE(review): not referenced again in the visible cells.
items_total_count = train.groupby(['article_id'])['article_id'].count()

In [11]:
# Number of transactions per customer across the four windows.
# NOTE(review): not referenced again in the visible cells.
users_total_count = train.groupby(['customer_id'])['customer_id'].count()

In [12]:
# Build the implicit-feedback target. From here on `train` holds one row per
# (customer, article) pair instead of one row per transaction.
train['feedback'] = 1
# feedback = how many times the customer bought the article in the four windows.
train= train.groupby(['customer_id', 'article_id']).agg({'feedback': 'sum'}).reset_index()
# Attach each article's recency-weighted popularity as a `pop_factor` column.
train = train.merge(popular_items_group, on='article_id')
# Dividing by popularity means a purchase of a widely bought article counts for
# less than a purchase of a rarely bought one.
train['feedback'] = train['feedback'] / train['pop_factor']

In [13]:
# Cap the target at 5.0 so a few very large ratios cannot dominate training.
train['feedback'] = train['feedback'].clip(upper=5.0)


In [14]:
# Shuffle all rows and renumber the index.
# NOTE(review): no random_state is passed, so the row order differs between
# runs; any later step that depends on the row order of `train` is affected.
train = train.sample(frac=1).reset_index(drop=True)
train['feedback'].describe()

count    1.044811e+06
mean     8.252942e-01
std      1.455205e+00
min      5.373625e-03
25%      6.460413e-02
50%      1.771086e-01
75%      6.486035e-01
max      5.000000e+00
Name: feedback, dtype: float64

In [15]:
# Global popularity fallback: transactions from 2020-09-01 up to (not
# including) 2020-09-16, each weighted by 1 / (days before ref_date).
# Note this rebinds `popular_items_group`, which the feedback cell used earlier.
train_pop = data[(data['t_dat'] < datetime(2020, 9, 16)) & (data['t_dat'] >= datetime(2020, 9, 1))]
train_pop['pop_factor'] = 1 / (ref_date - train_pop['t_dat']).dt.days
popular_items_group = train_pop.groupby('article_id')['pop_factor'].sum()

# Move scores and article ids to the host and rank them with a Python sort,
# highest score first. Article ids are unique group keys, so no two
# (score, id) pairs compare equal and a descending sort yields exactly the
# reverse of the ascending one.
values = popular_items_group.to_pandas().values
keys = popular_items_group.index.to_pandas().values
_, popular_items = zip(*sorted(zip(values, keys), reverse=True))

In [19]:
from collections import defaultdict

In [22]:
def get_most_freq_next_item_cudf(train_cudf, min_count=5, min_ratio=0.1):
    """Map each article to the article most often bought right after it.

    Rows are grouped per customer in the order they appear in ``train_cudf``;
    every pair of adjacent rows of one customer with different article ids
    counts as one "item -> next item" observation. The caller must therefore
    pass rows in purchase order: on shuffled or de-duplicated input the
    result is meaningless.

    Args:
        train_cudf: Frame with ``customer_id`` and ``article_id`` columns whose
            columns support ``.to_pandas()`` (a cuDF DataFrame in this notebook).
        min_count: Minimum number of observed successors an article needs
            before a prediction is made for it.
        min_ratio: Minimum share of those observations that the most frequent
            successor must account for.

    Returns:
        dict mapping article id -> most frequent successor article id, limited
        to articles that pass both thresholds.
    """
    customer_ids = train_cudf['customer_id'].to_pandas().values
    article_ids = train_cudf['article_id'].to_pandas().values

    # customer -> article ids in row order
    user_group = defaultdict(list)
    for cid, aid in zip(customer_ids, article_ids):
        user_group[cid].append(aid)

    # article -> every article seen directly after it (repeats of the same
    # article are skipped)
    next_items = defaultdict(list)
    for items in tqdm(user_group.values()):
        for item, next_item in zip(items, items[1:]):
            if item != next_item:
                next_items[item].append(next_item)

    pred_next = {}
    for item, next_list in next_items.items():
        if len(next_list) < min_count:
            continue
        top_item, top_count = Counter(next_list).most_common(1)[0]
        if top_count / len(next_list) >= min_ratio:
            pred_next[item] = top_item

    return pred_next


In [23]:
# Next-item statistics need each customer's purchases in purchase order. By this
# point `train` has been aggregated to one row per (customer, article) pair and
# shuffled, so adjacent rows say nothing about what was bought next. Use the raw
# weekly transaction slices instead, oldest window first.
# Assumes rows inside each slice are already in chronological order, i.e. that
# the source file is sorted by t_dat -- TODO confirm, or sort by t_dat here.
pred_next = get_most_freq_next_item_cudf(cudf.concat([train4, train3, train2, train1], axis=0))

100%|██████████| 254618/254618 [00:01<00:00, 173957.82it/s]


In [28]:
# Inspect the shuffled feedback table: one row per (customer, article) pair.
train

Unnamed: 0,customer_id,article_id,feedback,pop_factor
0,1922204863122141028,884319001,0.032755,30.529496
1,371834049679164774,896170005,0.137613,7.266739
2,-5531003004444685263,878079001,1.046723,0.955363
3,-4967268061075549581,893133001,0.028883,34.622530
4,-2717377534273311790,562245084,0.138711,7.209208
...,...,...,...,...
1044806,6063253376555412458,832307007,0.018132,55.152144
1044807,-2410089262427810293,727539001,1.874579,1.600359
1044808,8005633013617885822,761406001,0.014400,69.444728
1044809,7483539152545825756,906114002,3.837562,0.521164


In [32]:
import cupy as cp


In [24]:
# Requires the third-party `reco` package; the recorded run of this cell failed
# with ModuleNotFoundError, so it has to be installed in the kernel first.
from reco.recommender import FunkSVD
from reco.metrics import rmse

# k = number of dimensions of the latent embedding. formatizer dict takes in names of the columns
# for user, item and values/feedback/ratings respectively.

svd = FunkSVD(k=8, learning_rate=0.008, regularizer = .01, iterations = 80, method = 'stochastic', bias=True)

# `train` is a cuDF (GPU) frame, and cuDF does not allow implicit conversion to
# host objects. reco is presumably a plain pandas/NumPy library -- TODO confirm
# -- so hand it a host copy.
train_host = train.to_pandas() if hasattr(train, "to_pandas") else train
svd.fit(X=train_host, formatizer={'user':'customer_id', 'item':'article_id', 'value':'feedback'},verbose=True)

ModuleNotFoundError: No module named 'reco'

In [None]:
def apk(actual, predicted, k=12):
    """Average precision at k for a single prediction list.

    Args:
        actual: Collection of relevant items.
        predicted: Ordered sequence of predicted items, best first.
        k: Cut-off; only the first ``k`` predictions are scored.

    Returns:
        Sum of precision-at-rank over the ranks where a relevant item appears
        for the first time, divided by ``min(len(actual), k)``; ``0.0`` when
        ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # Skip misses, and repeated predictions that were already scored at an
        # earlier rank.
        if candidate not in actual or candidate in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Mean of ``apk`` over paired ground-truth and prediction lists.

    ``actual`` and ``predicted`` are iterated in lockstep, one entry per user.
    """
    per_user_scores = []
    for truth, guess in zip(actual, predicted):
        per_user_scores.append(apk(truth, guess, k))
    return np.mean(per_user_scores)

In [None]:
# `val` is a cuDF frame, and applying the Python `list` callable to its groups
# needs host iteration that cuDF does not provide. The ground truth is consumed
# by the pure-Python mapk anyway, so build it on the host.
positive_items_val = val.to_pandas().groupby('customer_id')['article_id'].apply(list)
val_users = positive_items_val.keys()
# Values come out in the same order as val_users; no per-user lookup is needed.
val_items = positive_items_val.tolist()

print("Total users in validation:", len(val_users))

In [None]:
from collections import Counter

# Baseline recommendations: for each validation customer take their most-bought
# articles per training window (most recent window first), add the "usually
# bought next" article of each candidate, then pad with globally popular items.
outputs = []

popular_items = list(popular_items)
weekly_purchases = [
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
]

for user in tqdm(val_users):
    user_output = []
    for purchases in weekly_purchases:
        if user in purchases.keys():
            # Articles of this window ordered by how often the customer bought them.
            ranked = [item for item, _ in Counter(purchases[user]).most_common()]
            user_output += ranked[:12]

    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]

    # Pad only when fewer than 12 candidates were found. Without the clamp a
    # customer with more than 12 candidates produced a negative slice bound,
    # which appends almost the entire popularity list.
    user_output += popular_items[:max(0, 12 - len(user_output))]
    outputs.append(user_output)

print("mAP Score on Validation set:", mapk(val_items, outputs))

#### Now, prediction WITH the SVD reranking!

In [None]:
from collections import Counter

# Same candidate generation as the baseline, but the candidates of each window
# are re-ordered by the SVD model's predicted feedback before being added.
outputs = []

popular_items = list(popular_items)
userindexes = {svd.users[i]:i for i in range(len(svd.users))}
# Position of each item in svd.items, built once. setdefault keeps the first
# occurrence, matching what svd.items.index(k) returned, without scanning the
# whole list for every candidate.
itemindexes = {}
for position, item in enumerate(svd.items):
    itemindexes.setdefault(item, position)

weekly_purchases = [
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
]


def _rerank_with_svd(user_index, purchased_items):
    """Return up to 12 of one customer's window purchases, ordered by SVD score."""
    new_order = {}
    # Only the 20 most frequently bought articles are scored.
    for k, count in Counter(purchased_items).most_common()[:20]:
        itemindex = itemindexes.get(k)
        if itemindex is None:
            # Article unknown to the model: fall back to the purchase count.
            pred_value = count
        else:
            pred_value = np.dot(svd.userfeatures[user_index], svd.itemfeatures[itemindex].T) + svd.item_bias[0, itemindex]
        new_order[k] = pred_value
    # NOTE(review): the sort is ascending, so the LOWEST predicted feedback is
    # ranked first. Kept as written; confirm this is intended rather than a
    # missing reverse=True.
    return [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]


for user in tqdm(val_users):
    user_output = []
    for purchases in weekly_purchases:
        if user in purchases.keys():
            user_index = userindexes[user]
            user_output += _rerank_with_svd(user_index, purchases[user])

    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]

    # Pad only when fewer than 12 candidates were found. Without the clamp a
    # customer with more than 12 candidates produced a negative slice bound,
    # which appends almost the entire popularity list.
    user_output += popular_items[:max(0, 12 - len(user_output))]
    outputs.append(user_output)

print("mAP Score on Validation set:", mapk(val_items, outputs))

A decent jump of about 0.001 in the validation mAP from the SVD reranking, I would say, using 4 weeks of training data!

# Submission

We repeat the same steps, with every time window shifted forward so that the most recent transactions are included in training.

In [None]:
# Submission windows: the four most recent windows before the 2020-09-23 cut-off.
# `datetime` is the class brought in by `from datetime import datetime`, so it
# is called directly; `datetime.datetime(...)` raises AttributeError.
submission_ref_date = datetime(2020, 9, 23)

train1 = data.loc[(data["t_dat"] >= datetime(2020, 9, 16)) & (data["t_dat"] < submission_ref_date)]
train2 = data.loc[(data["t_dat"] >= datetime(2020, 9, 8)) & (data["t_dat"] < datetime(2020, 9, 16))]
train3 = data.loc[(data["t_dat"] >= datetime(2020, 8, 31)) & (data["t_dat"] < datetime(2020, 9, 8))]
train4 = data.loc[(data["t_dat"] >= datetime(2020, 8, 23)) & (data["t_dat"] < datetime(2020, 8, 31))]


def _collect_articles_per_user(transactions):
    """Return a pandas Series mapping customer_id to that customer's article ids.

    The grouping runs on the GPU; the result is moved to the host because the
    prediction loop looks customers up one at a time with `in` and `[]`.
    """
    grouped = transactions.groupby('customer_id').agg({'article_id': 'collect'})
    return grouped['article_id'].to_pandas()


positive_items_per_user1 = _collect_articles_per_user(train1)
positive_items_per_user2 = _collect_articles_per_user(train2)
positive_items_per_user3 = _collect_articles_per_user(train3)
positive_items_per_user4 = _collect_articles_per_user(train4)

# Popularity fallback from the two most recent windows, each transaction
# weighted by 1 / (days before the cut-off). `data` is a cuDF frame, so the
# cuDF concat and vectorised datetime arithmetic are used.
train = cudf.concat([train1, train2], axis=0)
train['pop_factor'] = 1 / (submission_ref_date - train['t_dat']).dt.days
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

# Rank on the host: highest score first, ties broken by larger article id.
popularity_host = popular_items_group.to_pandas()
_, popular_items = zip(*sorted(zip(popularity_host.values, popularity_host.index.values))[::-1])

# NOTE(review): user_group is not used by any later visible cell, and
# `pred_next` is still the mapping computed on the validation-period windows.
user_group = _collect_articles_per_user(cudf.concat([train1, train2, train3, train4], axis=0))

In [None]:
# SVD
train = pd.concat([train1, train2, train3, train4], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days**2)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

train['feedback'] = 1
train = train.groupby(['customer_id', 'article_id']).sum().reset_index()

train['feedback'] = train.apply(lambda row: row['feedback']/popular_items_group[row['article_id']], axis=1)

train['feedback'] = train['feedback'].apply(lambda x: 5.0 if x>5.0 else x)
train.drop(['price', 'sales_channel_id'], axis=1, inplace=True)
train['feedback'].describe()

In [None]:
# Shuffle all rows and renumber the index, then preview the result.
# NOTE(review): no random_state is passed, so the order differs between runs.
train = train.sample(frac=1).reset_index(drop=True)
train.head()

In [None]:
# Requires the third-party `reco` package to be installed in the kernel.
from reco.recommender import FunkSVD
from reco.metrics import rmse

f = FunkSVD(k=8, learning_rate=0.005, regularizer = .01, iterations = 200, method = 'stochastic', bias=True)

# reco is presumably a plain pandas/NumPy library -- TODO confirm -- so give it
# a host copy when `train` is a cuDF frame; a pandas frame passes through as-is.
train_host = train.to_pandas() if hasattr(train, "to_pandas") else train
f.fit(X=train_host, formatizer={'user':'customer_id', 'item':'article_id', 'value':'feedback'},verbose=True)

In [None]:
# Load the submission template; its customer_id column drives the prediction loop.
# NOTE(review): this path is relative to a `../input/...` directory, whereas the
# transactions default to the working directory -- confirm both locations exist.
submission = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
submission.head()

In [None]:
# Build the final recommendations for every customer in the submission file.
outputs = []

popular_items = list(popular_items)
userindexes = {f.users[i]:i for i in range(len(f.users))}
# Position of each item in f.items, built once. setdefault keeps the first
# occurrence, matching what f.items.index(k) returned, without scanning the
# whole list for every candidate.
itemindexes = {}
for position, item in enumerate(f.items):
    itemindexes.setdefault(item, position)

weekly_purchases = [
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
]

# The transaction data keys customers by the int64 parsed from the last 16 hex
# characters of the id, while the submission file holds the raw id strings.
# Convert them with the same cuDF expression the loader uses, so that lookups
# can match; with the raw strings no customer is ever found.
submission_user_ids = (
    cudf.from_pandas(submission['customer_id'])
    .str[-16:].str.hex_to_int().astype('int64')
    .to_pandas()
)


def _rerank_with_svd(user_index, purchased_items):
    """Return up to 12 of one customer's window purchases, ordered by SVD score."""
    new_order = {}
    # Only the 20 most frequently bought articles are scored.
    for k, count in Counter(purchased_items).most_common()[:20]:
        itemindex = itemindexes.get(k)
        if itemindex is None:
            # Article unknown to the model: fall back to the purchase count.
            pred_value = count
        else:
            pred_value = np.dot(f.userfeatures[user_index], f.itemfeatures[itemindex].T) + f.item_bias[0, itemindex]
        new_order[k] = pred_value
    # NOTE(review): the sort is ascending, so the LOWEST predicted feedback is
    # ranked first. Kept as written; confirm this is intended rather than a
    # missing reverse=True.
    return [k for k, v in sorted(new_order.items(), key=lambda item: item[1])][:12]


for user in tqdm(submission_user_ids):
    user_output = []
    for purchases in weekly_purchases:
        if user in purchases.keys():
            user_index = userindexes[user]
            user_output += _rerank_with_svd(user_index, purchases[user])

    # NOTE(review): pred_next is the mapping computed in the validation section;
    # it is not recomputed for the submission windows.
    user_output += [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]

    # Pad only when fewer than 12 candidates were found. Without the clamp a
    # customer with more than 12 candidates produced a negative slice bound,
    # which appends almost the entire popularity list.
    user_output += popular_items[:max(0, 12 - len(user_output))]
    # Keep at most 12 predictions per customer.
    outputs.append(user_output[:12])

# article_id was cast to int32 by the loader, which drops leading zeros.
# Assumes the original ids are 10-character zero-padded strings -- TODO confirm
# against the prediction column of sample_submission.csv.
str_outputs = []
for output in outputs:
    str_outputs.append(" ".join([str(x).zfill(10) for x in output]))

In [None]:
# str_outputs was built in the same order as submission['customer_id'], so a
# positional assignment lines each prediction string up with its customer.
submission['prediction'] = str_outputs
submission.to_csv("submissions.csv", index=False)

In [None]:
# Preview the first rows of the submission that was just written.
submission.head()

#### That's it! Upvote and Enjoy!
Examples of FunkSVD and FM are over at https://github.com/mayukh18/reco/tree/master/examples. I'll try to add an example of [Wide And Deep Network](https://arxiv.org/pdf/1606.07792.pdf) in the coming days. That is also a pretty cool model to try next.