In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import itertools

import numpy as np
import pandas as pd

In [None]:
# Render every float with four decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
catalog.list()

In [None]:
candidates = context.catalog.load('candidates_sample')

In [None]:
candidates.shape

In [None]:
candidates.isna().sum()/candidates.shape[0]

In [None]:
candidates.set_index('customer_id', inplace=True)

In [None]:
dfs = list()

In [None]:
%%time
# Unpack every strategy column into a long frame with one row per
# (customer_id, article_id) and a `strategy_name` tag.
# Presumably each cell holds a list of article ids -- explode() is what turns
# them into rows; verify against the `candidates_sample` dataset.
# The accumulator is rebuilt here so that re-running this cell does not append
# the same frames a second time.
dfs = []
for column in candidates.columns:
    print(column)
    df = (
        candidates[[column]]
        .explode(column)
        .rename(columns={column: 'article_id'})
        .dropna()
        # tag each row with the strategy (column) it came from
        .assign(strategy_name=column)
    )
    print(df.shape)
    dfs.append(df)

In [None]:
dfs[4]

In [None]:
unpacked_candidates = pd.concat(dfs, axis=0).reset_index()

In [None]:
unpacked_candidates.shape

In [None]:
unpacked_candidates.head()

### random

In [None]:
# Shuffle, then keep the first row of every (customer, article) pair, i.e. a
# randomly chosen strategy per pair. The fixed seed makes the draw reproducible.
unpacked_candidates.sample(frac=1, random_state=42).groupby(['customer_id', 'article_id']).head(1).shape

In [None]:
# Preview of the randomly de-duplicated candidates. The fixed seed makes the
# shuffle (and therefore the rows shown) reproducible across runs.
unpacked_candidates.sample(frac=1, random_state=42).groupby(['customer_id', 'article_id']).head(1).head()

### removing duplicate CANDIDATES (from multiple strategies)

In [None]:
# For every row, record how many strategies proposed the same
# (customer_id, article_id) pair.
unpacked_candidates = unpacked_candidates.assign(
    count_strategy_name=lambda frame: (
        frame.groupby(['customer_id', 'article_id'])['strategy_name'].transform('count')
    )
)

In [None]:
unpacked_candidates.shape

In [None]:
unpacked_candidates.head()

In [None]:
unpacked_candidates.strategy_name.value_counts()

In [None]:
unpacked_candidates[['strategy_name', 'count_strategy_name']].value_counts()

In [None]:
# Pairs proposed by more than one strategy share a single label, so that the
# later drop_duplicates() collapses them into one row.
unpacked_candidates.loc[
    unpacked_candidates['count_strategy_name'].gt(1), 'strategy_name'
] = 'multiple_strategies'

In [None]:
unpacked_candidates[['strategy_name', 'count_strategy_name']].value_counts()

In [None]:
unpacked_candidates.drop(['count_strategy_name'], axis=1).drop_duplicates().shape

In [None]:
unpacked_candidates.drop(['count_strategy_name'], axis=1).drop_duplicates().groupby(['customer_id'])['article_id'].count().describe(percentiles=[0.05, 0.1, 0.2, 0.8, 0.9, 0.95])

In [None]:
unpacked_candidates.drop(['count_strategy_name'], axis=1).drop_duplicates().strategy_name.value_counts()

In [None]:
articles = context.catalog.load('articles_sample')

In [None]:
customers = context.catalog.load('customers_sample')

In [None]:
transactions = context.catalog.load('transactions_sample')

In [None]:
articles.shape, customers.shape, transactions.shape

### articles, customer features (non-interaction)

##### count -> user-item, user-category of last week/month/season/same week of last year

In [None]:
# Number of transaction rows with a non-null article_id, per customer.
(
    transactions
    .groupby('customer_id', as_index=False)
    .agg(count_of_article_per_customer=('article_id', 'count'))
)

In [None]:
# Number of distinct product groups each customer has bought from.
(
    transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')
    .groupby('customer_id')['product_group_name']
    # distinct non-null groups == count after de-duplicating (customer, group) pairs
    .nunique()
    .reset_index(name='count_of_product_group_name_per_customer')
)

##### time -> first, last days of transactions

In [None]:
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [None]:
max_date = transactions['t_dat'].max()

In [None]:
max_date

In [None]:
# Whole days between each customer's earliest transaction and the latest
# transaction date in the data (max_date).
(
    transactions
    .groupby('customer_id')['t_dat']
    .min()
    .pipe(lambda first_seen: max_date - first_seen)
    .dt.days
    .reset_index(name='days_since_first_transaction')
)

In [None]:
# Whole days between each customer's most recent transaction and the latest
# transaction date in the data (max_date).
(
    transactions
    .groupby('customer_id')['t_dat']
    .max()
    .pipe(lambda last_seen: max_date - last_seen)
    .dt.days
    .reset_index(name='days_since_last_transaction')
)

In [None]:
# Average number of whole days between consecutive purchase dates, per customer.
# Customers with a single purchase date have no gap and are left out.
purchase_dates = (
    transactions[['customer_id', 't_dat']]
    .drop_duplicates()
    .sort_values(by=['customer_id', 't_dat'])
)
# diff() is taken inside each customer's group, so a gap never straddles two
# customers and no "same customer as previous row" filter is needed.
sorted_transactions = purchase_dates.assign(
    purchase_span=purchase_dates.groupby('customer_id')['t_dat'].diff()
)
(
    sorted_transactions
    # the first purchase date of every customer has no predecessor (NaT gap)
    .dropna(subset=['purchase_span'])
    .groupby('customer_id')['purchase_span']
    .mean().dt.days
    .reset_index(name='avg_purchase_span')
)

In [None]:
# average/median purchase span for each customer/article pair
print('todo later')

##### percentage of rebuying items

In [None]:
# Per article: fraction (0..1) of its buyers who bought it more than once.
(
    transactions
    .groupby(['customer_id', 'article_id'], as_index=False)
    .agg(times_bought=('t_dat', 'count'))
    .assign(rebought=lambda pairs: pairs['times_bought'].gt(1).astype(int))
    .groupby('article_id')['rebought']
    .mean()
    .rename('perc_rebought')
    .reset_index()
)

##### sales_channel_id percentage

In [None]:
transactions.sales_channel_id.value_counts()

In [None]:
# Mean of (sales_channel_id - 1) per customer.
# NOTE(review): this is an "offline share" only if sales_channel_id takes the
# values 1 and 2 with 2 meaning offline -- confirm against the data dictionary.
(
    transactions
    .assign(if_offline=transactions['sales_channel_id'].sub(1))
    .groupby('customer_id')['if_offline']
    .mean()
    .rename('perc_customer_sales_offline')
    .reset_index()
)

In [None]:
# Mean of (sales_channel_id - 1) per article.
# NOTE(review): this is an "offline share" only if sales_channel_id takes the
# values 1 and 2 with 2 meaning offline -- confirm against the data dictionary.
(
    transactions
    .assign(if_offline=transactions['sales_channel_id'].sub(1))
    .groupby('article_id')['if_offline']
    .mean()
    .rename('perc_article_sales_offline')
    .reset_index()
)

##### item attributes count

##### count prod_group_name

In [None]:
# How many transactions each customer made in each product group.
(
    transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')
    .groupby(['customer_id', 'product_group_name'], as_index=False)
    .agg(count_from_prod_group_name=('article_id', 'count'))
)

In [None]:
# NOTE: not ported to the Kedro pipeline -- it is too similar to the
# per-product-group purchase count computed in the previous cell.
# Flag (always 1) for every product group a customer has bought from.
(
    transactions
    .merge(articles[['article_id', 'product_group_name']], on='article_id')
    .loc[:, ['customer_id', 'product_group_name']]
    .drop_duplicates()
    .assign(has_bought_prod_group=1)
)

- count of user-item and user-item attributes in the last 7/30/all days

In [None]:
# Number of transactions per (customer, article) pair over the whole history.
(
    transactions
    .groupby(['customer_id', 'article_id'], as_index=False)
    .agg(count_customer_article_id=('t_dat', 'count'))
)

### candidate features

##### jaccard similarity

The actual comparison was simpler: I calculated the average Jaccard index of the item attributes. Suppose a customer has 3 items in their history:

item_1 attributes A,B,C  
item_2 attributes B,C,D  
item_3 attributes A,B,D

Now consider a new item_4 with attributes A,B. Its Jaccard index against each item in the history is calculated like this:

item_1 attributes A,B,C = jaccard 0.67 (2 shared / 3 in the union)  
item_2 attributes B,C,D = jaccard 0.25 (1 shared / 4 in the union)  
item_3 attributes A,B,D = jaccard 0.67 (2 shared / 3 in the union)

so the average similarity is about 0.53

I had a different version that calculates a "fuzzy" attribute similarity based on co-occurrence in baskets: if attribute=X co-occurred with attribute=Y in 50% of the baskets, their similarity is 50%, rather than strictly 0 or 1 as in the original.

In [None]:
articles.columns

In [None]:
# Categorical descriptor columns that make up each article's attribute set.
cat_articles = articles.loc[:, [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]]

In [None]:
jaccard_articles = articles[['article_id']].copy()

In [None]:
jaccard_articles['set_of_attributes'] = cat_articles.apply(set, axis=1)

In [None]:
jaccard_articles

In [None]:
jaccard_articles.set_index(['article_id'], inplace=True)

In [None]:
jaccard_articles

In [None]:
type(jaccard_articles)

### so inefficient

In [None]:
jaccard_articles.add_suffix('_first').merge(jaccard_articles.add_suffix('_second'), how='cross')

### so inefficient :DD

In [None]:
# Pair every attribute set with its article id. article_id was moved into the
# index of jaccard_articles by an earlier cell, so the ids are read from the
# index; selecting it as a column would raise KeyError.
attributes_users = list(zip(jaccard_articles["set_of_attributes"], jaccard_articles.index))

In [None]:
result = []

In [None]:
# Jaccard similarity for every unordered pair of (attribute set, article id)
# entries. The work is quadratic in the number of articles, so this is only
# practical on a small sample.
# NOTE: the "user1"/"user2" keys actually hold article ids; the key names are
# kept so the columns of the resulting frame do not change.
# `result` is rebuilt here so that re-running the cell does not append duplicates.
result = []
# combinations() is consumed lazily; there is no need to materialise all pairs first.
for (first_attrs, first_id), (second_attrs, second_id) in itertools.combinations(attributes_users, 2):
    union_size = len(first_attrs | second_attrs)
    # two empty attribute sets have no defined ratio; score such a pair as 0.0
    similarity = len(first_attrs & second_attrs) / union_size if union_size else 0.0
    result.append({"user1": first_id, "user2": second_id, "similarity": similarity})

In [None]:
pd.DataFrame(result)

In [None]:
def filter_last_n_transactions_per_customer(transactions: pd.DataFrame, last_n_transactions: int) -> pd.DataFrame:
    """Keep each customer's most recent transactions.

    Args:
        transactions: frame with at least ``customer_id`` and ``t_dat`` columns;
            ``t_dat`` may hold anything ``pd.to_datetime`` can parse.
        last_n_transactions: maximum number of rows to keep per customer.

    Returns:
        A new frame ordered by ``t_dat`` descending, holding up to
        ``last_n_transactions`` rows per customer, with ``t_dat`` converted to
        datetimes. The input frame is left unmodified.
    """
    # assign() works on a copy, so the caller's frame keeps its original t_dat column
    parsed = transactions.assign(t_dat=pd.to_datetime(transactions['t_dat']))
    newest_first = parsed.sort_values(by='t_dat', ascending=False)
    # head(n) on the date-sorted rows keeps the n most recent rows of each customer
    return newest_first.groupby(['customer_id']).head(last_n_transactions)

In [None]:
transactions.shape

In [None]:
for n in [1, 3, 5, 10]:
    print(filter_last_n_transactions_per_customer(transactions, n).shape)

In [None]:
transactions.customer_id.nunique()

### works

In [None]:
def jaccard(x, y):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``len(intersection) / len(union)``, a float in
    ``[0, 1]``. When both collections are empty the ratio is undefined; 0.0 is
    returned instead of raising ``ZeroDivisionError``.
    """
    first, second = set(x), set(y)
    union_cardinality = len(first | second)
    if union_cardinality == 0:
        # nothing to compare: treat two empty collections as having no similarity
        return 0.0
    return len(first & second) / float(union_cardinality)

In [None]:
# article_id only needs to be moved into the index once. An earlier cell already
# does this, and calling set_index again on the indexed frame raises KeyError,
# so the call is skipped when the column is no longer present.
if 'article_id' in jaccard_articles.columns:
    jaccard_articles = jaccard_articles.set_index(['article_id'])

In [None]:
jaccard_articles

In [None]:
jaccard_articles.loc['0679630001']['set_of_attributes']

In [None]:
def avg_jaccard_similarity(
    candidate_item: str,
    customer_id: str,
    articles_attributes: pd.DataFrame,
    transactions: pd.DataFrame):
    """Average Jaccard similarity between a candidate and a customer's purchases.

    Args:
        candidate_item: article id of the candidate; must be present in the
            index of ``articles_attributes``.
        customer_id: customer whose purchase history is compared against.
        articles_attributes: frame indexed by article id with a
            ``set_of_attributes`` column.
        transactions: frame with ``customer_id`` and ``article_id`` columns.

    Returns:
        The mean of ``jaccard(candidate attributes, item attributes)`` over the
        distinct articles the customer bought, or 0 when the customer has no
        transactions (same convention as ``calculate_avg_jaccard_similarity``).

    Raises:
        KeyError: if the candidate or a bought article is missing from
            ``articles_attributes``.
    """
    attribute_sets = articles_attributes['set_of_attributes']
    candidate_item_attributes = attribute_sets.loc[candidate_item]
    bought_items = transactions.loc[
        transactions['customer_id'] == customer_id, 'article_id'
    ].unique()
    if len(bought_items) == 0:
        # no purchase history: avoid dividing by zero below
        return 0
    jaccard_similarity_list = [
        jaccard(candidate_item_attributes, attribute_sets.loc[item])
        for item in bought_items
    ]
    return sum(jaccard_similarity_list) / len(jaccard_similarity_list)

In [None]:
avg_jaccard_similarity(candidate_item='0545612001',
                       articles_attributes=jaccard_articles,
                       customer_id='fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e593881ae6007d775f0f',
                       transactions=transactions
                      )

make it into whole dataframe calculation

In [None]:
def calculate_avg_jaccard_similarity(
    candidates_df: pd.DataFrame,
    articles_attributes: pd.DataFrame,
    transactions: pd.DataFrame):
    """Row-wise variant of the average Jaccard similarity, for ``DataFrame.apply``.

    ``candidates_df`` is read through its ``customer_id`` and ``article_id``
    attributes (a single candidate row when used with ``apply(axis=1)``). The
    candidate's ``set_of_attributes`` is compared with that of every distinct
    article the customer bought, and the mean Jaccard similarity is returned;
    a customer without transactions scores 0.
    """
    user_id = candidates_df.customer_id
    target_attributes = articles_attributes.loc[candidates_df.article_id]['set_of_attributes']
    history_mask = transactions['customer_id'] == user_id
    purchased = list(transactions[history_mask].article_id.unique())
    if len(purchased) == 0:
        return 0
    scores = [
        jaccard(target_attributes, articles_attributes.loc[purchased_item]['set_of_attributes'])
        for purchased_item in purchased
    ]
    return sum(scores) / len(scores)

In [None]:
# Toy candidate set: 5 sampled customers, each paired with 3 of 15 sampled articles.
# Fixed seeds keep the sample, and every output derived from it, reproducible.
candidates_df = pd.DataFrame({
    'customer_id': sorted(customers['customer_id'].sample(5, random_state=42).to_list() * 3),
    'article_id': articles['article_id'].sample(15, random_state=42).to_list(),
})

In [None]:
candidates_df.head()

In [None]:
candidates_df.apply(lambda x: calculate_avg_jaccard_similarity(x, jaccard_articles, transactions), axis=1)

### image/text embeddings

In [None]:
image_embeddings = context.catalog.load('image_embeddings')

In [None]:
image_embeddings

In [None]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    """Average the embedding rows of the articles that appear in ``df``.

    Args:
        df: frame with an ``article_id`` column (e.g. one customer's transactions).
        embeddings: frame whose index holds article ids, one row per article.

    Returns:
        A list with the column-wise mean over the rows of ``embeddings`` whose
        index value occurs in ``df.article_id``.
    """
    bought_article_ids = df['article_id'].unique().tolist()
    has_embedding_row = embeddings.index.isin(bought_article_ids)
    return list(embeddings.loc[has_embedding_row].mean(axis=0))

In [None]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, image_embeddings)).reset_index(name='embeddings')

In [None]:
cust_embeddings.shape

In [None]:
image_embeddings.shape

In [None]:
text_embeddings = context.catalog.load('text_embeddings')

In [None]:
text_embeddings.shape

In [None]:
%%time
transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

to split embeddings from list to single columns

df3 = df2.teams.apply(pd.Series) <br>
df3.columns = ['team1', 'team2']

In [None]:
np.array(cust_embeddings.set_index('customer_id').loc['000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2facbe072f763618bbc'].embeddings)

In [None]:
image_embeddings.loc['0878510001'].values

In [None]:
def cosine_similarity(A, B):
    """Return the cosine similarity ``dot(A, B) / (|A| * |B|)`` of two vectors.

    If either vector has zero norm the ratio is undefined; 0.0 is returned
    instead of NaN (and instead of emitting a divide RuntimeWarning).
    """
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    if norm_product == 0:
        # an all-zero vector has no direction to compare
        return 0.0
    return np.dot(A, B) / norm_product

In [None]:
def cosine_embedding_similarity(candidates_df, customers_embeddings, items_embeddings):
    """Cosine similarity between a customer's embedding and a candidate article's.

    Args:
        candidates_df: object exposing ``customer_id`` and ``article_id``
            attributes (a single candidate row when used with ``apply(axis=1)``).
        customers_embeddings: frame with an ``embeddings`` column, either
            carrying a ``customer_id`` column or already indexed by customer id.
        items_embeddings: frame indexed by article id, one embedding per row.

    Returns:
        ``cosine_similarity(customer embedding, article embedding)``, or 0 when
        the customer or the article has no embedding.
    """
    candidate_item, candidate_user = candidates_df.article_id, candidates_df.customer_id
    # Use the frame that was passed in, not a notebook-level global.
    if 'customer_id' in customers_embeddings.columns:
        customers_embeddings = customers_embeddings.set_index('customer_id')
    try:
        customer_embeddings = np.array(customers_embeddings.loc[candidate_user]['embeddings'])
    except KeyError:
        # unknown customer: nothing to compare against
        return 0
    try:
        candidate_embeddings = items_embeddings.loc[candidate_item].values
    except KeyError:
        # article without an embedding
        return 0
    return cosine_similarity(customer_embeddings, candidate_embeddings)

In [None]:
# Fresh toy candidate set for the embedding similarities: 5 sampled customers,
# each paired with 3 of 15 sampled articles.
# Fixed seeds keep the sample, and every output derived from it, reproducible.
candidates_df = pd.DataFrame({
    'customer_id': sorted(customers['customer_id'].sample(5, random_state=43).to_list() * 3),
    'article_id': articles['article_id'].sample(15, random_state=43).to_list(),
})

In [None]:
candidates_df.head()

In [None]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, image_embeddings), axis=1)

text similarities

In [None]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

In [None]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, text_embeddings), axis=1)

all similarities

collaborative filtering still AWAITS :<<<

candidates -> STRATEGY FEATURE, MAYBE SCORE?