In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
catalog.list()

In [None]:
candidates = context.catalog.load('candidates_sample')

In [None]:
candidates.shape

In [None]:
# Share of missing values per column (the mean of a boolean mask is the fraction of True).
candidates.isna().mean()

In [None]:
# Use customer_id as the index; rebinding the name instead of mutating in place.
candidates = candidates.set_index('customer_id')

In [None]:
dfs = list()

In [None]:
%%time
for column in candidates.columns:
    print(column)
    df = candidates[[column]].explode(column).rename({column: 'article_id'}, axis=1).dropna()
    df = df.assign(strategy_name=lambda x: column)
    print(df.shape)
    dfs.append(df)

In [None]:
dfs[4]

In [None]:
# Stack the per-column frames row-wise (axis=0 is concat's default) and move the
# index back into a regular column.
candidates = pd.concat(dfs).reset_index()

In [None]:
candidates.shape

In [None]:
candidates.head()

#### other dataframes

In [None]:
articles = context.catalog.load('articles')

In [None]:
customers = context.catalog.load('customers_sample')

In [None]:
transactions = context.catalog.load('transactions_sample')

In [None]:
articles.shape, customers.shape, transactions.shape

### candidate features

##### jaccard similarity

The actual comparison was simpler: I calculated the average Jaccard index of the attributes. Suppose a customer has 3 items in their history:

item_1 attributes A,B,C  
item_2 attributes B,C,D  
item_3 attributes A,B,D

Now consider a new item_4 with attributes A,B. The average Jaccard index is calculated like this:

item_1 attributes A,B,C = jaccard 2/3 ≈ 0.67  
item_2 attributes B,C,D = jaccard 1/4 = 0.25  
item_3 attributes A,B,D = jaccard 2/3 ≈ 0.67

so the average similarity is ≈ 0.53

I also had a different version that calculates a "fuzzy" attribute similarity based on how often attributes co-occur in baskets: if attribute=X co-occurred with attribute=Y in 50% of the baskets, their similarity is 50% rather than the 0 or 1 of the original.

In [None]:
articles.columns

In [None]:
# Descriptive article columns that are later collapsed into one attribute set per article.
cat_articles = articles.loc[
    :,
    [
        'product_type_name',
        'product_group_name',
        'graphical_appearance_name',
        'colour_group_name',
        'perceived_colour_value_name',
        'perceived_colour_master_name',
        'index_name',
        'index_group_name',
        'section_name',
        'garment_group_name',
    ],
]

In [None]:
jaccard_articles = articles[['article_id']].copy()

In [None]:
jaccard_articles['set_of_attributes'] = cat_articles.apply(set, axis=1)

In [None]:
jaccard_articles

In [None]:
# Index the attribute sets by article_id so they can be looked up with .loc[article_id].
jaccard_articles = jaccard_articles.set_index('article_id')

In [None]:
jaccard_articles

### works

In [None]:
def jaccard(x, y):
    """Return the Jaccard similarity between two sets.

    The similarity is the size of the intersection divided by the size of the union.

    Args:
        x: first set.
        y: second set.

    Returns:
        A float between 0.0 and 1.0. When both sets are empty the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    intersection_cardinality = len(x.intersection(y))
    union_cardinality = len(x.union(y))
    try:
        return float(intersection_cardinality / union_cardinality)
    except ZeroDivisionError:
        # Empty union means both inputs are empty: report "no similarity".
        return 0.0

In [None]:
# One row per customer: the index is customer_id and the single 'article_id' column
# holds the list of distinct articles that customer appears with in `transactions`.
df_cust_article = (
    transactions[['customer_id', 'article_id']]
    .drop_duplicates()
    .groupby(['customer_id'])['article_id']
    .apply(list)
    .to_frame()
)

In [None]:
df_cust_article.head()

In [None]:
# Select the cell by row label and column label. A bare `[0]` on the returned row
# relies on the deprecated positional fallback of Series.__getitem__.
df_cust_article.loc['000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384', 'article_id']

In [None]:
def calculate_avg_jaccard_similarity(
    candidates_df: pd.DataFrame,
    articles_attributes: pd.DataFrame,
    df_cust_article: pd.DataFrame):
    """Average Jaccard similarity between a candidate article and a customer's purchases.

    Args:
        candidates_df: a single candidate row exposing ``article_id`` and
            ``customer_id`` as attributes (e.g. the row handed over by
            ``DataFrame.apply(axis=1)`` or a tuple from ``itertuples()``).
        articles_attributes: frame indexed by article id with a
            ``set_of_attributes`` column.
        df_cust_article: frame indexed by customer id whose first column holds
            the list of article ids bought by that customer.

    Returns:
        The mean of ``jaccard(candidate attributes, bought item attributes)``
        over the customer's bought items, or 0.0 when the customer is not in
        ``df_cust_article`` or has an empty purchase list.
    """
    candidate_item, candidate_user = candidates_df.article_id, candidates_df.customer_id
    candidate_item_attributes = articles_attributes.loc[candidate_item, 'set_of_attributes']
    # Only the customer lookup is guarded: an unknown customer is an expected case.
    try:
        customer_row = df_cust_article.loc[candidate_user]
    except KeyError:
        return 0.0
    # Positional access must be explicit. `customer_row[0]` depends on the deprecated
    # integer fallback of Series.__getitem__; once that is removed it raises KeyError,
    # which the handler above would have turned into a silent 0 for every row.
    bought_items = customer_row.iloc[0]
    if len(bought_items) == 0:
        return 0.0
    jaccard_similarity_list = [
        jaccard(candidate_item_attributes, articles_attributes.loc[item, 'set_of_attributes'])
        for item in bought_items
    ]
    return sum(jaccard_similarity_list) / len(jaccard_similarity_list)

In [None]:
candidates.head()

In [None]:
# Fixed seed so the same rows are drawn on every run and the timings stay comparable.
# The size is capped so the cell also works when fewer than 100_000 candidates exist.
sample_candidates = candidates.sample(min(100_000, len(candidates)), random_state=42)

In [None]:
sample_candidates.shape

### apply

In [None]:
%%time
sample_candidates.apply(lambda x: calculate_avg_jaccard_similarity(x, jaccard_articles, df_cust_article), axis=1)

In [None]:
9*candidates.shape[0]/sample_candidates.shape[0]/60

In [None]:
(9*60)/6.59

after

In [None]:
6.59*candidates.shape[0]/sample_candidates.shape[0]/60/60

### itertuples

In [None]:
%%time
pd.Series(
    calculate_avg_jaccard_similarity(row, jaccard_articles, df_cust_article)
    for row in sample_candidates.itertuples()
)

In [None]:
9*candidates.shape[0]/sample_candidates.shape[0]/60

### maybe faster jaccard_similarity

In [None]:
def jaccard(x, y):
    """Return the Jaccard similarity between two lists.

    Both arguments are converted to sets first, so duplicates are ignored and
    any iterable of hashable items (including a set) is accepted.

    Args:
        x: first collection of items.
        y: second collection of items.

    Returns:
        Size of the intersection divided by size of the union, as a float.
        When both inputs are empty the ratio is undefined; 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    # Convert once and reuse for both the intersection and the union.
    set_x, set_y = set(x), set(y)
    intersection_cardinality = len(set_x & set_y)
    union_cardinality = len(set_x | set_y)
    try:
        return intersection_cardinality / float(union_cardinality)
    except ZeroDivisionError:
        return 0.0

In [None]:
def jaccard2(x, y):
    """Return the Jaccard similarity between two sets.

    Unlike a list-based variant, no conversion is done here: ``x`` must
    provide ``intersection`` and ``union`` (i.e. be a set or frozenset).
    The divisor is converted with ``float()`` before dividing.

    Returns:
        Size of the intersection divided by size of the union, or 0.0 when
        both sets are empty (instead of raising ZeroDivisionError).
    """
    intersection_cardinality = len(x.intersection(y))
    union_cardinality = len(x.union(y))
    # try/except keeps the common path free of an extra branch, which matters
    # because this function is being micro-benchmarked.
    try:
        return intersection_cardinality / float(union_cardinality)
    except ZeroDivisionError:
        return 0.0

In [None]:
def jaccard3(x, y):
    """Return the Jaccard similarity between two sets.

    ``x`` must provide ``intersection`` and ``union`` (i.e. be a set or
    frozenset). Here ``float()`` is applied to the finished quotient rather
    than to the divisor.

    Returns:
        Size of the intersection divided by size of the union, or 0.0 when
        both sets are empty (instead of raising ZeroDivisionError).
    """
    intersection_cardinality = len(x.intersection(y))
    union_cardinality = len(x.union(y))
    # try/except keeps the common path free of an extra branch, which matters
    # because this function is being micro-benchmarked.
    try:
        return float(intersection_cardinality / union_cardinality)
    except ZeroDivisionError:
        return 0.0

In [None]:
# The sampled Series is indexed by article_id, so the single row has to be taken by
# position with .iloc[0]; a bare `[0]` is resolved against the index labels (or via the
# deprecated positional fallback). The seed keeps the benchmark input stable.
a = jaccard_articles.sample(1, random_state=0).set_of_attributes.iloc[0]

In [None]:
# The sampled Series is indexed by article_id, so the single row has to be taken by
# position with .iloc[0]; a bare `[0]` is resolved against the index labels (or via the
# deprecated positional fallback). The seed keeps the benchmark input stable.
b = jaccard_articles.sample(1, random_state=1).set_of_attributes.iloc[0]

In [None]:
%timeit -n 5000 jaccard(a, b)

In [None]:
%timeit -n 5000 jaccard2(a, b)

In [None]:
%timeit -n 5000 jaccard3(a, b)

### image/text embeddings

In [None]:
image_embeddings = context.catalog.load('image_embeddings')

In [None]:
image_embeddings

In [None]:
# Rebuild the per-customer purchase lists: index customer_id, one 'article_id' column
# holding the list of distinct articles for that customer.
df_cust_article = (
    transactions[['customer_id', 'article_id']]
    .drop_duplicates()
    .groupby(['customer_id'])['article_id']
    .apply(list)
    .to_frame()
)

In [None]:
df_cust_article.head()

In [None]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    """Mean embedding of the articles listed in the first row of ``df``.

    Args:
        df: frame with an ``article_id`` column whose first row holds a
            list of article ids.
        embeddings: frame indexed by article id.

    Returns:
        A list with the column-wise mean of the ``embeddings`` rows whose
        index appears in that list.
    """
    # Take the first row by position. `df.article_id[0]` is a label lookup that only
    # works through the deprecated integer fallback when the index holds customer ids,
    # and fails outright on an integer index that has no label 0.
    list_of_articles = df.article_id.iloc[0]
    mean_embeddings = list(embeddings[embeddings.index.isin(list_of_articles)].mean(axis=0))
    return mean_embeddings

In [None]:
%%time
cust_embeddings = df_cust_article.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, image_embeddings)).reset_index(name='embeddings')

In [None]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    """Mean embedding over the distinct articles found in ``df.article_id``.

    Args:
        df: frame with one article id per row in an ``article_id`` column.
        embeddings: frame indexed by article id.

    Returns:
        A list with the column-wise mean of the ``embeddings`` rows whose
        index appears among those article ids.
    """
    customer_articles = list(df['article_id'].unique())
    matching_rows = embeddings.loc[embeddings.index.isin(customer_articles)]
    return list(matching_rows.mean(axis=0))

In [None]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, image_embeddings)).reset_index(name='embeddings')

In [None]:
cust_embeddings.shape

In [None]:
image_embeddings.shape

### text

In [None]:
text_embeddings = context.catalog.load('text_embeddings')

In [None]:
text_embeddings.shape

In [None]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    """Mean embedding of the articles listed in the first row of ``df``.

    Args:
        df: frame with an ``article_id`` column whose first row holds a
            list of article ids.
        embeddings: frame indexed by article id.

    Returns:
        A list with the column-wise mean of the ``embeddings`` rows whose
        index appears in that list.
    """
    # Read the first row by position: `df.article_id[0]` is a label lookup and only
    # succeeds on a non-integer index through a deprecated positional fallback.
    list_of_articles = df.article_id.iloc[0]
    mean_embeddings = list(embeddings[embeddings.index.isin(list_of_articles)].mean(axis=0))
    return mean_embeddings

In [None]:
%%time
cust_embeddings = df_cust_article.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

In [None]:
def calculate_customer_embedding(df: pd.DataFrame, embeddings: pd.DataFrame):
    """Average the embeddings of every distinct article id in ``df``.

    Args:
        df: frame carrying one article id per row in its ``article_id`` column.
        embeddings: frame indexed by article id.

    Returns:
        The per-column mean, as a list, of the ``embeddings`` rows whose
        index is one of those article ids.
    """
    distinct_articles = list(df['article_id'].unique())
    is_customer_article = embeddings.index.isin(distinct_articles)
    selected = embeddings.loc[is_customer_article]
    return list(selected.mean(axis=0))

In [None]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

In [None]:
%%time
transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

To split the embeddings from a list into individual columns:

df3 = df2.teams.apply(pd.Series) <br>
df3.columns = ['team1', 'team2']

In [None]:
np.array(cust_embeddings.set_index('customer_id').loc['000531ecd2754a0cf214373cc7b6dbcf7ce9a77c3284a2facbe072f763618bbc'].embeddings)

In [None]:
image_embeddings.loc['0878510001'].values

In [None]:
def cosine_similarity(A, B):
    """Cosine similarity of two 1-D vectors: dot(A, B) / (|A| * |B|).

    Args:
        A: first vector (array-like of numbers).
        B: second vector, same length as ``A``.

    Returns:
        The cosine of the angle between the vectors. When either vector has
        zero norm the ratio is undefined; 0.0 is returned instead of the NaN
        (plus RuntimeWarning) the plain division would produce.
    """
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    if norm_product == 0:
        return 0.0
    return np.dot(A, B) / norm_product

In [None]:
def cosine_embedding_similarity(candidates_df, customers_embeddings, items_embeddings):
    """Cosine similarity between a customer's embedding and a candidate article's embedding.

    Args:
        candidates_df: a single candidate row exposing ``article_id`` and
            ``customer_id`` as attributes (e.g. the row handed over by
            ``DataFrame.apply(axis=1)``).
        customers_embeddings: frame with an ``embeddings`` column holding one
            vector per customer, either carrying a ``customer_id`` column or
            already indexed by customer id.
        items_embeddings: frame indexed by article id, one row per article vector.

    Returns:
        ``cosine_similarity(customer vector, article vector)``, or 0 when the
        customer or the article has no embedding.
    """
    candidate_item, candidate_user = candidates_df.article_id, candidates_df.customer_id
    # Use the frame that was passed in. Previously the global `cust_embeddings` was read
    # here and this argument was ignored, so callers could not choose the embeddings.
    if 'customer_id' in customers_embeddings.columns:
        customers_embeddings = customers_embeddings.set_index('customer_id')
    try:
        customer_embeddings = np.array(customers_embeddings.loc[candidate_user].embeddings)
    except KeyError:
        return 0
    try:
        candidate_embeddings = items_embeddings.loc[candidate_item].values
    except KeyError:
        return 0
    return cosine_similarity(customer_embeddings, candidate_embeddings)

In [None]:
# Small hand-made candidate frame for trying out the similarity function:
# 5 sampled customers repeated 3 times each, paired with 15 sampled articles.
# Seeds are fixed so the same pairs come back on every run.
candidates_df = pd.DataFrame({
    'customer_id': sorted(customers['customer_id'].sample(5, random_state=42).to_list() * 3),
    'article_id': articles['article_id'].sample(15, random_state=42).to_list(),
})

In [None]:
candidates_df.head()

In [None]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, image_embeddings), axis=1)

text similarities

In [None]:
%%time
cust_embeddings = transactions.groupby(['customer_id']).apply(lambda x: calculate_customer_embedding(x, text_embeddings)).reset_index(name='embeddings')

In [None]:
candidates_df.apply(lambda x: cosine_embedding_similarity(x, cust_embeddings, text_embeddings), axis=1)

all similarities

collaborative filtering still AWAITS :<<<

candidates -> STRATEGY FEATURE, MAYBE SCORE?