In [None]:
%load_ext kedro.extras.extensions.ipython

In [None]:
%reload_kedro

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render every float with exactly four decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Show what the Kedro data catalog exposes, to find the dataset names loaded in the next cells.
catalog.list()

In [None]:
# Load the `transactions_sample` dataset from the Kedro catalog.
transactions = context.catalog.load('transactions_sample')

In [None]:
# Load the `customers_sample` dataset from the Kedro catalog.
customers = context.catalog.load('customers_sample')

In [None]:
# Load the `articles_sample` dataset from the Kedro catalog.
articles = context.catalog.load('articles_sample')

In [None]:
# Load the `candidates` dataset from the Kedro catalog.
candidates = context.catalog.load('candidates')

In [None]:
# (rows, columns) of the candidates table.
candidates.shape

In [None]:
# Candidate rows that have a missing value in at least one column.
candidates.loc[candidates.isna().any(axis=1)]

In [None]:
# All transactions of one hand-picked customer.
transactions.loc[
    transactions['customer_id'].isin(['ff18c106287931d170618b59fd402ed0ecc292869aad5f'])
]

In [None]:
# Transactions made by customers whose candidate rows contain missing values.
transactions.loc[
    transactions['customer_id'].isin(
        candidates.loc[candidates.isna().any(axis=1), 'customer_id'].tolist()
    )
]

In [None]:
# Size of the transactions table once customers with incomplete candidate rows are excluded.
transactions.loc[
    ~transactions['customer_id'].isin(
        candidates.loc[candidates.isna().any(axis=1), 'customer_id'].tolist()
    )
].shape

In [None]:
# (rows, columns) of the three input tables, in the order transactions, customers, articles.
tuple(frame.shape for frame in (transactions, customers, articles))

In [None]:
# Scratch data: maps a string key to a set of strings; used by the experiments in the following cells.
my_dict = {'01': {'0101', '02020', '03030'}, '02': {'02032', '32131', '321321', '32131321'}}

In [None]:
# Display the scratch mapping.
my_dict

In [None]:
# One row per key of the scratch mapping, stored in column 'blasda'.
my_df = pd.DataFrame(list(my_dict), columns=['blasda'])

In [None]:
# Display the frame holding only the keys.
my_df

In [None]:
# Look each key up in the scratch mapping and store the matching set next to it.
my_df['blabla'] = my_df['blasda'].map(my_dict)

In [None]:
# Display the frame again, now with the mapped 'blabla' column.
my_df

In [None]:
# Alternative layout: one row per key of the scratch mapping.
# NOTE(review): the values are sets, whose iteration order is not guaranteed, so the
# column order of the result is presumably not stable between runs — confirm before relying on it.
pd.DataFrame.from_dict(my_dict, orient='index')

### Popularity items

- recent popular items (1, 3, 7, 30, 90 days), most popular products (overall)
- products purchased previously (all)
- popular items by SEGMENTS (age, gender) (1, 3, 7, 30, 90 days)
- items which have the same `prod_name`
<br><br>
- IMAGE/TEXT EMBEDDINGS
- BASKET MARKET ANALYSIS
<br><br>
- just choose the `N` that maximizes recall for EACH METHOD

In [None]:
# Parse the transaction date column into pandas datetimes.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [None]:
# First and last transaction date in the sample.
# `Series.describe(datetime_is_numeric=True)` raises TypeError on pandas >= 2.0 (the keyword
# was removed), so take the two aggregates directly; this works on older versions as well.
transactions['t_dat'].agg(['min', 'max'])

In [None]:
# Peek at the first transaction rows.
transactions.head()

## global articles

In [None]:
# Number of distinct customers that bought each article, widest reach first.
articles_bought_by_most_customers = (
    transactions
    .groupby('article_id', as_index=False)
    .agg(cd_customer_id=('customer_id', 'nunique'))
    .sort_values('cd_customer_id', ascending=False)
)

In [None]:
# Ids of the 200 articles bought by the largest number of distinct customers.
articles_bought_by_most_customers['article_id'].head(200).tolist()

In [None]:
# Number of transaction rows per article, best sellers first.
articles_sold_most = (
    transactions
    .groupby('article_id')
    .size()
    .reset_index(name='sales_sum_count')
    .sort_values('sales_sum_count', ascending=False)
)

In [None]:
# Peek at the best-selling articles.
articles_sold_most.head()

### trending articles (nope)

Trending articles seem too hand-engineered; you can just take the most popular items from the last 7 days instead. I suspect the results will be comparable.

In [None]:
# Work on an independent copy of the date column so `transactions` itself is left untouched.
tmp = transactions.loc[:, ['t_dat']].copy()

In [None]:
# Day of the week of each transaction (Monday = 0 ... Sunday = 6).
tmp['day_of_week'] = tmp['t_dat'].dt.weekday

In [None]:
# Move every date to the Tuesday of its own Monday-based week (day_of_week == 1).
# `pd.to_timedelta` replaces `pd.TimedeltaIndex(..., unit='D')`, whose `unit` keyword is deprecated.
tmp['ldbw'] = tmp['t_dat'] - pd.to_timedelta(tmp['day_of_week'] - 1, unit='D')

In [None]:
# Final `ldbw`: the transaction date itself on a Tuesday, otherwise the next Tuesday.
# (1 - day_of_week) % 7 is the number of days until that Tuesday:
#   Mon -> 1, Tue -> 0, Wed -> 6, Thu -> 5, Fri -> 4, Sat -> 3, Sun -> 2.
# This yields the same values as adding 7 days to the Wed..Sun rows of the previous cell's
# result, but it is computed from `t_dat`, so re-running the cell cannot shift dates again.
tmp['ldbw'] = tmp['t_dat'] + pd.to_timedelta((1 - tmp['day_of_week']) % 7, unit='D')

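ldbw - the transaction date itself when it falls on a Tuesday (`dayofweek == 1`, Monday being 0), otherwise the next Tuesday. (This note originally said Thursday, but the date arithmetic above lands on Tuesday.)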

## segment articles

In [None]:
# Reminder of the transaction columns before joining with customers.
transactions.head()

In [None]:
# Peek at the customer attributes available for segmentation.
customers.head()

In [None]:
# Split customers into five equal-frequency age segments labelled '00' .. '04'.
# `qcut(..., labels=False)` returns floats as soon as `age` contains a missing value, which
# would turn the labels into '0.0', '1.0', ... and 'nan'. Going through the nullable 'Int64'
# dtype keeps the labels zero-padded integers; customers without an age get the label '<NA>'.
customers['age_bin'] = (
    pd.qcut(customers['age'], q=5, labels=False)
    .astype('Int64')
    .astype(str)
    .str.zfill(2)
)

In [None]:
# NOTE(review): `cols` is not referenced by any other visible cell — looks like a leftover; confirm before removing.
cols = ['age']

In [None]:
# Youngest and oldest age that fell into each age segment.
customers.groupby('age_bin')['age'].agg(min='min', max='max')

- first question -> what about age_bin (WHERE to do it)
- second -> filling NA -> it's done in SAMPLING, but nowhere else

In [None]:
# Number of transaction rows per (age segment, article), best sellers first.
# Only `customer_id` and `age_bin` are taken from `customers`, so the join does not copy
# every other customer column onto each transaction row.
articles_sold_most_by_segment = (
    transactions
    .merge(customers[['customer_id', 'age_bin']], on='customer_id')
    .groupby(['age_bin', 'article_id'])['customer_id']
    .size()
    .reset_index(name='most_sold_by_segment')
    .sort_values(by='most_sold_by_segment', ascending=False)
)

In [None]:
# Best sellers of the second age segment.
# Segment labels are zero-padded to two characters ('00' .. '04'), so "1" never matched any row.
articles_sold_most_by_segment.query('age_bin == "01"')

In [None]:
# Best sellers of the last (fifth) age segment.
# Five quantile bins are numbered 0..4 and zero-padded, so the label "5" does not exist;
# the fifth segment is '04'.
articles_sold_most_by_segment.query('age_bin == "04"')

check how much they intersect, maybe there is no point in doing for different days?

## Previously bought

### previously bought the same `prod_name`

In [None]:
# Distinct (customer, product name) pairs: every `prod_name` a customer has bought at least once.
customers_prod_name = (
    transactions
    .merge(articles.loc[:, ['article_id', 'prod_name']], how='inner', on='article_id')
    .loc[:, ['customer_id', 'prod_name']]
    .drop_duplicates()
    .sort_values('customer_id')
)

In [None]:
# Number of distinct (customer, product name) pairs.
customers_prod_name.shape

In [None]:
# Expand each (customer, product name) pair to every article sharing that product name.
prev_bought_prod_name = (
    customers_prod_name
    .merge(articles.loc[:, ['article_id', 'prod_name']], how='inner', on='prod_name')
    .drop(columns='prod_name')
)

In [None]:
# Number of (customer, article) candidate rows produced by the product-name expansion.
prev_bought_prod_name.shape

### previously bought

In [None]:
# Distinct (customer, article) pairs that occur in the transactions.
prev_bought = transactions.loc[:, ['customer_id', 'article_id']].drop_duplicates(keep='first')

In [None]:
# One row per customer with the list of distinct articles they bought.
(
    prev_bought
    .groupby('customer_id')['article_id']
    .agg(list)
    .reset_index(name='previously_bought')
)

### embeddings search - faiss (the cells below use sklearn's `KDTree`)

In [None]:
# Load the `text_embeddings` dataset from the Kedro catalog.
emb = context.catalog.load('text_embeddings')

In [None]:
# Display the embeddings table (later cells look rows up by article id via `emb.loc[...]`).
emb

In [None]:
from sklearn.neighbors import KDTree

In [None]:
# Index the embedding rows in a KD-tree for nearest-neighbour lookups.
tree = KDTree(emb.to_numpy(), leaf_size=5)

In [None]:
def find_similar_images(query_article_id, embeddings, tree, k=5):
    """Return the index labels of the ``k`` embeddings nearest to one article.

    Args:
        query_article_id: Index label of the query row in ``embeddings``.
        embeddings: DataFrame with one embedding per row. The positions
            returned by ``tree`` are looked up in this frame, so ``tree`` is
            assumed to have been built from ``embeddings.values`` in the same
            row order (TODO confirm at every call site).
        tree: Object exposing ``query(X, k=...)`` that returns a
            ``(distances, positions)`` pair, e.g. ``sklearn.neighbors.KDTree``.
        k: Number of neighbours to return. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        list: Index labels of the ``k`` nearest rows, in the order returned by
        ``tree.query``. When the tree was built from the same rows, the query
        article itself is normally part of the result.

    Raises:
        KeyError: If ``query_article_id`` is not in ``embeddings.index``.
    """
    # tree.query expects a 2-D array: one row per query point.
    query_vector = embeddings.loc[query_article_id].values.reshape(1, -1)
    _, neighbour_positions = tree.query(query_vector, k=k)
    # Row 0 holds the neighbours of the single query point.
    return embeddings.index[neighbour_positions[0]].tolist()

In [None]:
# Example lookup: nearest neighbours of one hand-picked article id.
find_similar_images('0108775015', emb, tree)

In [None]:
# Inspect the raw embedding rows of five hand-picked article ids.
emb.loc[['0108775051', '0108775044', '0108775015', '0623522001', '0623522010']]

In [None]:
# Reload `transactions_sample` from the catalog. This rebinds `transactions`, so the
# `t_dat` datetime conversion applied to the previous object earlier in the notebook is not carried over.
transactions = context.catalog.load('transactions_sample')

In [None]:
# Order transactions newest-first. Rebinding the name instead of using `inplace=True`
# avoids mutating the loaded frame in place, which pandas discourages.
transactions = transactions.sort_values(by='t_dat', ascending=False)

In [None]:
# First 100 rows of the (sorted) transactions, used as a small test batch.
sample = transactions.iloc[:100]

In [None]:
# Nearest-neighbour article ids for every article in the test batch (one list per row).
sample['article_id'].apply(
    lambda article_id: find_similar_images(article_id, embeddings=emb, tree=tree)
)

In [None]:
# Display the sorted transactions (pandas truncates the rendering of large frames).
transactions

In [None]:
# First five rows of each customer in the current row order
# (newest-first if the sort cell above has been run).
transactions.groupby(['customer_id']).head(5)

In [None]:
# Distribution of purchase counts: for every number of transaction rows, how many distinct
# customers have exactly that many. The result is copied to the system clipboard.
(
    transactions
    .groupby(['customer_id'])['article_id']
    .count()                                  # rows per customer
    .reset_index()
    .groupby(['article_id'])['customer_id']   # 'article_id' now holds the per-customer count
    .nunique()
    .to_clipboard()
)

correct sum

In [None]:
# Reload the `candidates` dataset from the Kedro catalog (rebinds the name used earlier in the notebook).
candidates = context.catalog.load('candidates')

In [None]:
# (rows, columns) of the reloaded candidates table.
candidates.shape

In [None]:
# Number of distinct customers present in the candidates table.
candidates['customer_id'].nunique()

In [None]:
# Display the candidates table.
candidates

In [None]:
# Scratch list used below to try out ways of giving every row the same list value.
my_list = ['321312', '123', '3213']

In [None]:
# One reference to the same scratch list per candidates row.
[my_list for _ in candidates.index]

In [None]:
# `my_list * 8` concatenates the list eight times, so this is a flat Series of
# 8 * len(my_list) strings — not a Series of eight lists.
pd.Series(my_list * 8)

In [None]:
# Give every customer row its own copy of the same article list.
# The previous form, `pd.Series(list(articles) * len(all_customers))`, concatenated the list
# into one long flat Series; index alignment then handed each row a single element instead
# of the whole list.
# NOTE(review): `all_customers` is not defined in any visible cell, and `articles` is loaded
# above from the catalog (`list()` of a DataFrame yields its column names). This line
# presumably comes from a pipeline node where both are different objects — confirm before running.
all_customers['global_articles'] = pd.Series(
    [list(articles) for _ in range(len(all_customers))],
    index=all_customers.index,
    dtype=object,
)