In [None]:
import pandas as pd
import xgboost as xgb
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
from tensorflow.keras.models import load_model
from sklearn.neighbors import NearestNeighbors

import gc                       
gc.enable()

%pylab inline

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the index -> product-id vocabulary from a pickle on the mounted Drive.
# NOTE(review): this same path is only written near the end of the vocabulary
# section of this notebook, so on a clean Drive this cell presumably fails
# until that later cell has been run once -- confirm the intended run order.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/id_to_token.pkl', 'rb') as f:
  id_to_token = pkl.load(f)

In [None]:
len(id_to_token)

24818

## Выделяем заказы для валидации и теста

In [None]:
# Load the raw Instacart tables from the mounted Drive folder.
order_products_prior = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/order_products__prior.csv')
order_products_train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/order_products__train.csv')
orders = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/orders.csv')
# Replace missing gaps with 0 (presumably each user's first order -- confirm).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, triggers a
# chained-assignment warning in recent pandas and leaves `orders` unchanged
# once Copy-on-Write is enabled.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(0)

In [None]:
# Hold out each user's four most recent orders: the last two go to the test
# split, the two before them to the validation split.
# `orders_back` is the distance from the user's latest order (0 = latest). It is
# computed for all users at once instead of filtering the user's frame four
# times inside a Python loop over every user.
recency = orders[['user_id', 'order_id', 'order_number']].copy()
recency['orders_back'] = (
    recency.groupby('user_id')['order_number'].transform('max') - recency['order_number']
)

# Sorting by user and recency keeps the user-by-user, newest-first order in
# which the per-user loop used to append the ids.
recency = recency.sort_values(['user_id', 'orders_back'])
test_orders = recency.loc[recency['orders_back'].isin([0, 1]), 'order_id'].tolist()
val_orders = recency.loc[recency['orders_back'].isin([2, 3]), 'order_id'].tolist()

# The loop version raised IndexError when a user lacked one of the four orders;
# keep that guarantee explicit so an incomplete history cannot pass silently.
n_users = orders['user_id'].nunique()
assert len(test_orders) == 2 * n_users and len(val_orders) == 2 * n_users, \
    'every user must have four consecutive most-recent orders'

del recency

100%|██████████| 206209/206209 [10:51<00:00, 316.69it/s]


In [None]:
np.intersect1d(test_orders, val_orders)

array([], dtype=int64)

In [None]:
# Persist the validation order ids so later sections can reload them
# without repeating the split.
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/val_orders.pkl', 'wb') as f:
  pkl.dump(val_orders, f)

# Same for the test order ids.
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/test_orders.pkl', 'wb') as f:
  pkl.dump(test_orders, f)

## Создаем последовательности продуктов в заказе

In [None]:
# Reload the validation order ids from the pickle on Drive.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/val_orders.pkl', 'rb') as f:
  val_orders = pkl.load(f)

# Reload the test order ids the same way.
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/test_orders.pkl', 'rb') as f:
  test_orders = pkl.load(f)

In [None]:
order_products = pd.concat((order_products_prior, order_products_train))

In [None]:
op = orders.merge(order_products, on='order_id', how='inner')

In [None]:
# Rows that belong to an order held out for validation or test.
held_out_rows = op['order_id'].isin(val_orders) | op['order_id'].isin(test_orders)
# Users with at least one remaining (training) row, capped at 10000.
# A set has no defined order, so which users survive the cut is arbitrary.
train_user_ids = set(op.loc[~held_out_rows, 'user_id'])
users_set = list(train_user_ids)[:10000]

In [None]:
op = op[op['user_id'].isin(users_set)].reset_index(drop=True)

In [None]:
# Map every order id to the array of product ids found in its rows,
# in the order the rows appear in `op`.
orders_seq = {
  order_id: order_rows['product_id'].values
  for order_id, order_rows in tqdm(op.groupby('order_id'), position=0)
}

100%|██████████| 126389/126389 [00:33<00:00, 3755.59it/s]


In [None]:
# Attach each order's product sequence to its row in `orders`; the inner join
# keeps only orders that have a sequence.
order_features = orders.merge(pd.DataFrame(orders_seq.items(), columns=['order_id', 'prod_seq']), how='inner', on='order_id')
# Convert with ndarray.tolist() so the elements are plain Python ints.
# list(array) keeps NumPy scalars, which NumPy >= 2 prints as `np.int64(196)`;
# this column is later written with to_csv and parsed back from its string
# form, so the sequences must print as `[196, 14084, ...]`.
order_features['prod_seq'] = order_features['prod_seq'].apply(lambda x: np.asarray(x).tolist())

In [None]:
# For every order, look up the product sequence of the same user's order
# numbered exactly one lower (order_number - 1).
orders_prev_seq = {}
for user_id, group_u in tqdm(order_features.groupby('user_id'), position=0):
  # One dict per user replaces a scan of the user's frame for every order.
  seq_by_number = dict(zip(group_u['order_number'], group_u['prod_seq']))
  for order_id, order_number in zip(group_u['order_id'], group_u['order_number']):
    # Orders without a predecessor in order_features get an empty sequence.
    # The explicit default replaces a bare `except:` that was used for this
    # case and would also have hidden any unrelated error.
    orders_prev_seq[order_id] = seq_by_number.get(order_number - 1, [])

100%|██████████| 6605/6605 [01:40<00:00, 65.91it/s]


In [None]:
order_features = order_features.merge(pd.DataFrame(orders_prev_seq.items(), columns=['order_id', 'prev_seq']), how='inner', on='order_id')

## Собираем словарь и выделяем заказы соответственно продуктам в словаре

In [None]:
# Collect every product id that occurs in a training order, i.e. an order that
# is in neither the validation nor the test hold-out.
held_out = order_features['order_id'].isin(val_orders) | order_features['order_id'].isin(test_orders)
prods = []
for seq in tqdm(order_features[~held_out]['prod_seq'], position=0):
  prods.extend(seq)

# Vocabulary: consecutive index -> product id, and the inverse mapping.
id_to_token = dict(enumerate(set(prods)))
token_to_id = {token: i_d for i_d, token in id_to_token.items()}

100%|██████████| 102474/102474 [00:00<00:00, 600323.63it/s]


In [None]:
# A user is "bad" when any of their orders contains a product that is missing
# from the vocabulary.
bad_users = set()
for user_id, seq in tqdm(order_features[['user_id','prod_seq']].values, position=0):
  if any(el not in token_to_id for el in seq):
    bad_users.add(user_id)

100%|██████████| 126389/126389 [00:00<00:00, 294165.50it/s]


In [None]:
len(bad_users)

0

In [None]:
order_features = order_features[~order_features['user_id'].isin(bad_users)]

In [None]:
users_set = list(set(order_features.user_id))

In [None]:
order_features.to_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/order_features.csv', index=False)

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/users_set.pkl', 'wb') as f:
  pkl.dump(users_set, f)

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/id_to_token.pkl', 'wb') as f:
  pkl.dump(id_to_token, f)

with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/token_to_id.pkl', 'wb') as f:
  pkl.dump(token_to_id, f)

## Создаем фичи для покупателя

In [None]:
train_orders = order_features[(~order_features['order_id'].isin(val_orders)) & (~order_features['order_id'].isin(test_orders))]['order_id']

In [None]:
op = orders.merge(order_products, on='order_id', how='inner')
op = op[op['order_id'].isin(train_orders)]

In [None]:
op.shape

(978754, 10)

Полное число заказов

In [None]:
# Highest order_number per user among the training rows.
user = (
    op.groupby('user_id')['order_number']
      .max()
      .rename('u_total_orders')
      .reset_index()
)
user.head()

Unnamed: 0,user_id,u_total_orders
0,1,7
1,2,11
2,3,9
3,4,2
4,5,1


Доля перезаказанных продуктов

In [None]:
# Mean of the `reordered` flag over all of a user's training rows.
reordered_by_user = op.groupby('user_id')['reordered']
u_reorder = reordered_by_user.mean().rename('u_reordered_ratio').reset_index()
u_reorder.head()

Unnamed: 0,user_id,u_reordered_ratio
0,1,0.657895
1,2,0.476821
2,3,0.549296
3,4,0.0
4,5,0.0


Мержим фичи и чистим кэш

In [None]:
# Add the reorder ratio to the per-user table, then release the temporary frame.
user = pd.merge(user, u_reorder, how='left', on='user_id')

del u_reorder
gc.collect()

user.head()

Unnamed: 0,user_id,u_total_orders,u_reordered_ratio
0,1,7,0.657895
1,2,11,0.476821
2,3,9,0.549296
3,4,2,0.0
4,5,1,0.0


## Создаем фичи для продуктов

Количество заказов каждого продукта

In [None]:
# Number of training rows (purchases) per product.
purchases_per_product = op.groupby('product_id')['order_id'].count()
prd = purchases_per_product.rename('p_total_purchases').reset_index()
prd.head()

Unnamed: 0,product_id,p_total_purchases
0,1,58
1,2,1
2,3,13
3,4,3
4,7,1


Доля перезаказов для каждого продукта

In [None]:
# Mean of the `reordered` flag over all training rows of a product.
p_reorder = (
    op.groupby('product_id')['reordered']
      .mean()
      .rename('p_reorder_ratio')
      .reset_index()
)
p_reorder.head()

Unnamed: 0,product_id,p_reorder_ratio
0,1,0.482759
1,2,0.0
2,3,0.769231
3,4,0.0
4,7,0.0


Мержим фичи и чистим кэш

In [None]:
# Add the reorder ratio to the per-product table; a missing ratio becomes 0.
prd = pd.merge(prd, p_reorder, how='left', on='product_id')
prd['p_reorder_ratio'] = prd['p_reorder_ratio'].fillna(0)

# Release the temporary frame.
del p_reorder
gc.collect()

prd.head()

Unnamed: 0,product_id,p_total_purchases,p_reorder_ratio
0,1,58,0.482759
1,2,1,0.0
2,3,13,0.769231
3,4,3,0.0
4,7,1,0.0


Сколько раз конкретный покупатель купил данный продукт

In [None]:
# Number of training rows per (user, product) pair.
pair_counts = op.groupby(['user_id', 'product_id'])['order_id'].count()
uxp = pair_counts.rename('uxp_total_bought').reset_index()
uxp.head()

Unnamed: 0,user_id,product_id,uxp_total_bought
0,1,196,7
1,1,10258,6
2,1,10326,1
3,1,12427,7
4,1,13032,2


Как часто покупатель покупал данный продукт после первой покупки этого продукта

In [None]:
#сколько раз (в скольких заказах) покупатель купил продукт после первой покупки этого продукта
times = op.groupby(['user_id', 'product_id'])[['order_id']].count()
times.columns = ['Times_Bought_N']

#сколько всего заказов сделал покупатель
total_orders = op.groupby('user_id')['order_number'].max().to_frame('total_orders')

#на каком по счету заказе покупатель купил данный продукт
first_order_no = op.groupby(['user_id', 'product_id'])['order_number'].min().to_frame('first_order_number')
first_order_no  = first_order_no.reset_index()

#мержим количество всех заказов и номер первого заказа данного продукта
span = pd.merge(total_orders, first_order_no, on='user_id', how='right')

#считаем количество заказов которые покупатель совершил после покупки данного продукта + сам заказ первой
#покупки этого продукта
span['Order_Range_D'] = span.total_orders - span.first_order_number + 1

#мержим количество покупок данного продукта и количество заказов после покупки этого продукта
uxp_ratio = pd.merge(times, span, on=['user_id', 'product_id'], how='left')

#считаем нашу метрику
uxp_ratio['uxp_reorder_ratio'] = uxp_ratio.Times_Bought_N / uxp_ratio.Order_Range_D

#дропаем ненужные колонки
uxp_ratio = uxp_ratio.drop(['Times_Bought_N', 'total_orders', 'first_order_number', 'Order_Range_D'], axis=1)

#чистим кэш
del [times, first_order_no, span]
#мержим с нашим фичами новую фичу
uxp = uxp.merge(uxp_ratio, on=['user_id', 'product_id'], how='left')

del uxp_ratio
uxp.head()

Unnamed: 0,user_id,product_id,uxp_total_bought,uxp_reorder_ratio
0,1,196,7,1.0
1,1,10258,6,1.0
2,1,10326,1,0.333333
3,1,12427,7,1.0
4,1,13032,2,0.333333


Как часто покупатель покупал данный продукт за свои последние N заказов

In [None]:
N = 5

In [None]:
#считаем сколько заказов осталось от данного до последнего
op['order_number_back'] = op.groupby('user_id')['order_number'].transform(max) - op.order_number +1 

#отбираем последние N заказов
opN = op[op.order_number_back <= N]

#сколько раз покупатель покупал данный продукт за свои последние N заказов
last_N = opN.groupby(['user_id','product_id'])[['order_id']].count()
last_N.columns = ['times_lastN']

#мержим фичи и чистим кэш
uxp = uxp.merge(last_N, on=['user_id', 'product_id'], how='left')

del [opN , last_N]

#заполняем NAN если есть нулями
uxp = uxp.fillna(0)
uxp.head()

Unnamed: 0,user_id,product_id,uxp_total_bought,uxp_reorder_ratio,times_lastN
0,1,196,7,1.0,5.0
1,1,10258,6,1.0,5.0
2,1,10326,1,0.333333,1.0
3,1,12427,7,1.0,5.0
4,1,13032,2,0.333333,1.0


## Создаем фичи для заказов

In [None]:
op1 = orders.merge(order_products, on='order_id', how='inner')
op1 = op1[op1['order_id'].isin(order_features.order_id)].reset_index(drop=True)

In [None]:
ordr = op1[['order_id', 'user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']]

## Мержим все фичи

In [None]:
# Start from the (user, product) features and attach the per-user and then the
# per-product features.
data = (
    uxp.merge(user, on='user_id', how='left')
       .merge(prd, on='product_id', how='left')
)

# Release the source tables.
del user, prd, uxp
gc.collect()

data.head()

Unnamed: 0,user_id,product_id,uxp_total_bought,uxp_reorder_ratio,times_lastN,u_total_orders,u_reordered_ratio,p_total_purchases,p_reorder_ratio
0,1,196,7,1.0,5.0,7,0.657895,1271,0.81904
1,1,10258,6,1.0,5.0,7,0.657895,108,0.777778
2,1,10326,1,0.333333,1.0,7,0.657895,167,0.586826
3,1,12427,7,1.0,5.0,7,0.657895,153,0.69281
4,1,13032,2,0.333333,1.0,7,0.657895,84,0.404762


In [None]:
data.shape

(363719, 9)

In [None]:
train_positive = data.merge(ordr, how='right', on=['user_id', 'product_id'])

In [None]:
train_positive.shape

(1202304, 14)

In [None]:
nan_index = train_positive[pd.isna(train_positive['uxp_total_bought'])].index

In [None]:
nan_df = train_positive.loc[nan_index, :]

Заполняем NaN'ы

In [None]:
# Rows in nan_df are (user, product) pairs that have no match in `data`, so
# the right merge left their feature columns empty. The per-user and
# per-product features do not depend on the pair, so copy them from any `data`
# row of the same user / product.
user_cols = ['u_total_orders', 'u_reordered_ratio']
product_cols = ['p_total_purchases', 'p_reorder_ratio']

# First row per key, matching the `.values[0]` pick of the former loops.
# A keyed lookup replaces one full scan of `data` per user and per product.
user_lookup = data.drop_duplicates('user_id').set_index('user_id')[user_cols]
product_lookup = data.drop_duplicates('product_id').set_index('product_id')[product_cols]

# nan_df was taken with .loc[nan_index], so its rows line up with nan_index.
for col in user_cols:
  train_positive.loc[nan_index, col] = nan_df['user_id'].map(user_lookup[col]).values
for col in product_cols:
  train_positive.loc[nan_index, col] = nan_df['product_id'].map(product_lookup[col]).values

# The loops raised IndexError for a user or product absent from `data`;
# keep that failure explicit instead of leaving NaNs behind.
assert not train_positive.loc[nan_index, user_cols + product_cols].isna().any().any(), \
    'some users or products of the unmatched rows are missing from `data`'

100%|██████████| 6250/6250 [00:18<00:00, 336.99it/s]
100%|██████████| 11838/11838 [00:38<00:00, 310.53it/s]


Cold start product problem

In [None]:
nan_df = train_positive.loc[nan_index, :]

In [None]:
# Cold start: for a (user, product) pair without history, average the pair
# features of up to 10 other users who bought the product and are closest to
# this user in (u_total_orders, u_reordered_ratio).
neighbour_cols = ['u_total_orders', 'u_reordered_ratio']
candidate_cols = ['user_id', 'uxp_total_bought', 'uxp_reorder_ratio',
                  'times_lastN', 'u_total_orders', 'u_reordered_ratio']

indexes = []
for user_id, group_u in tqdm(nan_df.groupby('user_id'), position=0):
  for product_id, group_p in group_u.groupby('product_id'):

    # Other users' rows for the same product.
    other_buyers = (data['user_id'] != user_id) & (data['product_id'] == product_id)
    X = data.loc[other_buyers, candidate_cols]

    n_neighbors = min(10, X.shape[0])

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree')
    nbrs.fit(X[neighbour_cols].values)
    query = group_p[neighbour_cols].values[0].reshape(1, -1)
    distances, indices = nbrs.kneighbors(query)

    # Positions 1..3 of candidate_cols are the three pair-level features.
    neighbour_means = X.iloc[indices[0], [1, 2, 3]].mean()
    uxp_total_bought, uxp_reorder_ratio, times_lastN = neighbour_means

    # Every row of this pair receives the same imputed values.
    indexes.extend([ind, uxp_total_bought, uxp_reorder_ratio, times_lastN] for ind in group_p.index)

100%|██████████| 6250/6250 [11:32<00:00,  9.02it/s]


In [None]:
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/indexes.pkl', 'wb') as f:
  pkl.dump(indexes, f)

In [None]:
nan_ind = np.reshape(indexes, (-1, 4))

In [None]:
train_positive.loc[nan_ind[:,0], ['uxp_total_bought', 'uxp_reorder_ratio', 'times_lastN']] = nan_ind[:,1:]

In [None]:
# The raw tables and the merged training rows are no longer needed; drop the
# names and run a collection pass.
del orders, order_products_train, order_products_prior, op
gc.collect()

1870

In [None]:
# Assign the filled column back: fillna(inplace=True) on a selected column
# acts on an intermediate object and is a no-op under pandas Copy-on-Write.
train_positive['days_since_prior_order'] = train_positive['days_since_prior_order'].fillna(0)

In [None]:
train_positive.to_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_positive.csv', index=False)

In [None]:
train_positive = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_positive.csv')

## Создаем датасет с отрицательными классами

In [None]:
# Negative sampling: for every (user, product) pair draw, without replacement,
# as many of the user's orders WITHOUT the product as there are rows WITH it
# (or all of them when there are fewer).
# A dedicated, seeded generator makes the sample reproducible across runs.
NEGATIVE_SAMPLING_SEED = 42
rng = np.random.RandomState(NEGATIVE_SAMPLING_SEED)

prod_to_ind = {}

for user_id, group_u in tqdm(train_positive.groupby('user_id'), position=0):
  prod_to_ind[user_id] = {}
  # All order ids of the user; computed once instead of once per product.
  user_order_ids = set(group_u['order_id'])
  for product_id, group_p in group_u.groupby('product_id'):
    # `orders_with_product` is used rather than reusing the name `orders`,
    # which is the raw orders table elsewhere in this notebook.
    orders_with_product = set(group_p['order_id'])

    # Sorted so the candidate order, and therefore the draw, is deterministic.
    target_orders = sorted(user_order_ids - orders_with_product)

    size = min(len(group_p), len(target_orders))

    prod_to_ind[user_id][product_id] = rng.choice(target_orders, size, replace=False)

with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/prod_to_ind.pkl', 'wb') as f:
  pkl.dump(prod_to_ind, f)

100%|██████████| 6605/6605 [11:05<00:00,  9.93it/s]


In [None]:
with open('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/prod_to_ind.pkl', 'rb') as f:
  prod_to_ind = pkl.load(f)

In [None]:
# Flatten the nested user -> product -> sampled order ids mapping into
# [user_id, product_id, order_id] rows.
ind_ord = [
  [user_id, product_id, el]
  for user_id in tqdm(prod_to_ind.keys(), position=0)
  for product_id, sampled_orders in prod_to_ind[user_id].items()
  for el in sampled_orders
]

100%|██████████| 6605/6605 [00:02<00:00, 2917.94it/s]


In [None]:
tr_neg = pd.DataFrame(ind_ord, columns=['user_id', 'product_id', 'order_id']).merge(data, how='inner', on=['user_id', 'product_id'])

In [None]:
train_negative = tr_neg.merge(order_features[['order_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']], how='inner', on='order_id')

In [None]:
# Assign the filled columns back: fillna(inplace=True) on a selected column
# acts on an intermediate object and is a no-op under pandas Copy-on-Write.
train_positive['days_since_prior_order'] = train_positive['days_since_prior_order'].fillna(0)
train_negative['days_since_prior_order'] = train_negative['days_since_prior_order'].fillna(0)

In [None]:
train_positive['label'] = 1
train_negative['label'] = 0

In [None]:
# Store the two count features as integers in both datasets.
# astype(int) truncates towards zero exactly like int(x), but casts the whole
# column at once instead of calling a Python lambda for every row.
for frame in (train_positive, train_negative):
  for col in ('uxp_total_bought', 'times_lastN'):
    frame[col] = frame[col].astype(int)

In [None]:
train_positive.to_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_positive.csv', index=False)
train_negative.to_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_negative.csv', index=False)

In [None]:
len(set(train_positive.user_id))

6605

In [None]:
len(set(order_features.user_id))

In [None]:
len(set(train_positive.order_id))

In [None]:
len(set(order_features.order_id))

## Дополняем order_features последовательностями продуктов за все предыдущие заказы

In [None]:
def _parse_id_list(text):
  """Turn the CSV text of a sequence, e.g. '[196, 14084]', into a list of ints.

  ndarray.tolist() yields plain Python ints. list(array) would keep NumPy
  scalars, which NumPy >= 2 prints as `np.int64(196)`; these columns are
  written back with to_csv later, so the lists must print as plain numbers.
  """
  return np.fromstring(text.strip('[ ]'), dtype=int, sep=', ').tolist()


# Reload the order table and restore the two sequence columns, which the CSV
# stores as text.
order_features = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/order_features.csv')
for seq_col in ('prod_seq', 'prev_seq'):
  order_features[seq_col] = order_features[seq_col].apply(_parse_id_list)

In [None]:
# For every order, concatenate the product sequences of all of the same user's
# orders with a lower order_number.
# NOTE(review): when no earlier order exists the selection is empty and the sum
# is presumably 0 rather than an empty list -- confirm how downstream code
# reads that value.
prev_orders_seq = {}
for user_id, group_u in tqdm(order_features.groupby('user_id'), position=0):
  for ind in group_u.index:
    current_id = group_u.loc[ind, 'order_id']
    current_number = group_u.loc[ind, 'order_number']
    earlier = group_u['order_number'] < current_number
    prev_orders_seq[current_id] = group_u.loc[earlier, 'prod_seq'].sum()

100%|██████████| 6605/6605 [02:01<00:00, 54.21it/s]


In [None]:
order_features = order_features.merge(pd.DataFrame(prev_orders_seq.items(), columns=['order_id', 'prev_orders_seq']), how='inner', on='order_id')

In [None]:
order_features.to_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/order_features.csv', index=False)

In [None]:
train_positive = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_positive.csv')
train_negative = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Diplom/instacart-market-basket-analysis/test_train/train_negative.csv')

In [None]:
train = pd.concat((train_negative, train_positive), axis=0)