In [1]:
import os
import gc
import torch
import pickle
import joblib
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm

warnings.filterwarnings('ignore')
os.environ["CUDA_VISIBLE_DEVICES"] = "2" 

In [2]:
data = pickle.load(open("./data/raw_data.pkl", "rb"))
data

Unnamed: 0,order_id,created_at,customer_id,product_id,city_id,pincode,week
0,61992975,2022-06-01,7561500,44140,5,560012.0,0
1,61993732,2022-06-01,7457908,44140,1,400022.0,0
2,61994661,2022-06-01,34786465,11551,1,400104.0,0
3,61998270,2022-06-01,1914309,9031,82,208017.0,0
4,61998673,2022-06-01,671615,44140,1,400042.0,0
...,...,...,...,...,...,...,...
41100376,280800873609601024,2023-02-04,10124061,203981,9,400706.0,17
41100377,280798430469464064,2023-02-04,30003657,3562292,234,313001.0,17
41100378,280748217963716608,2023-02-04,34076875,34725,3777,783301.0,17
41100379,280692405370482688,2023-02-04,35563174,30252,2854,700058.0,17


In [3]:
# retain both med and otc products present in catalog
catalog = pd.read_csv("/home/qblocks/production/piper/data/ranknet/training_data/pe_catalog.csv")
data = data[data['product_id'].isin(catalog['product_id'])].copy()
data

Unnamed: 0,order_id,created_at,customer_id,product_id,city_id,pincode,week
0,61992975,2022-06-01,7561500,44140,5,560012.0,0
1,61993732,2022-06-01,7457908,44140,1,400022.0,0
2,61994661,2022-06-01,34786465,11551,1,400104.0,0
3,61998270,2022-06-01,1914309,9031,82,208017.0,0
4,61998673,2022-06-01,671615,44140,1,400042.0,0
...,...,...,...,...,...,...,...
41100376,280800873609601024,2023-02-04,10124061,203981,9,400706.0,17
41100377,280798430469464064,2023-02-04,30003657,3562292,234,313001.0,17
41100378,280748217963716608,2023-02-04,34076875,34725,3777,783301.0,17
41100379,280692405370482688,2023-02-04,35563174,30252,2854,700058.0,17


In [4]:
def load_train():
    train_data = pickle.load(open("../data/train.pkl", "rb"))

    # remove invalid users
    sample = train_data.groupby("customer_id")["pincode"].nunique().reset_index()
    invalid_users = set(sample[sample["pincode"] > 3]["customer_id"])

    return train_data[~train_data["customer_id"].isin(invalid_users)].copy()

train_data = load_train()
train_data = train_data[["customer_id", "product_id"]].drop_duplicates().reset_index(drop=True)

train_data

Unnamed: 0,customer_id,product_id
0,10862428,3612105
1,972882,172151
2,33124483,236783
3,1682415,28424
4,8786746,6860
...,...,...
144712,6376620,3645982
144713,4838366,2930817
144714,10593246,55423
144715,10360879,172213


In [5]:
from torch.nn.functional import normalize

corpus = pickle.load(open('/home/qblocks/instance1/piper/utils/corpus.pkl', 'rb'))

# ucode2p = dict(catalog[catalog['product_id'].isin(set(data['product_id'].unique()))][['ucode', 'product_id']].values)
ucode2p = dict(catalog[['ucode', 'product_id']].values)

embs = [emb for ucode, emb in zip(corpus[2], corpus[1]) if ucode in ucode2p]
ucode = [ucode2p[ucode] for ucode in corpus[2] if ucode in ucode2p]
sales = [sales for ucode, sales in zip(corpus[2], corpus[3]) if ucode in ucode2p]

corpus = [np.array(ucode), normalize(torch.tensor(embs)).to('cuda'), torch.tensor(sales).to('cuda')]
pid_emb = {corpus[0][idx]: corpus[1][idx] for idx in range(len(corpus[0]))}

In [6]:
otc_catalog = pd.read_csv("/home/qblocks/instance1/personalise/dira/src/otc/ranking/data/catalog.csv")
otc_catalog

Unnamed: 0,product_id,ucode,brand,category,sales,mrp
0,856,280271,DIGENE,ORAL ANTACID,94.200000,16.5
1,1758,254073,MIS D SUN,SUNSCREEN,79.431616,1100.0
2,783,225566,PROSURE,PROTEIN SUPPLEMENTS,89.317812,789.0
3,10124,239418,UVMED,SUNSCREEN,72.921811,349.0
4,4323,229320,PIORHOIDS,,11.483195,175.0
...,...,...,...,...,...,...
79104,3125415,W63645,SKG ANTI,FACE SHIELD,1.106900,330.0
79105,3677666,R11154,KIOSK,LAB AND DIAGNOSTICS,1.000000,3304.0
79106,3677667,N33494,KIOSK,LAB AND DIAGNOSTICS,6.109978,2500.0
79107,3677668,B16510,LAB COAT,COAT,54.852945,260.0


In [7]:
otc_pid = set(otc_catalog["product_id"])

In [8]:
relevant_customers = set(train_data["customer_id"])

In [9]:
i2i = data[['order_id', 'product_id']].copy()
pidcount = dict(i2i.groupby(['product_id']).size().reset_index(name='count').values)

i2i = i2i.merge(i2i, on=['order_id'])
i2i = i2i[i2i['product_id_x'] != i2i['product_id_y']].copy()
i2i = i2i[(i2i["product_id_y"].isin(otc_pid)) & (i2i["product_id_x"].isin(otc_pid))].copy()

i2i = i2i.groupby(['product_id_x', 'product_id_y']).size().reset_index(name='intersection')
i2i['union'] = i2i['product_id_x'].map(pidcount) +  i2i['product_id_y'].map(pidcount) - i2i['intersection']

i2i = i2i[i2i['intersection'] > 1].copy()

i2i['cf_score'] = i2i['intersection'] / i2i['union']
i2i = i2i.sort_values(['product_id_x', 'cf_score'], ascending=[True, False]).groupby(['product_id_x'], sort=False).head(50)

i2i

Unnamed: 0,product_id_x,product_id_y,intersection,union,cf_score
0,9,11,68,1919,0.035435
321,9,2984990,11,1009,0.010902
98,9,53997,10,918,0.010893
245,9,236688,16,1549,0.010329
311,9,2978115,8,839,0.009535
...,...,...,...,...,...
2733140,3899871,856,3,24990,0.000120
2733170,3899872,3899870,2,18,0.111111
2733242,3900337,3828219,2,28,0.071429
2733237,3900337,3030660,2,604,0.003311


In [10]:
i2i = data[data["customer_id"].isin(relevant_customers)][['customer_id', 'product_id']].drop_duplicates().merge(i2i.rename(columns={"product_id_x": 'product_id'}), on=['product_id'])
i2i = i2i[['customer_id', 'product_id', 'product_id_y', 'intersection', 'union']].rename(columns={"product_id": 'product_id_x'}).drop_duplicates().copy()

i2i

Unnamed: 0,customer_id,product_id_x,product_id_y,intersection,union
0,10862428,3612105,487436,3,173
1,10862428,3612105,529694,3,177
2,10862428,3612105,529692,2,188
3,10862428,3612105,530139,2,360
4,10862428,3612105,236978,3,621
...,...,...,...,...,...
6370577,5386528,3571403,499566,2,5006
6370578,5386528,3571403,171057,2,5708
6370579,5386528,3571403,237257,2,12674
6370580,5386528,3571403,214380,2,13755


In [11]:
i2i["case_cf_similarity"] = [(pid_emb[x] * pid_emb[y]).sum(-1).item() for x, y in tqdm(i2i[["product_id_x", "product_id_y"]].values)]
i2i

100%|██████████| 6370582/6370582 [06:33<00:00, 16207.97it/s]


Unnamed: 0,customer_id,product_id_x,product_id_y,intersection,union,case_cf_similarity
0,10862428,3612105,487436,3,173,0.333898
1,10862428,3612105,529694,3,177,0.512404
2,10862428,3612105,529692,2,188,0.388406
3,10862428,3612105,530139,2,360,0.180850
4,10862428,3612105,236978,3,621,0.188783
...,...,...,...,...,...,...
6370577,5386528,3571403,499566,2,5006,0.110568
6370578,5386528,3571403,171057,2,5708,0.095497
6370579,5386528,3571403,237257,2,12674,0.080030
6370580,5386528,3571403,214380,2,13755,0.131563


In [12]:
i2i = i2i.groupby(["customer_id", "product_id_y"]).agg({"intersection": "sum", "union": "sum", "case_cf_similarity": "sum"}).reset_index()
i2i["cf_score"] = i2i["intersection"] / i2i["union"]

del i2i["intersection"]
del i2i["union"]

In [13]:
i2i.rename(columns={"product_id_y": 'product_id'}, inplace=True)
i2i

Unnamed: 0,customer_id,product_id,case_cf_similarity,cf_score
0,1231,762,0.544274,0.005996
1,1231,846,0.270239,0.002297
2,1231,854,0.166818,0.001413
3,1231,856,0.220908,0.002295
4,1231,881,0.033504,0.002174
...,...,...,...,...
3994271,38256587,3663905,0.436906,0.002116
3994272,38256587,3677765,0.298430,0.006305
3994273,38256587,3790211,0.330432,0.011628
3994274,38256587,3793918,0.207006,0.001467


In [14]:
i2i.sort_values(["customer_id", "cf_score"], ascending=[True, False], inplace=True)

In [15]:
i2i

Unnamed: 0,customer_id,product_id,case_cf_similarity,cf_score
179,1231,192679,1.301649,0.027120
89,1231,32173,0.643638,0.026498
13,1231,7489,2.477002,0.023526
331,1231,3754509,1.015297,0.023152
12,1231,7483,1.279446,0.019939
...,...,...,...,...
3993976,38256587,2967,-0.049996,0.000049
3994157,38256587,501050,0.153121,0.000044
3994242,38256587,3514609,0.060080,0.000034
3994021,38256587,46324,0.060582,0.000022


In [16]:
pickle.dump(i2i, open("./recall_data/otc_i2i_cf.pkl", "wb"))