In [None]:
import implicit 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import glob
import time
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import os
import statistics
from scipy.sparse import csr_matrix

In [None]:
#os.environ["OPENBLAS_NUM_THREADS"] = "1"
#os.environ["GOTO_NUM_THREADS"] = "1"
#os.environ["OMP_NUM_THREADS"] = "1"

In [None]:
# Read the raw interaction table. Later cells use its product_id,
# account_id_enc, count and cms_vertical columns.
data = pd.read_csv(
    '/media/root/data/swiggy/flipkart/data2/ppv2.csv'
)
print("Data read.")
# Disabled experiment kept from the original notebook.
# NOTE(review): if re-enabled as written, a boolean-mask assignment on the
# whole frame presumably overwrites every column of the matching rows with
# 500, not just 'count' -- confirm before using (e.g. clip the column instead).
#data[data['count'] > 500] = 500

In [None]:
# Map raw product / account identifiers to dense 0-based integer indices (and
# back) so they can be used as sparse-matrix coordinates.
import scipy

products = data.product_id.unique()
accounts = data.account_id_enc.unique()

print(len(products))
print(len(accounts))

# Indices follow the order in which unique() returned the identifiers.
prod_to_int = {prod_id: idx for idx, prod_id in enumerate(products)}
int_to_prod = {idx: prod_id for prod_id, idx in prod_to_int.items()}

acc_to_int = {acc_id: idx for idx, acc_id in enumerate(accounts)}
int_to_acc = {idx: acc_id for acc_id, idx in acc_to_int.items()}

# The original counting loops left `count` equal to the number of accounts;
# later cells increment it, so it is kept defined with the same value.
count = len(accounts)

In [None]:
# Summary statistics of the 'count' column of the raw interaction table,
# shown as the cell's rich output.
data['count'].describe()

In [None]:
# Attach the dense integer indices to every interaction row, then pull out the
# coordinate / value arrays that feed the sparse matrix.
start_time = time.time()
print("Preprocessing started")

# Lookup tables (raw id -> index) as frames so they can be joined onto `data`.
df_acc_to_int = pd.DataFrame(acc_to_int.items(), columns=['account_id', 'account_index'])
df_prod_to_int = pd.DataFrame(prod_to_int.items(), columns=['product_id', 'product_index'])

data_userIdx = data.merge(
    df_acc_to_int,
    left_on=['account_id_enc'],
    right_on=['account_id'],
)
data_userIdx_productIdx = data_userIdx.merge(
    df_prod_to_int,
    left_on=['product_id'],
    right_on=['product_id'],
)

# rows -> account index, cols -> product index, values -> interaction count.
rows, cols, values = (
    np.array(data_userIdx_productIdx[column_name])
    for column_name in ('account_index', 'product_index', 'count')
)

print("Preprocessing done.")
print("Time taken ->", time.time() - start_time)

In [None]:
# Create the CSR interaction matrix. The coordinate pair is passed as
# (cols, rows), so product_index is the ROW coordinate and account_index the
# COLUMN coordinate: the matrix is products x accounts, matching `shape`.
# NOTE(review): whether model.fit() wants item-user or user-item orientation
# depends on the installed `implicit` version (the convention presumably
# changed in 0.5) -- confirm, otherwise item/user factors end up swapped.
sparseMatrix = csr_matrix((values, (cols, rows)), shape = ( len(products),len(accounts)))

In [None]:
# Sparsity check: percentage of (product, account) cells that hold no
# interaction. Original author's note: "ALS wont work well enough."
# (presumably referring to how sparse the matrix is -- TODO confirm intent).
matrix_size = sparseMatrix.shape[0] * sparseMatrix.shape[1]  # Number of possible interactions in the matrix
# count_nonzero() yields the same number as len(nonzero()[0]) without first
# materialising two index arrays with one entry per stored interaction.
num_purchases = int(sparseMatrix.count_nonzero())  # Number of items interacted with
sparsity = 100 * (1 - (num_purchases / matrix_size))
print(sparsity)


In [None]:
# Factorise the interaction matrix with implicit's ALS, 50 latent factors.
# NOTE(review): no random_state is passed, so the learned factors (and every
# downstream number) presumably differ between runs -- confirm and seed if
# reproducibility matters.
model = implicit.als.AlternatingLeastSquares(factors=50)

# sparseMatrix is products x accounts as built in the earlier cell.
model.fit(sparseMatrix)

In [None]:
# Pull both learned factor matrices off the fitted model and report their
# shapes as a quick sanity check.
item_vecs, user_vecs = model.item_factors, model.user_factors

print(
    'Shape of item vector matrix : ',
    item_vecs.shape,
)
print(
    'Shape of User vector matrix : ',
    user_vecs.shape,
)

In [None]:
# Persist both factor matrices to the current working directory; the
# clustering cell reloads 'user_feature.npy' as `x`.
np.save('user_feature.npy',user_vecs)
np.save('item_feature.npy',item_vecs)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
import seaborn as sns

In [None]:
def find_k(df, increment=0, decrement=0, k_min=1, k_max=19):
    """Estimate a good number of KMeans clusters with the elbow method.

    KMeans (random_state=1) is fitted once for every k in ``k_min..k_max``
    inclusive, the inertia of each fit is recorded, and KneeLocator is asked
    for the knee of that convex, decreasing curve.

    Parameters
    ----------
    df : array-like
        Samples to cluster; passed to ``KMeans.fit`` unchanged (no scaling or
        normalisation is applied here).
    increment, decrement : int, default 0
        Manual adjustment added to / subtracted from the detected knee.
    k_min, k_max : int, default 1 and 19
        Inclusive range of cluster counts to try. The defaults reproduce the
        previously hard-coded ``range(1, 20)``.

    Returns
    -------
    int
        The detected knee plus ``increment`` minus ``decrement``.

    Raises
    ------
    ValueError
        If KneeLocator finds no knee in the inertia curve (previously this
        surfaced as an opaque TypeError from ``None + increment``).
    """
    sse = {}

    for k in range(k_min, k_max + 1):
        print("Cluster no: ",k)
        kmeans = KMeans(n_clusters=k, random_state=1)
        kmeans.fit(df)
        sse[k] = kmeans.inertia_

    kn = KneeLocator(x=list(sse.keys()),
                     y=list(sse.values()),
                     curve='convex',
                     direction='decreasing')
    if kn.knee is None:
        raise ValueError(
            "KneeLocator found no knee in the inertia curve for k in "
            "[{}, {}]".format(k_min, k_max)
        )
    return kn.knee + increment - decrement

# Run the elbow search on the ALS user factors.
# NOTE(review): the clustering cell below hard-codes n_clusters=10 and does
# not read `k`, so this value is informational only.
k = find_k(user_vecs)
print("Ideal no of clusters is: ",k)

In [None]:
# Reload the saved user factors and cluster them into 10 groups; labels and
# centroids are written next to the factor files.
x = np.load("user_feature.npy")

start_time = time.time()
print("Clustering started")

kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(x)

np.save('labels.npy', kmeans.labels_)
np.save('centers.npy', kmeans.cluster_centers_)

print("Clustering done.")
print("Time taken ->", time.time() - start_time)

In [None]:
# Cluster assignment per user and the centroid of every cluster.
labels = kmeans.labels_
centers = kmeans.cluster_centers_
print(labels.shape)

# Cluster sizes as a two-column table: [cluster id, number of users].
# (The original printed labels.shape a second time here; the duplicate was
# dropped.)
unique, counts = np.unique(labels, return_counts=True)
frequencies = np.column_stack((unique, counts))
print(frequencies)

In [None]:
# Load the reseller order table and print its basic cardinalities: distinct
# accounts, distinct products, total rows.
shopsy_dataset = pd.read_csv(
    '/media/root/data/swiggy/flipkart/data2/reseller_profile.csv'
)
print(shopsy_dataset['s.account_id_enc'].nunique())
print(shopsy_dataset['s.product_id'].nunique())
print(len(shopsy_dataset))

# One row per (account, product) pair; 's.order_item_id_enc' becomes the
# number of non-null order-item ids recorded for that pair.
shopsy_user_product = (
    shopsy_dataset
    .groupby(['s.account_id_enc', 's.product_id'], as_index=False)
    .agg({'s.order_item_id_enc': 'count'})
)
print("user product combination in shopsy", len(shopsy_user_product))

In [None]:
# Translate the reseller (account, product) pairs to the integer indices used
# by the model; pairs whose account or product has no index are dropped by
# the inner joins.
shopsy_user_product_encode_temp = shopsy_user_product.merge(
    df_acc_to_int,
    left_on=['s.account_id_enc'],
    right_on=['account_id'],
)
shopsy_user_product_encode = shopsy_user_product_encode_temp.merge(
    df_prod_to_int,
    left_on=['s.product_id'],
    right_on=['product_id'],
)

# Keep only interaction rows whose (account, product) pair also occurs in the
# reseller order table.
shopsy_click_users = data_userIdx_productIdx.merge(
    shopsy_user_product_encode,
    how='inner',
    left_on=['account_index', 'product_index'],
    right_on=['account_index', 'product_index'],
)

In [None]:
from random import sample

def intersection(lst1, lst2):
    """Return how many distinct values occur in both `lst1` and `lst2`."""
    shared = set(lst1).intersection(lst2)
    return len(shared)

# Precision@20 on raw product indices: for a random sample of users who
# appear in shopsy_click_users, measure which fraction of the model's top-20
# products the user has a row for in that table.
top_k = 20
mean_items_bought = 0
user_idx = shopsy_click_users['account_index'].unique()
# sample() raises ValueError when asked for more users than exist, so the
# request is capped at the population size.
user_idx = sample(list(user_idx), min(500, len(user_idx)))
print("Sampling done")
start_time = time.time()
for user in user_idx:
    # Score of every product for this user: item factors . user factors.
    item_interest = np.dot(item_vecs, x[user].reshape(-1, 1)).reshape(-1)
    pred_items = (-item_interest).argsort()[:top_k]
    actual_items = shopsy_click_users[shopsy_click_users['account_index'] == user]
    items_bought = actual_items['product_index'].unique()
    # Counted inline so the result does not depend on which of the two
    # `intersection` definitions in this notebook happens to be live.
    frac_actually_bought = len(set(items_bought) & set(pred_items)) / top_k
    mean_items_bought += frac_actually_bought
# Guard against an empty sample instead of dividing by zero.
mean_items_bought = mean_items_bought / len(user_idx) if user_idx else 0.0
print("Mean item fraction brought: ",mean_items_bought)
print("Time taken ->", time.time() - start_time)

In [None]:
click_productid_category = data.groupby(['product_id','cms_vertical'],as_index=False).agg({'count':'sum'})
click_productid_category

In [None]:
click_productid_category_productidx = pd.merge(click_productid_category, df_prod_to_int, left_on=['product_id'], right_on = ['product_id'])
click_productid_category_productidx

In [None]:
fin_dataset = click_productid_category_productidx.sort_values(['product_index'])
fin_dataset

In [None]:
print(fin_dataset.shape)

In [None]:
from random import sample

def mapper(item_index_list):
    """Look up the second column of `fin_dataset` for each given row position.

    `fin_dataset` is the product/category table sorted by 'product_index';
    its column 1 is 'cms_vertical'. The lookup is positional (`iloc`).

    NOTE(review): a position equals a product index only if every product
    has exactly one 'cms_vertical' row in `fin_dataset` -- confirm.
    """
    return [fin_dataset.iloc[position, 1] for position in item_index_list]
    
def intersection(lst1, lst2):
    """Return the fraction of entries of `lst1` that also occur in `lst2`.

    Duplicates in `lst1` are counted individually; `lst2` is only used for
    membership tests. Neither argument is modified (the original sorted
    `lst1` in place, which had no effect on the result).

    NOTE: an earlier cell defines a function with the same name that returns
    a count rather than a fraction; this definition replaces it once run.

    Returns 0.0 when `lst1` is empty instead of raising ZeroDivisionError.
    """
    total = len(lst1)
    if total == 0:
        return 0.0
    lookup = set(lst2)
    common = sum(1 for item in lst1 if item in lookup)
    return common / total

                          
# Category-level evaluation: for a random sample of users who appear in
# shopsy_click_users, measure which fraction of the categories of their
# products is also among the categories of the model's top-20 products.
mean_items_bought = 0
user_idx = shopsy_click_users['account_index'].unique()
# sample() raises ValueError when asked for more users than exist, so the
# request is capped at the population size.
user_idx = sample(list(user_idx), min(500, len(user_idx)))
print("Sampling done")
start_time = time.time()
for user in user_idx:
    # Score of every product for this user: item factors . user factors.
    item_interest = np.dot(item_vecs, x[user].reshape(-1, 1)).reshape(-1)
    pred_items = (-item_interest).argsort()[:20]
    actual_items = shopsy_click_users[shopsy_click_users['account_index'] == user]
    items_bought = actual_items['product_index'].unique()
    frac_actually_bought = intersection(mapper(items_bought), mapper(pred_items))
    mean_items_bought += frac_actually_bought
# Guard against an empty sample instead of dividing by zero.
mean_items_bought = mean_items_bought / len(user_idx) if user_idx else 0.0
print("Mean item fraction brought: ",mean_items_bought)
print("Time taken ->", time.time() - start_time)

In [None]:
# Category-level evaluation with the top-5 personal recommendations for a
# random sample of users who appear in shopsy_click_users.
labels = kmeans.labels_
centers = kmeans.cluster_centers_
mean_items_bought = 0
user_idx = shopsy_click_users['account_index'].unique()
# Cap the request: sample() raises ValueError beyond the population size.
user_idx = sample(list(user_idx), min(250, len(user_idx)))
print("Sampling done")
start_time = time.time()
for user in user_idx:
    item_interest = np.dot(item_vecs, x[user].reshape(-1, 1)).reshape(-1)
    pred_items = (-item_interest).argsort()[:5]
    # FIX: look up THIS user's products. The original never assigned
    # `items_bought` in this cell, so every user was scored against whatever
    # value an earlier cell had left behind.
    actual_items = shopsy_click_users[shopsy_click_users['account_index'] == user]
    items_bought = actual_items['product_index'].unique()
    # NOTE(review): the original also ranked products for the user's cluster
    # centroid (centers[labels[user]]) into `pred_center` but never used it.
    # If centroid recommendations were meant to be scored, compare
    # mapper(pred_center) here as well -- intent to be confirmed.
    frac_actually_bought = intersection(mapper(items_bought), mapper(pred_items))
    mean_items_bought += frac_actually_bought
# Guard against an empty sample instead of dividing by zero.
mean_items_bought = mean_items_bought / len(user_idx) if user_idx else 0.0
print("Mean item fraction brought: ",mean_items_bought)
print("Time taken ->", time.time() - start_time)
    

In [None]:
# Per-cluster category-level evaluation of the top-20 personal
# recommendations.
# Only users with at least one row in shopsy_click_users can be scored, as
# the metric is a fraction of the user's own products.
purchasing_users = shopsy_click_users['account_index'].unique()

for c in range(0, 10):
    # Clusters 3, 4, 6 and 8 were skipped by the original author
    # (presumably too small to sample from -- TODO confirm).
    if c in (3, 4, 6, 8):
        continue
    cluster_index = np.where(labels == c)
    mean_items_bought = 0
    user_idx = np.intersect1d(cluster_index[0], purchasing_users)
    if len(user_idx) == 0:
        print("No users with purchases in cluster", c)
        continue
    # Cap the request: sample() raises ValueError beyond the population size.
    user_idx = sample(list(user_idx), min(250, len(user_idx)))
    print("Sampling done")
    start_time = time.time()
    # NOTE(review): the original ranked products for the cluster centroid
    # (centers[c]) into `pred_center` here but never used the result; the
    # dead computation was removed. Re-add it if centroid recommendations
    # were meant to be scored.
    for user in user_idx:
        item_interest = np.dot(item_vecs, x[user].reshape(-1, 1)).reshape(-1)
        pred_items = (-item_interest).argsort()[:20]
        # FIX: look up THIS user's products; the original reused a stale
        # `items_bought` left over from an earlier cell for every user.
        actual_items = shopsy_click_users[shopsy_click_users['account_index'] == user]
        items_bought = actual_items['product_index'].unique()
        frac_actually_bought = intersection(mapper(items_bought), mapper(pred_items))
        mean_items_bought += frac_actually_bought
    mean_items_bought = mean_items_bought / len(user_idx)
    print("Mean item fraction brought: ",mean_items_bought)
    print("Time taken ->", time.time() - start_time)

In [None]:
def jaccardian(lst1, lst2):
    """Return the multiset Jaccard similarity of two sequences.

    Both inputs are sorted (on copies, so the caller's sequences are left
    untouched) and walked in lock-step; each equal pair counts once, so
    duplicates are matched one-to-one. The result is
    ``common / (len(lst1) + len(lst2) - common)``.

    Elements must be mutually orderable. Returns 0.0 when both inputs are
    empty (the original raised ZeroDivisionError in that case).
    """
    left = sorted(lst1)
    right = sorted(lst2)
    i = 0
    j = 0
    common = 0
    while i < len(left) and j < len(right):
        if left[i] > right[j]:
            j += 1
        elif left[i] < right[j]:
            i += 1
        else:
            # Equal elements: consume one from each side and count the match.
            i += 1
            j += 1
            common += 1
    union_size = len(left) + len(right) - common
    if union_size == 0:
        return 0.0
    return common / union_size
            

# For every evaluated cluster, pair up two independent random samples of 250
# members and average the Jaccard similarity between the category lists of
# their top-20 recommendations.
for c in range(0, 10):
    if c in (3, 4, 6, 8):
        continue
    cluster_index = np.where(labels == c)
    mean_items_bought = 0
    user_idx = cluster_index[0]
    user_idx_1 = sample(list(user_idx), 250)
    user_idx_2 = sample(list(user_idx), 250)
    start_time = time.time()
    arr1 = []
    print("Sampling done")
    for first_user, second_user in zip(user_idx_1, user_idx_2):
        # Top-20 product positions for each user of the pair.
        top_items = [
            (-np.dot(item_vecs, x[member].reshape(-1, 1)).reshape(-1)).argsort()[:20]
            for member in (first_user, second_user)
        ]
        arr1.append(jaccardian(mapper(top_items[0]), mapper(top_items[1])))
    # The printed label is kept verbatim; the value is a mean Jaccard score.
    mean_items_bought = np.mean(np.array(arr1))
    print("Mean item fraction brought: ",mean_items_bought)
    print("Time taken ->", time.time() - start_time)
        

In [None]:
# Compare the table size before and after keeping only rows whose 'count'
# is strictly greater than 5.
print(data.shape)
data_fin = data.loc[data['count'] > 5]
print(data_fin.shape)