# Import Libraries

In [27]:
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ,HashingVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, euclidean_distances

# Data cleaning

In [28]:
def user_data_cleaner(x):
    """Normalise one raw user-history cell.

    String values have every "@@@" separator replaced by a single space and
    are then lower-cased. Any non-string value is mapped to an empty string.
    """
    if not isinstance(x, str):
        return ''
    spaced = x.replace("@@@", " ")
    return spaced.lower()

In [29]:
def product_data_cleaner(x):
    """Normalise one raw product text cell.

    For string input the text is lower-cased, the stand-alone token "nan" is
    removed, every ';' becomes ',' and the literal '&amp' is stripped.
    Non-string input yields an empty string.

    Only the whole word "nan" is removed: a plain substring replace would
    also corrupt ordinary words that merely contain those letters
    (e.g. "banana" -> "baa").
    """
    if not isinstance(x, str):
        return ''
    # \b anchors keep words such as "banana" intact.
    cleaned = re.sub(r'\bnan\b', '', x.lower())
    # Order matters: ';' is converted before '&amp' is stripped, so an
    # "&amp;" entity ends up as a single ','.
    return cleaned.replace(';', ',').replace('&amp', '')

def product_data_cleaner2(x):
    """Lower-case text and remove every space character from it.

    A list is processed element by element and returned as a new list, a
    string is processed directly, and any other value becomes ''.
    """
    def _squash(text):
        # Lower-case first, then drop spaces.
        return text.lower().replace(" ", "")

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ''

In [30]:
def couple(x):
    """Build the combined text for one product row.

    The 'categories' and 'short_description' entries are each space-joined
    and separated by a space; the 'product_description' entry is appended
    after a comma.
    """
    category_text = ' '.join(x['categories'])
    short_text = ' '.join(x['short_description'])
    description_text = ''.join(x['product_description'])
    return category_text + ' ' + short_text + ',' + description_text

# Loading Data

In [31]:
def userdf(sitename):
    """Load ``<sitename>/userdata.csv`` and split it by history type.

    The three history columns are cleaned with ``user_data_cleaner``. The
    function then returns a 3-tuple of DataFrames indexed by ``user_id``:
    (cart history, purchase history, view history). Each frame has the other
    two history columns removed.
    """
    history_columns = ['cart_history', 'user_purchase_history', 'user_viewed_history']

    users = pd.read_csv(sitename + '/userdata.csv')
    for column in history_columns:
        users[column] = users[column].apply(user_data_cleaner)

    def _only(keep):
        # Drop the two history columns that are not `keep`, index by user_id.
        others = [c for c in history_columns if c != keep]
        return users.drop(others, axis=1).set_index(['user_id'])

    cart_frame = _only('cart_history')
    purchase_frame = _only('user_purchase_history')
    view_frame = _only('user_viewed_history')
    return cart_frame, purchase_frame, view_frame

In [32]:
def product_df(sitename):
    """Load ``<sitename>/productdata.csv`` and derive the recommender features.

    Steps, in order:
      * rename an ``id`` column to ``product_id`` (no-op if absent) and drop
        rows with a repeated ``product_id``;
      * fill missing ``product_rating`` / ``selling_price`` with 0 and missing
        ``product_description`` with the string 'None';
      * sort by ``product_id`` and reset the index to 0..n-1;
      * clean the category and short-description text columns;
      * build the ``categories`` / ``short_description`` token lists, the
        ``soup`` text column and ``sale_percentage``.

    Returns the prepared DataFrame.
    """
    path = sitename + '/productdata.csv'
    df = pd.read_csv(path)

    # Product data cleaning
    df.rename({"id": "product_id"}, axis=1, inplace=True)
    df.drop_duplicates(subset='product_id', inplace=True)

    df['product_rating'] = df['product_rating'].fillna(0)
    df['product_description'] = df['product_description'].fillna('None')
    df['selling_price'] = df['selling_price'].fillna(0)
    df = df.sort_values(by='product_id').reset_index(drop=True)

    columns_to_clean = ['product_parent_cat', 'product_cat_name', 'short_description']
    for column in columns_to_clean:
        df[column] = df[column].apply(product_data_cleaner)
        df[column] = df[column].apply(product_data_cleaner2)

    # Combine the category and parent category into one comma-separated
    # string, skipping empty values. (product_id is already unique here, so
    # no second de-duplication pass is needed.)
    category_columns = ['product_cat_name', 'product_parent_cat']
    df['categories'] = df[category_columns].apply(lambda row: ','.join(filter(None, row)), axis=1)

    # Raw string: '\*' in a normal string literal is an invalid escape
    # sequence. The pattern splits on '&', ',', '*' or a newline.
    token_splitter = re.compile(r'&|,|\*|\n')

    def get_list(text):
        # Split into tokens and trim surrounding whitespace from each one.
        return [part.strip() for part in token_splitter.split(text)]

    df['categories'] = df['categories'].apply(get_list)
    df['short_description'] = df['short_description'].apply(get_list)

    # Collect the text used by the similarity step into one column.
    df['soup'] = df.apply(couple, axis=1)

    # Percentage discount; division by a zero/missing regular price gives
    # +/-inf or NaN, all of which are reported as 0.
    df['sale_percentage'] = 100 - (df['selling_price'] / df['regular_price']) * 100
    df['sale_percentage'] = df['sale_percentage'].replace({np.inf: 0, -np.inf: 0}).fillna(0)

    return df

# Cosine Similarity

In [33]:
def similarity(sitename,df):
    """Return the pairwise cosine-similarity matrix of ``df['soup']``.

    The text is vectorised with TF-IDF (English stop words removed) and every
    row is compared with every other row. ``sitename`` is not used by this
    function.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['soup'])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

# Product similarity

In [34]:
def product_similarity(p_id,sitename, top_n=10):
    """Return the ids of the products most similar to ``p_id``.

    Parameters
    ----------
    p_id : product id to look up in the site's product data.
    sitename : dataset directory passed on to ``product_df``.
    top_n : number of recommendations to return (default 10).

    Returns
    -------
    list of product ids, most similar first. The query product itself is
    never included.

    Raises
    ------
    KeyError if ``p_id`` is not present in the product data.
    """
    df = product_df(sitename)
    cosine_sim = similarity(sitename, df)

    # Map product_id -> row position (product_df resets the index to 0..n-1).
    indices = pd.Series(df.index, index=df['product_id'])
    idx = indices[p_id]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # with tied top scores another row can come first, in which case skipping
    # position 0 would drop a real neighbour and return the query product.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [pos for pos, _ in ranked[:top_n]]

    return df['product_id'].iloc[top_positions].values.tolist()

In [35]:
def reco_name(product_id, recommended_products,sitename):
    """Look up product names for a query product and its recommendations.

    Returns a tuple ``(recommended_names, query_names)``:
      * ``recommended_names`` - names of every row matching each id in
        ``recommended_products``, in the order the ids are given;
      * ``query_names`` - list of names of the rows matching ``product_id``.
    Ids that match no row contribute nothing.
    """
    df = product_df(sitename)

    def _names_for(pid):
        # Boolean-mask selection avoids feeding index labels into .iloc,
        # which is only correct while the index happens to be 0..n-1.
        return df.loc[df["product_id"] == pid, "product_name"].tolist()

    recommended_names = []
    for pid in recommended_products:
        recommended_names.extend(_names_for(pid))

    return recommended_names, _names_for(product_id)

In [53]:
# Example: fetch the products most similar to product 96166 in the site1
# dataset, print their ids, then display their names next to the query
# product's name.
product_id = 96166
user = 'dataset/site1'  # NOTE: despite its name, this holds the dataset directory

recommended_products = product_similarity(product_id, user)
print(recommended_products)
reco_name(product_id, recommended_products, user)

[99013, 105976, 106993, 103905, 109045, 111620, 98823, 113472, 117231, 111497]


(['Spicy Chipotle Mayonnaise Dip',
  'Tikka Mayonnaise Dip',
  'Thousand Island Eggless Dressing',
  'Minty Mayonnaise Dip',
  'Arrabbiata Pasta Sauce With Black Olives',
  'Whole Grain Mustard Sauce',
  'Alphonso Mango Conserve With No Added Sugar',
  'Momo Hot Sauce',
  'Szechwan Hot Sauce',
  'Arrabbiata Pasta Sauce With Bell Peppers'],
 ['Eggless Mayonnaise Dip'])

In [54]:
# Same example for a second product (106993) from the site1 dataset.
product_id = 106993
user = 'dataset/site1'  # NOTE: despite its name, this holds the dataset directory

recommended_products = product_similarity(product_id, user)
print(recommended_products)
reco_name(product_id, recommended_products, user)

[99013, 105976, 96166, 111620, 103905, 98823, 109045, 113472, 98009, 115286]


(['Spicy Chipotle Mayonnaise Dip',
  'Tikka Mayonnaise Dip',
  'Eggless Mayonnaise Dip',
  'Whole Grain Mustard Sauce',
  'Minty Mayonnaise Dip',
  'Alphonso Mango Conserve With No Added Sugar',
  'Arrabbiata Pasta Sauce With Black Olives',
  'Momo Hot Sauce',
  'Jamun Conserve With No Added Sugar',
  'Orange Marmalade With No Added Sugar'],
 ['Thousand Island Eggless Dressing'])

# Popular Products

In [38]:
import random
def popular_products(sitename, top_n=5, seed=None):
    """Return the ids of the ``top_n`` highest-rated products.

    NOTE(review): the ``product_rating`` column loaded from the data is
    overwritten with random integers in 0..5 before ranking, so the result is
    effectively a random sample. This looks like a placeholder until real
    ratings are available - confirm.

    Parameters
    ----------
    sitename : dataset directory passed on to ``product_df``.
    top_n : how many product ids to return (default 5).
    seed : optional seed for the simulated ratings. With the default ``None``
        the module-level ``random`` state is used, so results vary per call.
    """
    df2 = product_df(sitename)

    # Simulated ratings: one draw per row, assigned in a single step rather
    # than row by row through label-based `.at`.
    rng = random if seed is None else random.Random(seed)
    df2['product_rating'] = [rng.randint(0, 5) for _ in range(len(df2))]

    popular_pro = df2.sort_values(by='product_rating', ascending=False)
    return popular_pro['product_id'].tolist()[:top_n]

In [39]:
# Show the popular-product ids for the site1 dataset.
popular_products('dataset/site1')

[108374, 104429, 104395, 99341, 112396]

# On Sale Products

In [40]:
def on_sale(sitename, top_n=5):
    """Rank products by discount percentage.

    ``sale_percentage`` is recomputed as
    ``100 - selling_price / regular_price * 100``. Rows whose
    ``selling_price`` is 0 get 0. Infinite or missing results (zero or
    missing ``regular_price``) are also reported as 0, matching how
    ``product_df`` treats them.

    Parameters
    ----------
    sitename : dataset directory passed on to ``product_df``.
    top_n : how many product ids to return (default 5).

    Returns
    -------
    (ids, df) : the ``top_n`` most discounted product ids as plain Python
    values, and the full DataFrame sorted by ``sale_percentage`` descending
    with a fresh 0..n-1 index.
    """
    df = product_df(sitename)

    selling = df['selling_price']
    regular = df['regular_price']

    # Column-wise computation replaces the per-row loop.
    percent_off = 100 - (selling / regular) * 100
    percent_off = percent_off.where(selling != 0.0, 0)
    df['sale_percentage'] = percent_off.replace({np.inf: 0, -np.inf: 0}).fillna(0)

    df = df.sort_values(by='sale_percentage', ascending=False)
    df.reset_index(drop=True, inplace=True)

    return df['product_id'].iloc[:top_n].tolist(), df


In [41]:
# on_sale returns a (product ids, sorted DataFrame) tuple; element 0 is the
# list of ids. NOTE: `df` here is that tuple, not a DataFrame.
df=on_sale('dataset/site1')
df[0]

[111612, 107997, 108351, 105498, 100180]

In [42]:
def check_percentage(sitename):
    """Return the sale percentages of the products reported by ``on_sale``.

    For each returned product id, the distinct ``sale_percentage`` values of
    its rows are collected; the result is one flat list in id order.
    """
    top_ids, sale_df = on_sale(sitename)

    percentages = []
    for pid in top_ids:
        matches = sale_df[sale_df["product_id"] == pid]
        percentages.extend(matches["sale_percentage"].unique())
    return percentages

In [43]:
# Show the discount percentages behind the on-sale product ids for site1.
check_percentage('dataset/site1')

[82.5062656641604, 81.203007518797, 80.98271155595997, 80.49979123173279, 80.0]

# User Cart

In [44]:
def user_cart_reco(uid,sitename):
    """Recommend products similar to those in a user's cart.

    The user's ``cart_history`` is split on whitespace into product ids, the
    ids are printed, and ``product_similarity`` is called for each one.

    Parameters
    ----------
    uid : value of ``user_id`` identifying the user row.
    sitename : dataset directory passed on to ``userdf`` and
        ``product_similarity``.

    Returns
    -------
    set of recommended product ids (duplicates across cart items collapsed).
    NOTE(review): products already in the cart are not filtered out of the
    result - confirm whether that is intended.
    """
    user_cart, _, _ = userdf(sitename)

    cart_ids = user_cart.loc[uid]['cart_history'].split()
    print(f'Product ids in user cart: {cart_ids}')

    # The previous version also converted every column of the user row to
    # ints into an unused list, which could raise for no benefit.
    recommendations = set()
    for p_id in cart_ids:
        recommendations.update(product_similarity(int(p_id), sitename))

    return recommendations

# re=user_cart_reco(4,'site1')
# re

In [45]:
# Example: cart-based recommendations for user 4 in the site1 dataset.
customer_id = 4
user = 'dataset/site1'  # NOTE: despite its name, this holds the dataset directory
user_cart_reco(customer_id, user)

Product ids in user cart: ['115650', '104865', '96583', '111856', '103815']


{96560,
 99158,
 99297,
 100170,
 100572,
 100634,
 101088,
 102600,
 103038,
 103143,
 103815,
 104637,
 105117,
 105418,
 105453,
 106530,
 106622,
 107429,
 107748,
 109270,
 109660,
 109911,
 110001,
 110122,
 110941,
 111113,
 111856,
 112631,
 112660,
 112872,
 113209,
 114260,
 114966,
 115650,
 115677,
 116014,
 117691,
 118424,
 118534}

# User Purchased

In [46]:
def user_purchase_reco(uid,sitename):
    """Recommend products similar to those a user has purchased.

    The user's ``user_purchase_history`` is split on whitespace into product
    ids, the ids are printed, and the ``product_similarity`` results for each
    id are merged. Returns the distinct recommended ids as a list.
    """
    _, purchases, _ = userdf(sitename)

    purchased_ids = purchases.loc[uid]['user_purchase_history'].split()
    print(f"User's Purchased product ids: {purchased_ids}")

    # Flatten the per-product recommendations, then de-duplicate.
    flat = [
        rec
        for p_id in purchased_ids
        for rec in product_similarity(int(p_id), sitename)
    ]
    return list(set(flat))

# re=user_purchase_reco(3,'site1')
# re

In [47]:
def name(pids,user):
    """Return the distinct product names for the given product ids.

    Parameters
    ----------
    pids : iterable of product ids.
    user : dataset directory containing ``productdata.csv`` (despite the
        parameter name, this is a path, not a user).

    Returns
    -------
    list of product names, without duplicates, in first-seen order following
    the order of ``pids``. Ids that match no row are skipped.
    """
    path = user + '/productdata.csv'
    df = pd.read_csv(path)

    # Accept files that still use the raw "id" column name, mirroring the
    # rename performed in product_df.
    if "product_id" not in df.columns:
        df = df.rename(columns={"id": "product_id"})

    found = []
    for pid in pids:
        # Boolean-mask lookup instead of feeding index labels into .iloc.
        found.extend(df.loc[df["product_id"] == pid, "product_name"].tolist())

    # dict.fromkeys de-duplicates while keeping a deterministic order; a
    # plain set of strings is ordered differently from run to run.
    return list(dict.fromkeys(found))

In [48]:
# Example: purchase-based recommendations for user 4 in the site1 dataset.
customer_id = 4
user = 'dataset/site1'  # NOTE: despite its name, this holds the dataset directory
pids = user_purchase_reco(customer_id, user)
print(pids)

User's Purchased product ids: ['106391', '118899']
[104193, 96650, 98964, 97558, 111383, 112665, 115230, 114462, 97440, 108961, 98597, 105276, 97223, 114013, 102368, 117990, 115046, 102896, 109175, 104185]


In [49]:
# Resolve the recommended ids to product names (relies on `pids` and `user`
# from the previous cell).
name(pids,user)

['Whole Red Chilly Pickle',
 'Green Stuffed Hot Pepper Pickle',
 'Green Chilli Pickle',
 'Tender Mango Pickle',
 'Ginger Pickle',
 'Grated Mango Pickle',
 'Pickle - Mango Ginger (Without Garlic)',
 'Grandma Mango Pickle',
 'Caper Berries Pickle',
 'Amla Marmalade',
 'Rajasthani Lehsun Chutney',
 'Clammy Berry Pickle',
 'Amla Spicy Pickle',
 'Garlic Ganhiali Pickle',
 'Garlic Chutney',
 'Fenugreek Pickle',
 'Pickle - Kalyana Mango',
 'Jackfruit Pickle',
 'Green Hot Pepper Pickle',
 'Mango Pickle']

# User View

In [50]:
def user_view_reco(uid,sitename):
    """Recommend products similar to those a user has viewed.

    The user's ``user_viewed_history`` is split on whitespace into product
    ids, the ids are printed, and the ``product_similarity`` results for each
    id are merged. Returns the distinct recommended ids as a list.
    """
    _, _, views = userdf(sitename)

    viewed_ids = views.loc[uid]['user_viewed_history'].split()
    print(f"User's View product ids: {viewed_ids}")

    # Flatten the per-product recommendations, then de-duplicate.
    flat = [
        rec
        for p_id in viewed_ids
        for rec in product_similarity(int(p_id), sitename)
    ]
    return list(set(flat))

# re=user_view_reco(3,'site1')
# re

In [51]:
# Example: view-based recommendations for user 4 in the site1 dataset.
customer_id = 4
user = 'dataset/site1'  # NOTE: despite its name, this holds the dataset directory
view_pids = user_view_reco(customer_id, user)
view_pids

User's View product ids: ['95819', '95781', '95821', '95852', '95846']


[118272,
 108290,
 102275,
 112516,
 118786,
 108806,
 98697,
 98320,
 97937,
 108179,
 107415,
 96279,
 102170,
 100769,
 115494,
 105647,
 110771,
 116154,
 118332,
 103999,
 103744,
 109636,
 98116,
 95942,
 100810,
 105546,
 110540,
 118094,
 109008,
 118227,
 101461,
 105174,
 102614,
 116823,
 115163,
 112987,
 107102,
 119135,
 101983,
 107107,
 113252,
 102371,
 109927,
 115177,
 106858,
 108147,
 96887,
 110335]

In [52]:
# Resolve the view-based recommendations to product names (relies on
# `view_pids` and `user` from the previous cell).
name(view_pids,user)

['Nourishing Face Wash',
 'Color Naturals Creme Riche Ultra Hair Color - Raspberry Red',
 'Rose & Honey Facial Kit - Honey Lust',
 'Hair Oil - Amla',
 'Shower Gel - Soft & Fresh',
 'Deodorant Body Spray - Magnetism for Men',
 'Purple Body Spray',
 'Homme Gold Body Spray',
 'Color Naturals Creme Riche Sachet',
 'Vaporisateur Natural Spray - Hot Red Box',
 'Code Vaporisateur Natural Spray for Men',
 'Hand Wash - Pure & Gentle',
 'Face Wash - Oil Clear Glow',
 'Multani Mitti Face Pack Powder',
 'Face Wash Oily Skin & Face Scrub De-Tan',
 'Perfume Body Spray - Storm',
 'Pure & Gentle Bathing Bar',
 'Bodywash Moisturising 98% Pure Glycerine',
 'Classic All Day Cream Intensive Care and Protection',
 'Beauty Moisture Face Wash',
 'Deodorant - Storm',
 'Neemwash Neem & Clove Ultra-Purifying Face Wash With Active Neem Slices',
 'Face Wash - Fresh Renewal',
 'Classic Deodorant Spray for Men',
 'Palette Intensive Colour Cream Long Lasting Intensity 5-68 Medium Chestnut',
 'Perfume - Ultra Sensual