### Import Libraries 

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns

import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

import warnings
warnings.filterwarnings('ignore')

### Load the Dataset

In [17]:
data = products = pd.read_csv('Datasets/flipkart_com-ecommerce_sample.csv', usecols=[ 'uniq_id', 'product_name', 'pid', 'description'], encoding='utf8')
data.columns = ['unique_id', 'product_name', 'product_id', 'description']
data.head()

Unnamed: 0,unique_id,product_name,product_id,description
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,SRTEH2FF9KEDEFGF,Key Features of Alisha Solid Women's Cycling S...
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,SHOEH4GRSUBJGZXE,Key Features of AW Bellies Sandals Wedges Heel...
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,SRTEH2F6HUZMQ6SJ,Key Features of Alisha Solid Women's Cycling S...
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,PSOEH3ZYDMSYARJ5,Specifications of Sicons All Purpose Arnica Do...


### Data Statistics

##### Handling Missing Data

In [18]:
print('Number of missing values across columns: \n', data.isnull().sum())

data['description'] = data['description'].fillna('')
print('\nNumber of missing values across columns: \n', data.isnull().sum())

Number of missing values across columns: 
 unique_id       0
product_name    0
product_id      0
description     2
dtype: int64

Number of missing values across columns: 
 unique_id       0
product_name    0
product_id      0
description     0
dtype: int64


##### Truncating Data for training on local machine

In [19]:
data = data.iloc[:5000]

### Statistics

In [20]:
print(f"Total no of unique product names: {len(data['product_name'].unique())}")
print(f"Total No of unique product ids: {len(data['unique_id'].unique())}")

Total no of unique product names: 3396
Total No of unique product ids: 5000


In [21]:
print(f'Columns: {data.shape[0]} \nRows: {data.shape[1]}')

Columns: 5000 
Rows: 4


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     5000 non-null   object
 1   product_name  5000 non-null   object
 2   product_id    5000 non-null   object
 3   description   5000 non-null   object
dtypes: object(4)
memory usage: 156.4+ KB


In [23]:
data.head()

Unnamed: 0,unique_id,product_name,product_id,description
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,SRTEH2FF9KEDEFGF,Key Features of Alisha Solid Women's Cycling S...
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,SHOEH4GRSUBJGZXE,Key Features of AW Bellies Sandals Wedges Heel...
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,SRTEH2F6HUZMQ6SJ,Key Features of Alisha Solid Women's Cycling S...
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,PSOEH3ZYDMSYARJ5,Specifications of Sicons All Purpose Arnica Do...


### Content-Based Recommendation System

##### Creating A TF-IDF Vector of tokens in the product descriptions

In [24]:
tfv = TfidfVectorizer(max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     min_df=10,
                     token_pattern=r'\w{1,}',
                     ngram_range=(1,3),
                     stop_words='english')
tfidf_matrix = tfv.fit_transform(data['description'])
tfidf_matrix

<5000x6377 sparse matrix of type '<class 'numpy.float64'>'
	with 343378 stored elements in Compressed Sparse Row format>

In [25]:
tfidf_matrix.shape

(5000, 6377)

##### Transforming the vector into a decision model using Sigmoid

In [26]:
sig = sigmoid_kernel(tfidf_matrix,tfidf_matrix)
sig[0]

array([0.76166001, 0.76159583, 0.76159833, ..., 0.76159416, 0.76159416,
       0.76159416])

##### Creating an index of products

In [27]:
indices = pd.Series(data.index,index=data['product_name']).drop_duplicates()
indices.head()

product_name
Alisha Solid Women's Cycling Shorts      0
FabHomeDecor Fabric Double Sofa Bed      1
AW Bellies                               2
Alisha Solid Women's Cycling Shorts      3
Sicons All Purpose Arnica Dog Shampoo    4
dtype: int64

##### Recommendation Function

In [28]:
def product_recommendation(title,sig=sig):
    indx = indices[title]
    
    #getting pairwise similarity scores
    sig_scores = list(enumerate(sig[indx]))
    
    #sorting products
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    
    #10 most similar products score
    sig_scores = sig_scores[1:11]
    
    #product indexes
    product_indices = [i[0] for i in sig_scores]
    
    return data.iloc[product_indices]

##### Demonstration

In [29]:
product_recommendation("Oye Boy's Dungaree")

Unnamed: 0,unique_id,product_name,product_id,description
3180,4ef1dd0a04aec0f93e776cd9d55c3959,benext Regular Fit Boy's Red Trousers,TROEHYUVAJ6NBBAE,Specifications of benext Regular Fit Boy's Red...
3143,7befb5ea4ef6ad9574c4eb3363d192e8,LUMBER BOY Printed Boy's Hooded Reversible Gre...,TSHEG57GACSGRQGM,Specifications of LUMBER BOY Printed Boy's Hoo...
3112,e5bfe986e0c734f596009871b48a4673,YK Solid Boy's Round Neck Dark Blue T-Shirt,TSHEGEXDHEZVCAMT,Specifications of YK Solid Boy's Round Neck Da...
352,9d011a8574d1c477659be03fafa6bd25,Nine Maternity Wear Women's Fit and Flare Dress,DREEEYGRYGXYB9BU,Key Features of Nine Maternity Wear Women's Fi...
3118,606d65f980cb8724569f3f7b74747d43,VRTYA Casual Sleeveless Solid Women's Blue Top,TOPEHZMKMYHYGQ9S,Specifications of VRTYA Casual Sleeveless Soli...
3184,69fee308e78fbaf41b2f4c468c24f042,"LUMBER BOY Self Design Boy's Polo Neck Blue, R...",TSHEG57TG27P9EZM,Specifications of LUMBER BOY Self Design Boy's...
3131,438c3e7b293c01a07d787db0eaff9efa,Lilliput Regular Fit Boy's Brown Trousers,TROEHGFYZTGCAUM9,Specifications of Lilliput Regular Fit Boy's B...
3139,5d37093da0d61073db682d36cac53128,"LUMBER BOY Self Design Boy's Polo Neck Red, Wh...",TSHEG57QP6AKHGDK,Specifications of LUMBER BOY Self Design Boy's...
3182,7308f94c231023ac2897a7157c72c990,VRTYA Casual Sleeveless Solid Women's Red Top,TOPEHZMHJHEMRDHY,Specifications of VRTYA Casual Sleeveless Soli...
3111,2c28ce18e078b4c6b724452494506a3a,BIKER BOYS Printed Boy's Round Neck Blue T-Shirt,TSHEGCZTUAZAYE7G,Specifications of BIKER BOYS Printed Boy's Rou...


#### Saving Correlation Matrix

In [30]:
joblib.dump(tfidf_matrix, 'Outputs/Content Based TFIDV Matrix.csv')
joblib.dump(sig, 'Outputs/Content Based SIG.csv')
joblib.dump(indices, 'Outputs/Content Based Product Indices.csv')

['Outputs/Content Based Product Indices.csv']