#### Loading Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Reading data

In [2]:
df = pd.read_csv("D:/Minor Project/data.csv", encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

In [5]:
df.shape

(541909, 8)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [7]:
df.dropna(subset=['Description'],how='all',inplace=True)

In [8]:
df.Description[0]

'WHITE HANGING HEART T-LIGHT HOLDER'

## <font color='grey'> Keyword Matching to select top products using similarities in description </font>

In [9]:
tfv = TfidfVectorizer(max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     min_df=10,
                     token_pattern=r'\w{1,}',
                     ngram_range=(1,3), #take the combination of 1-3 different kind of words
                     stop_words='english') #removes all the unnecessary characters like the,in etc.
df['Description'] = df['Description'].fillna('')

In [10]:
products = df.groupby(['Description'], as_index=False)['InvoiceDate'].max()

In [11]:
products

Unnamed: 0,Description,InvoiceDate
0,4 PURPLE FLOCK DINNER CANDLES,9/7/2011 14:09
1,50'S CHRISTMAS GIFT BAG LARGE,9/8/2011 8:56
2,DOLLY GIRL BEAKER,9/9/2011 15:09
3,I LOVE LONDON MINI BACKPACK,9/30/2011 16:42
4,I LOVE LONDON MINI RUCKSACK,10/6/2011 9:11
...,...,...
4218,wrongly marked carton 22804,11/10/2011 18:20
4219,wrongly marked. 23343 in box,7/14/2011 14:27
4220,wrongly sold (22719) barcode,3/11/2011 16:25
4221,wrongly sold as sets,3/8/2011 17:23


In [12]:
products

Unnamed: 0,Description,InvoiceDate
0,4 PURPLE FLOCK DINNER CANDLES,9/7/2011 14:09
1,50'S CHRISTMAS GIFT BAG LARGE,9/8/2011 8:56
2,DOLLY GIRL BEAKER,9/9/2011 15:09
3,I LOVE LONDON MINI BACKPACK,9/30/2011 16:42
4,I LOVE LONDON MINI RUCKSACK,10/6/2011 9:11
...,...,...
4218,wrongly marked carton 22804,11/10/2011 18:20
4219,wrongly marked. 23343 in box,7/14/2011 14:27
4220,wrongly sold (22719) barcode,3/11/2011 16:25
4221,wrongly sold as sets,3/8/2011 17:23


In [13]:
#fitting the description column.
tfv_matrix = tfv.fit_transform(products['Description']) #converting everythinng to sparse matrix.

In [14]:
tfv_matrix

<4223x497 sparse matrix of type '<class 'numpy.float64'>'
	with 15311 stored elements in Compressed Sparse Row format>

In [15]:
tfv_matrix.shape

(4223, 497)

In [16]:
sig = sigmoid_kernel(tfv_matrix,tfv_matrix) #how description of first product is related to first product and so on.

In [17]:
sig[0]

array([0.76243788, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159416])

In [18]:
indices = pd.Series(products.index,index=products['Description']).drop_duplicates()

In [20]:
    def keywords_matcher(title,sig=sig):
        indx = indices[title]

        #getting pairwise similarity scores
        sig_scores = list(enumerate(sig[indx]))

        #sorting products
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

        #10 most similar products score
        sig_scores = sig_scores[1:100]

        #product indexes
        product_indices = [i[0] for i in sig_scores]

        #Top 10 most similar products
        return products['Description'].iloc[product_indices]

In [40]:
list(keywords_matcher('4 BLUE DINNER CANDLES SILVER FLOCK', sig=sig).unique())

['4 IVORY DINNER CANDLES SILVER FLOCK',
 '4 SKY BLUE DINNER CANDLES',
 ' 4 PURPLE FLOCK DINNER CANDLES',
 '4 PINK DINNER CANDLE SILVER FLOCK',
 '4 BURGUNDY WINE DINNER CANDLES',
 '4 ROSE PINK DINNER CANDLES',
 '4 LILY  BOTANICAL DINNER CANDLES',
 '4 PEAR BOTANICAL DINNER CANDLES',
 '4 LAVENDER BOTANICAL DINNER CANDLES',
 '4 PINK FLOCK CHRISTMAS BALLS',
 'BLUE FLOCK GLASS CANDLEHOLDER',
 'PINK PILLAR CANDLE SILVER FLOCK',
 'S/4 CACTI CANDLES',
 '4 GOLD FLOCK CHRISTMAS BALLS',
 'IVORY PILLAR CANDLE SILVER FLOCK',
 '4 WILDFLOWER BOTANICAL CANDLES',
 'SET/4 BLUE FLOWER CANDLES IN BOWL',
 'SET/4 GARDEN ROSE DINNER CANDLE',
 'SILVER  CANDLEPOT JARDIN ',
 'SILVER FISHING GNOME ',
 'SILVER LARIAT 40CM',
 'SILVER ROCCOCO CHANDELIER',
 'SILVER STANDING GNOME   ',
 'SILVER TEDDY BEAR',
 'CUBIC MUG FLOCK BLUE ON BROWN',
 'TV DINNER TRAY AIR HOSTESS ',
 '4 VANILLA BOTANICAL CANDLES',
 'PINK FLOCK GLASS CANDLEHOLDER',
 'PINK CHRISTMAS FLOCK DROPLET ',
 'BLUE FLOCK CUSHION COVER ',
 'SET OF 4 ROSE BO

In [35]:
n=input("Enter the name of the product: ")
print("\nTop matched products are: \n")
print(keywords_matcher(n, sig=sig).unique())

Enter the name of the product:  4 PURPLE FLOCK DINNER CANDLES

Top matched products are: 

['4 BLUE DINNER CANDLES SILVER FLOCK'
 '4 IVORY DINNER CANDLES SILVER FLOCK' '4 BURGUNDY WINE DINNER CANDLES'
 '4 SKY BLUE DINNER CANDLES' '4 ROSE PINK DINNER CANDLES'
 '4 LILY  BOTANICAL DINNER CANDLES' '4 PEAR BOTANICAL DINNER CANDLES'
 '4 PINK DINNER CANDLE SILVER FLOCK' '4 LAVENDER BOTANICAL DINNER CANDLES'
 '4 PINK FLOCK CHRISTMAS BALLS' 'S/4 CACTI CANDLES'
 '4 GOLD FLOCK CHRISTMAS BALLS' 'PURPLE FRANGIPANI HAIRCLIP'
 '4 WILDFLOWER BOTANICAL CANDLES' 'SET/4 GARDEN ROSE DINNER CANDLE'
 'TV DINNER TRAY AIR HOSTESS ' '4 VANILLA BOTANICAL CANDLES'
 'PINK FLOCK GLASS CANDLEHOLDER' 'PINK CHRISTMAS FLOCK DROPLET '
 'BLUE FLOCK GLASS CANDLEHOLDER' 'PURPLE ANEMONE ARTIFICIAL FLOWER'
 'PURPLE FOXGLOVE ARTIIFCIAL FLOWER' 'UBO-LIGHT TRIOBASE PURPLE'
 'SET/4 BLUE FLOWER CANDLES IN BOWL' 'SET OF 4 ROSE BOTANICAL CANDLES'
 'METAL BASE FOR CANDLES' '3 GARDENIA MORRIS BOXED CANDLES'
 'PURPLE FRANGIPANI NECKL

In [34]:
# np.savetxt('CosinSimilarity.txt', sig)

In [41]:
df[df.Description.isin(list(keywords_matcher('4 BLUE DINNER CANDLES SILVER FLOCK', sig=sig).unique()))]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
192,536388,22969,HOMEMADE JAM SCENTED CANDLES,12,12/1/2010 9:59,1.45,16250.0,United Kingdom
233,536390,22969,HOMEMADE JAM SCENTED CANDLES,96,12/1/2010 10:19,1.25,17511.0,United Kingdom
317,536400,22969,HOMEMADE JAM SCENTED CANDLES,12,12/1/2010 10:53,1.45,13448.0,United Kingdom
329,536401,22473,TV DINNER TRAY VINTAGE PAISLEY,1,12/1/2010 11:21,4.95,15862.0,United Kingdom
336,536401,20820,SILVER LOOKING MIRROR,3,12/1/2010 11:21,4.95,15862.0,United Kingdom
...,...,...,...,...,...,...,...,...
541539,581498,85174,S/4 CACTI CANDLES,1,12/9/2011 10:26,10.79,,United Kingdom
541618,581516,21620,SET OF 4 ROSE BOTANICAL CANDLES,12,12/9/2011 11:26,1.25,14422.0,United Kingdom
541636,581538,21222,SET/4 BADGES BEETLES,1,12/9/2011 11:34,1.25,14446.0,United Kingdom
541637,581538,21220,SET/4 BADGES DOGS,1,12/9/2011 11:34,1.25,14446.0,United Kingdom


In [37]:
df[df.Description.isin(['WHITE HANGING HEART T-LIGHT HOLDER'])]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
49,536373,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 9:02,2.55,17850.0,United Kingdom
66,536375,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 9:32,2.55,17850.0,United Kingdom
220,536390,85123A,WHITE HANGING HEART T-LIGHT HOLDER,64,12/1/2010 10:19,2.55,17511.0,United Kingdom
262,536394,85123A,WHITE HANGING HEART T-LIGHT HOLDER,32,12/1/2010 10:39,2.55,13408.0,United Kingdom
...,...,...,...,...,...,...,...,...
537291,581246,85123A,WHITE HANGING HEART T-LIGHT HOLDER,1,12/8/2011 10:59,2.95,15453.0,United Kingdom
537326,581253,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2,12/8/2011 11:15,2.95,16891.0,United Kingdom
537852,581356,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/8/2011 12:28,2.95,17830.0,United Kingdom
539979,581452,85123A,WHITE HANGING HEART T-LIGHT HOLDER,32,12/8/2011 18:03,2.55,17675.0,United Kingdom
