In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

%matplotlib inline

In [2]:
df = pd.read_csv('Data/Online_Retail.csv',parse_dates=['InvoiceDate'])

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.groupby('Description')['Quantity'].sum().sort_values(ascending=False).nlargest(10)

Description
WORLD WAR 2 GLIDERS ASSTD DESIGNS     53847
JUMBO BAG RED RETROSPOT               47363
ASSORTED COLOUR BIRD ORNAMENT         36381
POPCORN HOLDER                        36334
PACK OF 72 RETROSPOT CAKE CASES       36039
WHITE HANGING HEART T-LIGHT HOLDER    35317
RABBIT NIGHT LIGHT                    30680
MINI PAINT SET VINTAGE                26437
PACK OF 12 LONDON TISSUES             26315
PACK OF 60 PINK PAISLEY CAKE CASES    24753
Name: Quantity, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
df.shape

(541909, 8)

## Duplicated Values
Not dropping duplicated values as the Customer might separate their purchases on the same items.

In [7]:
df.duplicated().sum()

5268

## Null Values


In [8]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [9]:
df.dropna(subset=['CustomerID'], inplace=True)

In [10]:
df.shape

(406829, 8)

In [11]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

## InvoiceNo
Did an investigation to the InvoiceNo to see if there's a need to convert to int.

In [12]:
# Check the InvoiceNo for non-number
df[df['InvoiceNo'].str.contains(r'\D')].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom


## Quantity
The quantity sold contains negative numbers which might be due to human error. An absolute is applied to the quantity column to change the negative value to positive

In [13]:
df['Quantity'] = df['Quantity'].abs()

In [14]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Customer ID
Convert the customer ID to a whole number.

In [15]:
df.CustomerID = df.CustomerID.astype(int)

## Rename Columns
Renaming column Description to Item

In [16]:
df.rename(columns={'Description':'Item'},inplace=True)

In [17]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Item,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


## Create Total Spent

In [18]:
df['Total Spent'] = df['Quantity'] * df['UnitPrice']

In [19]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Item,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total Spent
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


## Item-Based Collaborative Filtering
- The Item will be the index
- The CustomerID will be the column
- The UnitPrice will be the value

In [20]:
df.shape

(406829, 9)

In [21]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Item,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total Spent
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [22]:
# Using Pivot Table
pivot = pd.pivot_table(df,index='Item',columns='CustomerID',values='UnitPrice')
pivot.head()

CustomerID,12346,12347,12348,12349,12350,12352,12353,12354,12355,12356,...,18273,18274,18276,18277,18278,18280,18281,18282,18283,18287
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4 PURPLE FLOCK DINNER CANDLES,,,,,,,,,,,...,,,,,,,,,,
50'S CHRISTMAS GIFT BAG LARGE,,,,,,,,,,,...,,,,,,,,,,
DOLLY GIRL BEAKER,,,,,,,,,,,...,,,,,,,,,,
I LOVE LONDON MINI BACKPACK,,,,,,,,,,,...,,,,,,,,,,
I LOVE LONDON MINI RUCKSACK,,,,,,,,,,,...,,,,,,,,,,


In [23]:
pivot.shape

(3896, 4372)

In [24]:
pivot.index.value_counts().nlargest(10)

SMALL HAMMERED SILVER CANDLEPOT       1
PAPER CHAIN KIT EMPIRE                1
SET OF 10 LED DOLLY LIGHTS            1
GREEN SWEETHEART BRACELET             1
INFLATABLE POLITICAL GLOBE            1
HOME GARLAND PAINTED ZINC             1
PINK ACRYLIC JEWEL SNOWFLAKE          1
BULL DOG BOTTLE OPENER                1
GLASS JAR DIGESTIVE BISCUITS          1
PACK OF 60 PINK PAISLEY CAKE CASES    1
Name: Item, dtype: int64

## Create Sparse Matrix

In [25]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print(sparse_pivot)

  (0, 462)	2.55
  (0, 465)	2.55
  (0, 476)	2.55
  (0, 1192)	0.79
  (0, 1282)	0.79
  (0, 1292)	2.55
  (0, 1345)	2.55
  (0, 1762)	2.55
  (0, 1958)	2.55
  (0, 2033)	2.55
  (0, 2306)	2.55
  (0, 2359)	2.55
  (0, 2360)	2.55
  (0, 2424)	0.79
  (0, 2612)	2.55
  (0, 2646)	2.55
  (0, 2702)	2.55
  (0, 2914)	2.55
  (0, 3272)	2.55
  (0, 3279)	0.79
  (0, 3493)	2.55
  (0, 3636)	2.55
  (0, 3656)	1.67
  (0, 3675)	2.55
  (0, 4000)	2.55
  :	:
  (3894, 1752)	10.4
  (3894, 2725)	10.4
  (3894, 2821)	3.95
  (3894, 3427)	10.4
  (3894, 3757)	3.95
  (3894, 3900)	10.4
  (3894, 3950)	10.4
  (3895, 515)	3.75
  (3895, 657)	3.75
  (3895, 802)	3.75
  (3895, 905)	3.75
  (3895, 1295)	3.29
  (3895, 1447)	3.29
  (3895, 1751)	3.75
  (3895, 1895)	1.25
  (3895, 2291)	3.75
  (3895, 2313)	3.75
  (3895, 2520)	3.75
  (3895, 3252)	3.75
  (3895, 3639)	3.75
  (3895, 3818)	3.75
  (3895, 3950)	3.75
  (3895, 4024)	3.75
  (3895, 4037)	1.25
  (3895, 4042)	3.75


## Cosine Similarity

In [26]:
recommender = pairwise_distances(sparse_pivot,metric='cosine')

In [27]:
recommender.shape

(3896, 3896)

## Create Distance DataFrame

In [28]:
recommender_df = pd.DataFrame(recommender,columns=pivot.index,index=pivot.index)
recommender_df.head()

Item,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4 PURPLE FLOCK DINNER CANDLES,0.0,1.0,0.96148,0.974602,1.0,1.0,0.990469,1.0,0.967195,0.983386,...,1.0,0.946308,0.953761,0.965995,1.0,0.971305,1.0,0.945861,1.0,0.951356
50'S CHRISTMAS GIFT BAG LARGE,1.0,0.0,0.941795,0.961623,1.0,0.963494,0.93802,0.669694,0.968938,0.914613,...,1.0,0.964377,0.953421,0.965746,1.0,0.906057,1.0,0.967045,1.0,0.991833
DOLLY GIRL BEAKER,0.96148,0.941795,0.0,0.858277,0.91112,0.962571,0.952339,0.933795,0.939025,0.801249,...,1.0,0.972273,0.952243,0.91447,1.0,0.905362,1.0,0.968282,0.985544,0.941386
I LOVE LONDON MINI BACKPACK,0.974602,0.961623,0.858277,0.0,0.881161,0.950642,0.9581,0.956348,0.873457,0.88867,...,1.0,0.951835,0.952767,0.968143,1.0,0.96303,1.0,0.946918,1.0,0.933747
I LOVE LONDON MINI RUCKSACK,1.0,1.0,0.91112,0.881161,0.0,1.0,1.0,1.0,1.0,0.913972,...,1.0,1.0,1.0,0.926788,1.0,0.935554,1.0,0.928255,1.0,1.0


## Evaluate recommender performance

In [29]:
search = "CHILDREN'S APRON DOLLY GIRL"

price = (df.loc[df['Item'].str.contains(search),'Item']).unique()

for item in price:
    print(item)
    print('Average Price: ', pivot.loc[item,:].mean())
    print('Number of Item: ', pivot.loc[item,:].count())
    
    print('')
    print('10 recommended items: ')
    recommended_item = recommender_df[item].sort_values()[1:11]
    print(recommended_item)
    print('')
    print('***********************')

CHILDREN'S APRON DOLLY GIRL 
Average Price:  2.0985863874345605
Number of Item:  191

10 recommended items: 
Item
CHILDRENS APRON SPACEBOY DESIGN    0.452217
LUNCH BAG DOLLY GIRL DESIGN        0.626852
RETROSPOT CHILDRENS APRON          0.634272
PINK FAIRY CAKE CHILDRENS APRON    0.642912
DOLLY GIRL LUNCH BOX               0.651518
SPACEBOY LUNCH BOX                 0.670644
CHILDRENS APRON APPLES DESIGN      0.674141
LUNCH BAG SPACEBOY DESIGN          0.685596
DOLLY GIRL MINI BACKPACK           0.686651
CHARLOTTE BAG DOLLY GIRL DESIGN    0.702261
Name: CHILDREN'S APRON DOLLY GIRL , dtype: float64

***********************


## User-Based Collaborative Filtering
- The CustomerID will be the index
- The Item will be the column
- The Quantity will be the value

Usually wholeseller will have problem deciding how much quantity they should be getting for each of the items. Hence we will recommend the user based on the quantity bought by other similar users as a reference.

In [30]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Item,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total Spent
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [48]:
# Using pivot table
id_pivot = pd.pivot_table(df,index='CustomerID',columns='Item',values='Quantity')
id_pivot.head()

Item,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,,,,,,,,,,,...,,,,,,,,,,
12347,,,,,,,,,,,...,,,,,,,,,,
12348,,,,,,,,,,,...,,,,,,,,,,
12349,,,,,,,,,,,...,,,,,,,,,,
12350,,,,,,,,,,,...,,,,,,,,,,


In [49]:
# Convert to sparse matrix
id_sparse = sparse.csr_matrix(id_pivot.fillna(0))
print(id_sparse)

  (0, 2002)	74215.0
  (1, 70)	25.2
  (1, 72)	12.0
  (1, 73)	12.0
  (1, 109)	24.0
  (1, 113)	24.0
  (1, 136)	4.666666666666667
  (1, 137)	5.333333333333333
  (1, 138)	4.0
  (1, 139)	8.0
  (1, 140)	4.0
  (1, 142)	5.0
  (1, 143)	5.333333333333333
  (1, 145)	4.0
  (1, 146)	5.333333333333333
  (1, 147)	7.2
  (1, 286)	12.0
  (1, 313)	6.0
  (1, 324)	12.0
  (1, 344)	9.0
  (1, 357)	12.0
  (1, 366)	30.0
  (1, 402)	6.0
  (1, 423)	12.0
  (1, 452)	24.0
  :	:
  (4371, 2464)	12.0
  (4371, 2480)	4.0
  (4371, 2486)	6.0
  (4371, 2916)	40.0
  (4371, 2917)	24.0
  (4371, 2977)	4.0
  (4371, 2997)	36.0
  (4371, 3027)	24.0
  (4371, 3068)	24.0
  (4371, 3095)	6.0
  (4371, 3143)	48.0
  (4371, 3145)	18.0
  (4371, 3154)	48.0
  (4371, 3159)	24.0
  (4371, 3163)	30.0
  (4371, 3166)	24.0
  (4371, 3207)	24.0
  (4371, 3284)	12.0
  (4371, 3285)	12.0
  (4371, 3301)	12.0
  (4371, 3354)	12.0
  (4371, 3368)	30.0
  (4371, 3369)	12.0
  (4371, 3410)	24.0
  (4371, 3501)	12.0


In [50]:
# Cosine similarity
recommender_id = pairwise_distances(id_sparse,metric='cosine')
recommender_id.shape

(4372, 4372)

In [51]:
# Create DataFrame
id_df = pd.DataFrame(recommender_id, index=id_pivot.index, columns=id_pivot.index)
id_df.head()

CustomerID,12346,12347,12348,12349,12350,12352,12353,12354,12355,12356,...,18273,18274,18276,18277,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12347,1.0,0.0,0.831705,0.985745,0.979653,0.987955,1.0,0.99356,0.289283,0.925408,...,1.0,0.999573,0.580469,0.999561,0.995734,0.988565,1.0,0.987953,0.964484,0.890869
12348,1.0,0.831705,0.0,0.999946,0.9999,0.999889,1.0,0.986497,0.751867,0.840682,...,1.0,0.95872,0.860572,1.0,1.0,1.0,1.0,1.0,0.791609,0.928112
12349,1.0,0.985745,0.999946,0.0,0.969879,0.922464,1.0,0.995069,0.99982,0.824997,...,1.0,0.851934,1.0,0.999656,0.98432,1.0,1.0,0.979033,0.931088,0.991978
12350,1.0,0.979653,0.9999,0.969879,0.0,0.999569,1.0,1.0,1.0,0.998627,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.972009,1.0


## Recommendation

In [52]:
id_pivot.head()

Item,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,,,,,,,,,,,...,,,,,,,,,,
12347,,,,,,,,,,,...,,,,,,,,,,
12348,,,,,,,,,,,...,,,,,,,,,,
12349,,,,,,,,,,,...,,,,,,,,,,
12350,,,,,,,,,,,...,,,,,,,,,,


In [53]:
# Get 12348 similarity scores
sim_12348 = id_df[12348].drop(12348)
sim_12348

CustomerID
12346    1.000000
12347    0.831705
12349    0.999946
12350    0.999900
12352    0.999889
           ...   
18280    1.000000
18281    1.000000
18282    1.000000
18283    0.791609
18287    0.928112
Name: 12348, Length: 4371, dtype: float64

In [54]:
# Filter the top similarity
sim_12348 = sim_12348[sim_12348<0.6]
sim_12348

CustomerID
14891    0.529864
15232    0.598594
17940    0.522354
Name: 12348, dtype: float64

In [55]:
# 3. Turn 12348's similarity scores to weights as a %
weights_12348 = sim_12348.values/np.sum(sim_12348.values)
weights_12348

array([0.32097178, 0.36260586, 0.31642236])

In [70]:
# 4. Get DOLLY GIRL BEAKER quantity
dollygb_quantity = id_pivot[' DOLLY GIRL BEAKER'].drop(12348).loc[sim_12348.index]
dollygb_quantity

CustomerID
14891   NaN
15232   NaN
17940   NaN
Name:  DOLLY GIRL BEAKER, dtype: float64

## Content-based filtering

In [None]:
# when do ppl buy product
# 