## Import modules and packages

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, Data2VecTextModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

## Read dataset

In [None]:
# Load the sales table; the CSV path is relative to the notebook's working directory.
df = pd.read_csv('new_supermarket_sales_with_product_info.csv')

## Analyze Product

### Number of products

In [None]:
# Unique product names in order of first appearance. Series.unique() keeps a
# stable order, whereas list(set(...)) can change between kernel restarts
# because string hashing is randomized per Python process.
list_product = df['Item name'].unique().tolist()
len(list_product)

704

### About selling

In [None]:
# Number of rows in df for each item name, most frequent item first.
item_count = df['Item name'].value_counts()
item_count

Old Town Coffee White 3in1 Mocha 15sx35g                                                   22
Cheong Kwanjang By Korea Ginseng Corporation Korean Red Ginseng No Sugar Candy 1           17
Early Bird SALES! Up to 20% DISCOUNT FREE DELIVERY, Chinese New Year Hamper, cny hamper    12
[Honey New and manuka Health] Wheat the Card Aloe Honey MGO250  250g 蜂蜜                     9
Easiyo Yogurt Maker 1kg, Easy Way to Make Fresh Yogurt                                      8
                                                                                           ..
Practical  Bamboo Fiber Home Sofa Bedding Blanket Travel Plaid 180x200cm                    1
Asics Arthur onitsuka tiger classic Japanese shoes tiger series running shoes               1
Z69 4K UHD Smart TV BOX KODI Android 6.0 Marshmallow Amlogic                                1
#Womens Winter Long Sleeve 3D Tiger Animal Hooded Sweatshirt Hoodie Coat Jacket             1
PU Leather Embossing Chain Shoulder Bag                     

#### Top 5

In [None]:
# The five items with the highest counts in item_count
item_count.head(5)

Old Town Coffee White 3in1 Mocha 15sx35g                                                   22
Cheong Kwanjang By Korea Ginseng Corporation Korean Red Ginseng No Sugar Candy 1           17
Early Bird SALES! Up to 20% DISCOUNT FREE DELIVERY, Chinese New Year Hamper, cny hamper    12
[Honey New and manuka Health] Wheat the Card Aloe Honey MGO250  250g 蜂蜜                     9
Easiyo Yogurt Maker 1kg, Easy Way to Make Fresh Yogurt                                      8
Name: Item name, dtype: int64

$\to$ Top 5 products:


1. Old Town Coffee White 3in1 Mocha 15sx35g (22)
2. Cheong Kwanjang By Korea Ginseng Corporation Korean Red Ginseng No Sugar Candy 1 (17)
3. Early Bird SALES! Up to 20% DISCOUNT FREE DELIVERY, Chinese New Year Hamper, cny hamper  (12)
4. [Honey New and manuka Health] Wheat the Card Aloe Honey MGO250  250g 蜂蜜 (9)
5. Easiyo Yogurt Maker 1kg, Easy Way to Make Fresh Yogurt  (8)



#### Best seller

In [None]:
# Best seller: the single item with the highest count in item_count
item_count.head(1)

Old Town Coffee White 3in1 Mocha 15sx35g    22
Name: Item name, dtype: int64

$\to$ Best seller: **Old Town Coffee White 3in1 Mocha 15sx35g**

### Top 5 of each Product line

In [None]:
# Count the rows of each item within its product line, then keep the five
# most frequent items of every product line.
(
    df.groupby(['Product line'])['Item name']
    .value_counts()
    .groupby(level='Product line', group_keys=False)
    .head(5)
)

Product line            Item name                                                                              
Electronic accessories  KingSpec SATA III 3.0 2.5 256GB MLC Digital SSD Solid State Drive                           4
                        Wireless Laser Projection Keyboard                                                          4
                        8.5 LCD eWriter Tablet Writting Drawing Pad Memo Message Board Notepad Stylus               3
                        Ken Xin Da S7 1.54 inch Smartwatch Phone MTK6261 Bluetooth                                  3
                        Mifree Q8 Bluetooth 4.0 Smart Watch MTK2502C Heart Rate Moni                                3
Fashion accessories     Metal Universe Pattern Colour Splicing Crossbody Bag                                        4
                        Adidas NMD HumanRace women men running shoes                                                3
                        Top Quality 100% leather  Mens wallet 

### Create product profile

In [None]:
# Keep only the columns used to describe a product. The .copy() gives
# product_profile its own data, so adding columns later leaves df untouched.
product_profile = df.loc[
    :,
    ['Product line', 'Item name', 'Unit price', 'Quantity', 'Date', 'gross income', 'Item description'],
].copy()
product_profile

Unnamed: 0,Product line,Item name,Unit price,Quantity,Date,gross income,Item description
0,Health and beauty,Medicube Red Line Toner(100ml) & Serum(50ml)SE...,74.69,7,1/5/2019,26.1415,MEDICUBE REDLINE Balance Sebum Production Clin...
1,Electronic accessories,【HOT】iphone7 mobile phone shell Apple 7plus fr...,15.28,5,3/8/2019,3.8200,Protective sleeve texture: TPUStyle: protectiv...
2,Home and lifestyle,100% flannel four pieces Dualuse Blanket Bed c...,46.33,7,3/3/2019,16.2155,★ soft breathable★ comfortable easy pilling★ F...
3,Health and beauty,Philips AVENT Large Massage Cushion for Comfor...,58.22,8,1/27/2019,23.2880,Wider funnel means more comfortable pumping fo...
4,Sports and travel,LCD Digital Camera Video Scouting Outdoor HC30...,86.31,7,2/8/2019,30.2085,100% Brand New.Model: HC300MMega pixels: 12.0M...
...,...,...,...,...,...,...,...
995,Health and beauty,Wavy Hair Curling Iron Pro Styling Tool Spiral...,40.35,1,1/29/2019,2.0175,﻿﻿Salon Magic Hair Curling Iron Pro Styling To...
996,Home and lifestyle,Purified Air Mute Led Display Air purification...,97.38,10,3/2/2019,48.6900,Led Display2.2L water tankHigh qualityFashion ...
997,Food and beverages,Tokyo Banana Preorder now open!,31.84,1,2/9/2019,1.5920,PO closes on 27th Sep and stocks are coming in...
998,Home and lifestyle,Rubine Shower Mixer with Cover & Elbow Connect...,65.82,1,2/22/2019,3.2910,Stainless steel finishSimple installationEffor...


In [None]:
# Insert new column: day of the month of each row's 'Date'.
# NOTE(review): parsing relies on pandas inferring month-first dates
# (e.g. 1/27/2019) — confirm this holds for the whole column.
product_profile['Day'] = pd.to_datetime(df['Date']).dt.day

In [None]:
# Create dictionary of products: {item name: {profile field: value}}.
# Built from scratch rather than with dict.fromkeys(list_product, {}), which
# makes every key share one and the same placeholder dict object.
dict_product_profile = {}

# Group the sales rows once instead of re-filtering the whole frame for every
# field of every product.
sales_by_product = product_profile.groupby('Item name')

for p in list_product:
    sales = sales_by_product.get_group(p)
    units_sold = sales['Quantity'].sum()
    dict_product_profile[p] = {
        'Average price': sales['Unit price'].mean(),
        # Profit per unit = total gross income / total units sold. The previous
        # formula (mean gross income per sale / most frequent quantity) mixed
        # two unrelated statistics and did not yield a per-unit figure.
        'Average profit per unit': sales['gross income'].sum() / units_sold if units_sold else np.nan,
        # mode() lists the most frequent value(s) in ascending order; taking
        # the first one breaks ties deterministically.
        'Most quantity': sales['Quantity'].mode().iloc[0],
        'Most day': sales['Day'].mode().iloc[0],
        # First recorded value: deterministic, unlike list(set(...))[0].
        'Product line': sales['Product line'].iloc[0],
        'Description': sales['Item description'].iloc[0],
    }

In [None]:
# Create dataframe product profile: orient='index' makes each outer key
# (item name) a row and each profile field a column.
df_product_profile = pd.DataFrame.from_dict(dict_product_profile, orient='index')

In [None]:
# Preview the first rows of the product profile table
df_product_profile.head()

Unnamed: 0,Average price,Average profit per unit,Most quantity,Most day,Product line,Description
Detachable Hifi Circle Iron Inear Headset,94.88,18.634,1,8,Electronic accessories,Style: InEar Communication: Wired Connectors: ...
In stock NIKE ROSHERUN light running shoes couple shoes sports shoes,83.24,4.162,9,29,Fashion accessories,About us:Dolcoa mall is a professional selling...
"Magic Pop Rocks [ Grape, Cola, Strawberry, Orange, Lemon ]",14.55,1.261125,2,1,Food and beverages,40packs
Double Side Thick Warm Flannel Blanket (150x190cm) (Zebra)法蘭絨毯,47.38,2.369,4,23,Home and lifestyle,"More thick, dense, warm doubledQuickdrying and..."
BB Cushion_AntiAging SPF50 PA No. 23 Sand 15g*2,62.0,3.1,8,3,Health and beauty,Skin cooling effectLonglasting makeupPerfect c...


## Analyze Customer

#### Across all stores

In [None]:
# Number of invoices per customer, customers with the most invoices first
(
    df.groupby(['Customer ID'])[['Invoice ID']]
    .count()
    .rename(columns={'Invoice ID': 'Count Invoices'})
    .reset_index()
    .sort_values(by='Count Invoices', ascending=False)
)

Unnamed: 0,Customer ID,Count Invoices
494,CI-18-0911,8
45,CI-16-0214,6
61,CI-16-0300,6
183,CI-16-1000,6
220,CI-17-0187,5
...,...,...
245,CI-17-0352,1
244,CI-17-0349,1
237,CI-17-0318,1
235,CI-17-0266,1


$\to$ The customer who makes the most purchases is CI-18-0911. He/She made 8 purchases.

In [None]:
# Sum of 'Total' per customer, biggest spenders first
(
    df.groupby(['Customer ID'])[['Total']]
    .sum()
    .reset_index()
    .sort_values(by='Total', ascending=False)
)

Unnamed: 0,Customer ID,Total
79,CI-16-0406,2731.4805
76,CI-16-0397,2451.5505
494,CI-18-0911,2449.7025
125,CI-16-0633,2289.9870
61,CI-16-0300,2259.2325
...,...,...
49,CI-16-0236,20.6850
489,CI-18-0872,20.1075
194,CI-17-0046,19.2465
245,CI-17-0352,19.1940


$\to$ The top spender across all stores is CI-16-0406. He/She spent about 2731$

#### Each store

In [None]:
# For every city, count the rows of each customer and keep the customer with
# the highest count.
(
    df.groupby(['City'])['Customer ID']
    .value_counts()
    .groupby(level='City', group_keys=False)
    .head(1)
)

City       Customer ID
Mandalay   CI-16-0300     3
Naypyitaw  CI-17-0634     5
Yangon     CI-16-0601     3
Name: Customer ID, dtype: int64

$\to$ The customer who makes the most purchases in the store in Mandalay city is CI-16-0300. He/She made 3 purchases.

The customer who makes the most purchases in the store in Naypyitaw is CI-17-0634. He/She made 5 purchases.

The customer who makes the most purchases in the store in Yangon is CI-16-0601. He/She made 3 purchases.

In [None]:
# Sum 'Total' per (city, customer), sort by spend, then keep the first (highest)
# row of every city.
(
    df.groupby(['City', 'Customer ID'])[['Total']]
    .sum()
    .sort_values(by='Total', ascending=False)
    .groupby(level='City', group_keys=False)
    .head(1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
City,Customer ID,Unnamed: 2_level_1
Naypyitaw,CI-16-0406,2036.244
Mandalay,CI-18-0770,1675.989
Yangon,CI-17-0828,1589.868


$\to$ The top spender of store in Mandalay is CI-18-0770. She/He spent about 1676$

The top spender of store in Naypyitaw is CI-16-0406. He/She spent about 2036$

The top spender of store in Yangon is CI-17-0828. He/She spent about 1590$

## Other analysis

### Create matrix customer - product - rating

In [None]:
# Unique customers and items in order of first appearance. Series.unique()
# keeps a stable order across kernel restarts; list(set(...)) does not.
list_cus_IDs = df['Customer ID'].unique().tolist()
list_item_IDs = df['Item name'].unique().tolist()

# {customer ID: {item name: rating}}. Every customer gets their own dict,
# pre-filled with 0 for each item (0 stands for "no rating recorded").
dict_cus_item_rating = {cus_id: dict.fromkeys(list_item_IDs, 0) for cus_id in list_cus_IDs}

# Fill in the recorded ratings. Iterating the three columns directly avoids
# building a Series per row as iterrows() does. If a customer rated the same
# item more than once, the last row wins.
for cus_id, item_name, rating in zip(df['Customer ID'], df['Item name'], df['Rating']):
    dict_cus_item_rating[cus_id][item_name] = rating

In [None]:
# orient='index' turns each customer ID into a row and each item name into a column.
df_matrix_cus_item_rating = pd.DataFrame.from_dict(dict_cus_item_rating, orient='index')

In [None]:
# Preview the first rows of the customer x item rating matrix
df_matrix_cus_item_rating.head()

Unnamed: 0,Detachable Hifi Circle Iron Inear Headset,In stock NIKE ROSHERUN light running shoes couple shoes sports shoes,"Magic Pop Rocks [ Grape, Cola, Strawberry, Orange, Lemon ]",Double Side Thick Warm Flannel Blanket (150x190cm) (Zebra)法蘭絨毯,BB Cushion_AntiAging SPF50 PA No. 23 Sand 15g*2,Adidas male short sleeves,High TC Embroidery Red Luck Festival Wedding Bedding Sets,PgmallSimple Big Storage Stitch TwoInOne Bags,Women Cycling Biking Sports Long Sleeve TShirt Jersey Long Pants Sportswear Set,Awei A885BL Sport IPX4 Waterproof Wireless Bluetooth Headpho,...,Iona Cordless Electric Kettle GLK158 / GLK188 / GLK150,Nice WomenS Lace Bodycon Party Long Sleeve Floral Backless Mini Cocktail Dress,TP880 Automobile Solar Energy Tire Pressure Monitoring,IPhone 6 Plusfor Iphone 6s Plus Shockproof Phone Case Metal Casing Bumper Cover,Mens down jacket Slim long section winter Korean thick white duck down jacket,Biometric Fingerprint Password Attendance Machine Employee Checkingin Reco,G6 Bluetooth Pedometer Tracker Monitor Smart Watch for IOS Android Phones,Kokomyun Hot Spicy Chicken Soup Noodle Ramen 4.23 Oz X 5 Packs,Hera Cell Essence 150ml,Faux Leather Laptop Sleeve
CI-18-0569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI-18-0931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI-18-0911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI-17-0812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI-16-0813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Non-zero ratings in the row of customer CI-18-0911
df_matrix_cus_item_rating.loc['CI-18-0911'].loc[lambda ratings: ratings != 0]

Mothers Day gift                                                                                     9.6
Sale! G20 BT4.0 Heart Rate Monitoring Pedometer Smart Watch Wristband Bracelet                       5.4
SADES SA901 Gaming Headphones 7.1 Surround Sound With Mic Remote Control Stereo Bass For PC Gamer    6.7
KOSE Sekkisei Sun Protect Essence Gel Set Limited Edition                                            9.3
Korean trend jacket loose coat men and women leisure jacket student jackets                          7.2
PUMA NASKET CLASSIC LFS  Shoes Flat Sneaker Man and Women                                            8.9
(Asia)K88H Bluetooth Smart Watch Heart Rate Monitor  Gesture Control                                 9.7
Ralph Lauren Polo No.1 FOR WOMEN EDT (TESTER)                                                        8.9
Name: CI-18-0911, dtype: float64

### Vectorize description for calculating cosine similarity

In [None]:
# Using model from transformers (huggingface).
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
# NOTE(review): the load warning reports the pooler weights as newly
# initialized; the cells below only read last_hidden_state — confirm the
# pooler output is never used.
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")

Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Data2VecTextModel were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['data2vec_text.pooler.dense.bias', 'data2vec_text.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [None]:
# Pick three descriptions to compare: the 3rd and 4th Electronic accessories
# rows, followed by the first Food and beverages row.
samples_description = (
    df.loc[df['Product line'] == 'Electronic accessories', 'Item description']
    .iloc[2:4]
    .to_list()
)
another_description = df.loc[df['Product line'] == 'Food and beverages', 'Item description'].iloc[0]
samples_description.append(another_description)
samples_description

['Material :PC, ABSStyles :with LanyardFeatures:Waterproof, Shockproof, Dropproof, Dustproof, Antislip, SnowProof, Antiscratch, WearresistantSize:19*10*3.5cmWeight :369gPackage Include:1 x Manual1x Waterproof Case1x LanyardCleaning Cloth1 x Oring Seal1 x Lubricant Watersealing #iphone #iphone7plus#case#phonecase#cover#phonecover#waterproof',
 'This product advantages:1, refined lines and the avantgarde design, derived from a Japanese teacher, by the precision of the nc machine tools are cut, not stamping or abrasive casting, production process is complex and rarely.2, on the surface of the anode oxidation film process, and beautiful at the same time with high corrosion and wear resistance.3, independent of the volume, mute, switch button, convenient operation.4, the bottom left hole, can install phone chain, belts, etc., has high practicability.5, the position of the contact with the shell bumper adopted polyurethane buffer, to protect your phone from damage.Six words with laser etchin

In [None]:
# Vectorize: one array of token embeddings per description
output_embeddings = []

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    for s in samples_description:
        # truncation=True clips descriptions that exceed the tokenizer's
        # maximum length instead of passing an over-long input to the model.
        inputs = tokenizer(s, return_tensors='pt', truncation=True)
        outputs = model(**inputs)

        # One hidden vector per token: shape (1, n_tokens, hidden_size)
        last_hidden_states = outputs.last_hidden_state

        output_embeddings.append(last_hidden_states.detach().numpy())

len(output_embeddings)

3

In [None]:
# Shape of the first description's embedding array: (1, number of tokens, hidden size)
output_embeddings[0].shape

(1, 111, 768)

In [None]:
# Collapse every (1, len(words), 768) array to one 768-d vector by averaging
# over the word axis, then stack the three vectors into a (3, 768) matrix.
final_vectors = np.array([np.mean(v[0], axis=0) for v in output_embeddings])
final_vectors.shape

(3, 768)

In [None]:
# Pairwise cosine similarity (sklearn) between the description vectors;
# with a single argument the matrix is compared against itself.
df_cosine = cosine_similarity(final_vectors)
df_cosine

array([[1.        , 0.92214847, 0.74826175],
       [0.92214847, 0.9999999 , 0.9224901 ],
       [0.74826175, 0.9224901 , 0.9999999 ]], dtype=float32)

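The result above suggests that this method can be used to compare the similarity of two products: the two Electronic accessories descriptions score about 0.92, against about 0.75 between the first of them and the Food and beverages description. Note, however, that the second Electronic accessories description is just as close (about 0.92) to the Food and beverages one, so three samples are not conclusive. With that caveat, we can use this method to suggest products to customers that are similar to products they have purchased in the past.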

### Using similarity for rows in matrix customer - product - rating

In [None]:
# Get samples: five random customers (rows) from the rating matrix.
# A fixed random_state makes the draw, and the similarity matrix computed from
# it, reproducible on every run.
samples_customer = df_matrix_cus_item_rating.sample(n=5, random_state=42)

In [None]:
# Pairwise cosine similarity between the sampled customers' rating rows
df_cosine_customer = cosine_similarity(samples_customer)
df_cosine_customer

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

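However, this method is not meaningful for customer-product ratings: the sampled customers have no rated products in common, so every similarity between two different customers is 0.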