In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer

In [None]:
!pip install sentence_transformers

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df = pd.read_csv('./drive/MyDrive/product_descriptions.csv/product_descriptions.csv')
df.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [7]:
df.shape

(124428, 2)

In [8]:
duplicates = df['product_uid'].duplicated()
num_duplicates = duplicates.sum()
num_duplicates

0

In [9]:
num = df['product_uid'].nunique()
num

124428

In [10]:
df.info()
df.isnull().any()
#make all product_uid to be string to avoid incosistencies later on
df.loc[:, 'product_uid'] = df['product_uid'].astype(str)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124428 entries, 0 to 124427
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   product_uid          124428 non-null  int64 
 1   product_description  124428 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


product_uid            False
product_description    False
dtype: bool

In [11]:
#shorten , remove later
df = df.head(1000)

In [12]:
#class to handle text and function below
class Textdata(Dataset):
  def __init__(self, texts):
    self.texts = texts
  def __len__(self):
    return len(self.texts)
  def __getitem__(self,idx):
    return self.texts[idx]

In [13]:
# function to create embeddings in batches
def get_embedding_batches(texts, model, tokenizer, batch_size=32):
  dataset = Textdata(texts)
  #load data from using textdata dataset
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
  all_embeddings = []
# itearte over each batch and tokenize input and apply model to the tokenized input
  with torch.no_grad():
    for batch in dataloader:
      #tokenize the batch
      inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding =True, max_length=512)
      outputs=model(**inputs)
      #extract embedding from the ouput getting last hidden state from batch
      batch_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
      all_embeddings.extend(batch_embeddings)
  return np.vstack(all_embeddings)

In [15]:
#pre-trained model from huggingface
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [16]:
texts = df['product_description'].values
#apply embeddings to the texts specified above
embeddings = get_embedding_batches(texts, model, tokenizer , batch_size = 32)

In [17]:
df2 = df

In [18]:
# apply cosine_similarity to the embeddings
similarity_matrix = cosine_similarity(embeddings)

In [24]:
#create a series from the df[] for better indexing
p_code = pd.Series(df['product_uid'])
p_code.head()

0    100001
1    100002
2    100003
3    100004
4    100005
Name: product_uid, dtype: int64

In [20]:
#unique_p_code = p_code.drop_duplicates().reset_index(drop=True)

In [25]:
#create recommendation function passing sim_matrix
def recommendation(product_uid, cos_sim = similarity_matrix):
  #list to keep the recommendations
  recommended_stock = []
  #find the index of the product_uid in p_code series
  idx = p_code[p_code == product_uid].index[0]
  #sort the similarity scores(series) in descending order
  series = pd.Series(cos_sim[idx]).sort_values(ascending=False)
  #find the top 10 similar codes excluding the one in reference
  top_10_similar_codes = list(series.iloc[1:11].index)
  for i in top_10_similar_codes:
    #append the product_uid to the list
    recommended_stock.append(p_code.iloc[i])
  recommended_stock = list(set(recommended_stock))
  #return the list of recommended stock ; ive changed list to set to list to avoid duplicates inrecommenfdation
  return recommended_stock

In [31]:
recommend = recommendation(100004)
recommend

[100128,
 100483,
 100007,
 100936,
 100298,
 100172,
 100370,
 100115,
 100276,
 100760]

In [32]:
#find each recommendation and get the product_description
df.loc[df['product_uid'].isin(recommend), ['product_uid', 'product_description']].drop_duplicates().set_index('product_uid').loc[recommend]

Unnamed: 0_level_0,product_description
product_uid,Unnamed: 1_level_1
100128,This Proslat wall solution features Proslat's ...
100483,"One-of-a-kind, the Hastings Collection 4-Light..."
100007,The Quantum Adjustable 2-Light LED Black Emerg...
100936,The Grape Solar 265-Watt Polycrystalline PV So...
100298,Decorate your entryway with a classic covering...
100172,The LED Light Puff from Lithonia Lighting is t...
100370,Made from 100% recycled quality post consumer ...
100115,The MasterPiece Patio 71-1/4 in. x 79-1/2 in. ...
100276,"With a vintage modern flair, Markor is inspire..."
100760,The Dual 100-Watt 3-Way Indoor/Outdoor Speaker...


In [28]:
sim_matrix = pd.DataFrame(similarity_matrix, index=p_code, columns=p_code)
sim_matrix

product_uid,100001,100002,100003,100004,100005,100006,100007,100008,100009,100010,...,100991,100992,100993,100994,100995,100996,100997,100998,100999,101000
product_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1.000000,0.593381,0.765254,0.714608,0.629065,0.063180,0.675303,0.787658,0.687299,0.744972,...,0.692930,0.716329,0.670024,0.678787,0.757180,0.645287,0.652129,0.201697,0.292832,0.403607
100002,0.593381,1.000000,0.570542,0.559605,0.580117,0.138395,0.472021,0.448875,0.581369,0.612370,...,0.650056,0.519284,0.525481,0.545405,0.539441,0.545728,0.603364,0.158667,0.372949,0.441459
100003,0.765254,0.570542,1.000000,0.658498,0.612615,0.116422,0.656507,0.691836,0.738032,0.668110,...,0.680902,0.698817,0.673661,0.665626,0.657372,0.645859,0.670481,0.176833,0.345213,0.362261
100004,0.714608,0.559605,0.658498,1.000000,0.567495,0.108104,0.725391,0.571930,0.618420,0.650386,...,0.619909,0.554779,0.532351,0.601886,0.581572,0.611839,0.548435,0.188551,0.229565,0.442122
100005,0.629065,0.580117,0.612615,0.567495,1.000000,0.181353,0.556515,0.576067,0.640620,0.567490,...,0.533673,0.587721,0.600599,0.672790,0.603859,0.495381,0.635328,0.249259,0.475055,0.418782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100996,0.645287,0.545728,0.645859,0.611839,0.495381,0.147628,0.519310,0.565287,0.639274,0.643893,...,0.572993,0.599989,0.586782,0.552226,0.611741,1.000000,0.575573,0.119780,0.209682,0.317301
100997,0.652129,0.603364,0.670481,0.548435,0.635328,0.050048,0.478377,0.593502,0.597601,0.693077,...,0.635148,0.690756,0.705815,0.713010,0.675557,0.575573,1.000000,0.177654,0.419613,0.420977
100998,0.201697,0.158667,0.176833,0.188551,0.249259,0.294573,0.253571,0.230694,0.144627,0.222584,...,0.165765,0.138209,0.260880,0.199893,0.209085,0.119780,0.177654,1.000000,0.278892,0.425504
100999,0.292832,0.372949,0.345213,0.229565,0.475055,0.284458,0.265867,0.253123,0.347123,0.312341,...,0.313208,0.323820,0.359779,0.450355,0.297936,0.209682,0.419613,0.278892,1.000000,0.372152
