In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the shoe review CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
dataset = pd.read_csv(filepath_or_buffer=r'../data/cleaned_shoe_dataset.csv')
dataset.head(5)

Unnamed: 0,Product_id,Brand,Type,Gender,Size,Number_Sold,Price(USD),User_id,review_title,review_text,review_rating,url,clean_review_title,clean_review_text,combined_review
0,pdct_79691,Nike,Basketball,Men,US 10,2242,170.0,user_793,Love em,Love these. Was looking for converses and thes...,5,"https://static.nike.com/a/images/c_limit,w_592...",,love these was looking for converses and these...,love these was looking for converses and thes...
1,pdct_53183,Adidas,Running,Men,US 9.5,240,180.0,user_110,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2,https://m.media-amazon.com/images/I/81RV6pBH0q...,the plastic ripped,the shoes are very cute but after the nd day o...,the plastic ripped the shoes are very cute but...
2,pdct_48632,Reebok,Casual,Men,US 11,16662,75.0,user_371,Good quality,Good quality,5,https://m.media-amazon.com/images/I/51jV3wqLqj...,good quality,good quality,good quality good quality
3,pdct_64501,Converse,Casual,Women,US 8,135,55.0,user_169,Good,Great,5,https://m.media-amazon.com/images/I/81aeZowGDr...,,great,great
4,pdct_37352,Puma,Lifestyle,Men,US 9.5,245,110.0,user_860,Perfect right outta the box,True to size. If between I'd probably go with ...,5,https://m.media-amazon.com/images/I/61baYgulRj...,perfect right outta the box,true to size if between id probably go with yo...,perfect right outta the box true to size if be...


In [3]:
# Turn the combined review text into TF-IDF features.
# Missing reviews are replaced by empty strings so the vectorizer never sees NaN;
# English stop words are dropped and the vocabulary is capped at 500 terms.
review_corpus = dataset['combined_review'].fillna("")
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_matrix = tfidf.fit_transform(review_corpus)

In [4]:
# One-hot encode the categorical product metadata columns.
meta_cols = ['Brand', 'Type', 'Gender']
product_metadata = dataset[meta_cols]
encoder = OneHotEncoder()
meta_encoded = encoder.fit_transform(product_metadata)

In [5]:
# Concatenate the feature blocks column-wise: TF-IDF text features first,
# then the one-hot metadata, giving one sparse row per dataset row.
feature_blocks = [tfidf_matrix, meta_encoded]
final_matrix = hstack(feature_blocks)
final_matrix

<535x529 sparse matrix of type '<class 'numpy.float64'>'
	with 8934 stored elements in Compressed Sparse Row format>

In [6]:
# Pairwise cosine similarity between all rows of the combined feature matrix;
# entry [i, j] is the similarity between dataset rows i and j.
similarity_matrix = cosine_similarity(X=final_matrix)
similarity_matrix

array([[1.        , 0.27722248, 0.25      , ..., 0.25      , 0.02766262,
        0.        ],
       [0.27722248, 1.        , 0.25      , ..., 0.25      , 0.25      ,
        0.01631714],
       [0.25      , 0.25      , 1.        , ..., 0.46573673, 0.04810355,
        0.        ],
       ...,
       [0.25      , 0.25      , 0.46573673, ..., 1.        , 0.04151081,
        0.        ],
       [0.02766262, 0.25      , 0.04810355, ..., 0.04151081, 1.        ,
        0.25      ],
       [0.        , 0.01631714, 0.        , ..., 0.        , 0.25      ,
        1.        ]])

In [7]:
def get_recommendations(product_id, df, sim_matrix, top_n=5):
    """Return the rows of ``df`` most similar to the given product.

    Parameters
    ----------
    product_id : str
        Value to look up in ``df['Product_id']``. If several rows match,
        the first one is used as the query row.
    df : pandas.DataFrame
        Frame whose rows correspond, by position, to the rows/columns of
        ``sim_matrix``. Must contain the columns ``Product_id``, ``Brand``,
        ``Type`` and ``Price(USD)``.
    sim_matrix : 2-D indexable (e.g. numpy.ndarray)
        Pairwise similarity scores; ``sim_matrix[i][j]`` is the similarity
        between the rows at positions ``i`` and ``j`` of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` (columns ``Product_id``, ``Brand``,
        ``Type``, ``Price(USD)``), ordered from most to least similar.
        The query row itself is never included.

    Raises
    ------
    IndexError
        If ``product_id`` does not occur in ``df['Product_id']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Work with row *positions*, not index labels: sim_matrix is positional,
    # so a filtered or re-indexed df would otherwise look up the wrong row.
    match_positions = (df['Product_id'] == product_id).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Product_id {product_id!r} not found in df")
    query_pos = int(match_positions[0])

    # Drop the query row explicitly rather than assuming it sorts first:
    # another row can tie with it at the top score.
    scored = [
        (pos, score)
        for pos, score in enumerate(sim_matrix[query_pos])
        if pos != query_pos
    ]
    # list.sort is stable, so tied scores keep ascending row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [pos for pos, _ in scored[:top_n]]

    # Select from the df argument (not a notebook-level global).
    return df.iloc[top_positions][['Product_id', 'Brand', 'Type', 'Price(USD)']]

In [10]:
# Recommendations for product 'pdct_37352' (a Puma lifestyle shoe in the preview above).
result = get_recommendations(
    product_id='pdct_37352',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
306,pdct_50914,Puma,Lifestyle,80.0
64,pdct_60103,Puma,Lifestyle,110.0
140,pdct_46686,Puma,Lifestyle,110.0
192,pdct_84456,Puma,Lifestyle,120.0
214,pdct_90005,Puma,Lifestyle,90.0


In [17]:
# Recommendations for product 'pdct_84636'.
result = get_recommendations(
    product_id='pdct_84636',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
243,pdct_77595,Nike,Running,160.0
357,pdct_61815,Nike,Running,120.0
270,pdct_74209,Nike,Running,160.0
302,pdct_77378,Nike,Running,250.0
145,pdct_61988,Nike,Running,120.0


In [18]:
# Recommendations for product 'pdct_60748'.
result = get_recommendations(
    product_id='pdct_60748',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
34,pdct_81742,Vans,Skate,55.0
180,pdct_98878,Vans,Skate,55.0
307,pdct_67908,Vans,Skate,65.0
44,pdct_98179,Vans,Skate,55.0
75,pdct_38757,Vans,Skate,50.0


In [39]:
# Recommendations for product 'pdct_43066'.
result = get_recommendations(
    product_id='pdct_43066',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
523,pdct_77309,Adidas,Lifestyle,140.0
271,pdct_71624,Adidas,Lifestyle,130.0
220,pdct_51764,Adidas,Lifestyle,130.0
296,pdct_43396,Adidas,Lifestyle,140.0
287,pdct_81295,Adidas,Lifestyle,140.0


In [19]:
# Recommendations for product 'pdct_87842'.
result = get_recommendations(
    product_id='pdct_87842',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
398,pdct_88539,Asics,Running,85.0
498,pdct_97534,Asics,Running,150.0
185,pdct_48736,Asics,Running,90.0
423,pdct_51490,Asics,Running,70.0
525,pdct_95701,Asics,Running,150.0
