In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the cleaned shoe-review dataset and preview its first rows.
csv_path = r'../data/cleaned_shoe_dataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,Product_id,Brand,Type,Gender,Size,Number_Sold,Price(USD),User_id,review_title,review_text,review_rating,clean_review_title,clean_review_text,combined_review
0,pdct_70480,Nike,Basketball,Men,US 10,2242,170.0,user_425,Love em,Love these. Was looking for converses and thes...,5,,love these was looking for converses and these...,love these was looking for converses and thes...
1,pdct_66989,Adidas,Running,Men,US 9.5,240,180.0,user_384,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2,the plastic ripped,the shoes are very cute but after the nd day o...,the plastic ripped the shoes are very cute but...
2,pdct_68165,Reebok,Casual,Men,US 11,16662,75.0,user_729,Good quality,Good quality,5,good quality,good quality,good quality good quality
3,pdct_56702,Converse,Casual,Women,US 8,135,55.0,user_239,Good,Great,5,,great,great
4,pdct_68397,Puma,Lifestyle,Men,US 9.5,245,110.0,user_593,Perfect right outta the box,True to size. If between I'd probably go with ...,5,perfect right outta the box,true to size if between id probably go with yo...,perfect right outta the box true to size if be...


In [6]:
# Turn the combined review text into TF-IDF features.
# Missing reviews are replaced with empty strings before vectorizing;
# English stop words are dropped and the vocabulary is capped at 500 terms.
review_corpus = dataset['combined_review'].fillna("")
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_matrix = tfidf.fit_transform(review_corpus)

In [8]:
# One-hot encode the categorical product attributes.
meta_cols = ['Brand', 'Type', 'Gender']
meta_frame = dataset[meta_cols]
encoder = OneHotEncoder()
meta_encoded = encoder.fit_transform(meta_frame)

In [12]:
# Place the TF-IDF columns and the one-hot metadata columns side by side so
# each row is described by a single sparse feature vector.
# NOTE(review): the two blocks are stacked unweighted, so their relative scale
# decides how much text vs. metadata drives similarity - confirm this is intended.
feature_blocks = [tfidf_matrix, meta_encoded]
final_matrix = hstack(feature_blocks)
final_matrix

<539x529 sparse matrix of type '<class 'numpy.float64'>'
	with 8948 stored elements in Compressed Sparse Row format>

In [19]:
# Cosine similarity between every pair of rows of the combined feature
# matrix: entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(X=final_matrix)
similarity_matrix

array([[1.        , 0.27721729, 0.25      , ..., 0.25      , 0.02765868,
        0.        ],
       [0.27721729, 1.        , 0.25      , ..., 0.25      , 0.25      ,
        0.01632773],
       [0.25      , 0.25      , 1.        , ..., 0.4658832 , 0.04815264,
        0.        ],
       ...,
       [0.25      , 0.25      , 0.4658832 , ..., 1.        , 0.04158138,
        0.        ],
       [0.02765868, 0.25      , 0.04815264, ..., 0.04158138, 1.        ,
        0.25      ],
       [0.        , 0.01632773, 0.        , ..., 0.        , 0.25      ,
        1.        ]])

In [24]:
def get_recommendations(product_id, df, sim_matrix, top_n=5):
    """Return the rows of ``df`` most similar to the given product.

    Parameters
    ----------
    product_id : str
        Value to look up in ``df['Product_id']``. If several rows share the
        id, the first matching row is used as the query.
    df : pandas.DataFrame
        Frame containing at least the columns ``Product_id``, ``Brand``,
        ``Type`` and ``Price(USD)``. Row *i* of ``df`` (by position) is
        assumed to correspond to row/column *i* of ``sim_matrix``.
    sim_matrix : 2-D array-like
        Pairwise similarity scores; ``sim_matrix[i][j]`` is the similarity
        between rows *i* and *j* of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` (columns ``Product_id``, ``Brand``,
        ``Type``, ``Price(USD)``) ordered from most to least similar. The
        query row itself is never included.

    Raises
    ------
    IndexError
        If ``product_id`` does not occur in ``df['Product_id']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Look the product up by *position*, not by index label: sim_matrix and
    # iloc are both positional, so labels would break on a non-default index.
    positions = (df['Product_id'] == product_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Product_id {product_id!r} not found in df")
    query_pos = int(positions[0])

    # Drop the query row explicitly rather than assuming it sorts first:
    # another row can tie with it at the top score. sorted() is stable, so
    # ties keep their original row order.
    ranked = sorted(
        (pair for pair in enumerate(sim_matrix[query_pos]) if pair[0] != query_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [pos for pos, _ in ranked[:top_n]]

    # Use the frame that was passed in, not a notebook-level global.
    return df.iloc[top_positions][['Product_id', 'Brand', 'Type', 'Price(USD)']]

In [30]:
# Recommend products similar to pdct_68397.
result = get_recommendations(
    product_id='pdct_68397',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
308,pdct_85567,Puma,Lifestyle,80.0
64,pdct_47865,Puma,Lifestyle,110.0
140,pdct_77932,Puma,Lifestyle,110.0
192,pdct_87499,Puma,Lifestyle,120.0
214,pdct_90657,Puma,Lifestyle,90.0


In [34]:
# Recommend products similar to pdct_73911.
result = get_recommendations(
    product_id='pdct_73911',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
188,pdct_90216,Nike,Running,160.0
259,pdct_35155,Nike,Running,160.0
531,pdct_41769,Nike,Running,160.0
286,pdct_66003,Nike,Running,160.0
269,pdct_45651,Nike,Running,150.0


In [38]:
# Recommend products similar to pdct_89147.
result = get_recommendations(
    product_id='pdct_89147',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
39,pdct_84757,Reebok,Training,130.0
518,pdct_71269,Reebok,Training,130.0
148,pdct_87226,Reebok,Training,130.0
272,pdct_76268,Reebok,Training,130.0
313,pdct_43992,Reebok,Training,130.0


In [39]:
# Recommend products similar to pdct_43066.
result = get_recommendations(
    product_id='pdct_43066',
    df=dataset,
    sim_matrix=similarity_matrix,
)
result

Unnamed: 0,Product_id,Brand,Type,Price(USD)
523,pdct_77309,Adidas,Lifestyle,140.0
271,pdct_71624,Adidas,Lifestyle,130.0
220,pdct_51764,Adidas,Lifestyle,130.0
296,pdct_43396,Adidas,Lifestyle,140.0
287,pdct_81295,Adidas,Lifestyle,140.0
