In [42]:
import pandas as pd
import numpy as np
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
# Load the review dataset from a CSV file located relative to the notebook's working directory.
data = pd.read_csv('cleaned_data.csv')

In [44]:
# Count missing values per column; the later text steps concatenate and tokenize 'Title' and 'Review Text'.
data.isnull().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

In [65]:
# Preview the first rows of the frame.
# NOTE(review): the recorded output below already lists 'Text', 'Tokenized Text',
# 'Title Embeddings' and 'Title Average Embeddings', which are only created in later
# cells - it appears to be left over from an earlier, out-of-order run. Re-run from a
# fresh kernel (Restart & Run All) to refresh it.
data.head()

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Text,Tokenized Text,Title Embeddings,Title Average Embeddings
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,3,Some major design flaws I had such high hopes ...,"[Some, major, design, flaws, I, had, such, hig...","[[0.020565804, 0.09458869, 0.070526294, 0.0888...","[-0.71415097, 0.18164065, 0.24993803, -0.02855..."
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,"My favorite buy! I love, love, love this jumps...","[My, favorite, buy, !, I, love, ,, love, ,, lo...","[[0.8402145, 0.59331125, 0.8788204, 0.36551872...","[-0.3292577, 0.30907252, 0.24541365, 0.2673249..."
2,847,Flattering shirt,This shirt is very flattering to all due to th...,5,Flattering shirt This shirt is very flattering...,"[Flattering, shirt, This, shirt, is, very, fla...","[[0.29835078, 0.21943633, 0.4257022, 0.2651122...","[-1.0669509, 0.64420104, 0.33429253, 0.1397360..."
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,Not for the very petite I love tracy reese dre...,"[Not, for, the, very, petite, I, love, tracy, ...","[[-0.19434465, 1.8714932, -0.023376394, 0.8437...","[-0.80076677, 0.55171883, 0.03433997, 0.205853..."
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,Cagrcoal shimmer fun I aded this in my basket ...,"[Cagrcoal, shimmer, fun, I, aded, this, in, my...","[[-0.00037799077, 0.0006160232, 0.0071792337, ...","[-0.86289436, 0.38553366, 0.1172388, 0.1001524..."


In [66]:
# Dataset 'data' is a DataFrame containing the columns: 'Clothing ID', 'Title', 'Review Text', 'Rating'

# Step 1: Preprocess the text data (filter out missing values)
print("Preprocessing data...")
# reset_index(drop=True) renumbers the rows 0..N-1 after dropna so that index labels
# equal row positions; downstream code addresses rows by position (NumPy array rows,
# .iloc), which would silently point at the wrong review if the index had gaps.
data = (
    data[['Clothing ID', 'Title', 'Review Text', 'Rating']]
    .dropna(subset=['Title', 'Review Text'])
    .reset_index(drop=True)
)
# One text field per review: title followed by the review body.
data['Text'] = data['Title'] + ' ' + data['Review Text']
# nltk.word_tokenize needs the NLTK 'punkt' tokenizer data; if it is missing,
# run nltk.download('punkt') once in this environment.
data['Tokenized Text'] = data['Text'].apply(nltk.word_tokenize)

Preprocessing data...


In [67]:
# Preview the frame, now reduced to four source columns plus 'Text' and 'Tokenized Text'.
data.head()

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Text,Tokenized Text
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,3,Some major design flaws I had such high hopes ...,"[Some, major, design, flaws, I, had, such, hig..."
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,"My favorite buy! I love, love, love this jumps...","[My, favorite, buy, !, I, love, ,, love, ,, lo..."
2,847,Flattering shirt,This shirt is very flattering to all due to th...,5,Flattering shirt This shirt is very flattering...,"[Flattering, shirt, This, shirt, is, very, fla..."
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,Not for the very petite I love tracy reese dre...,"[Not, for, the, very, petite, I, love, tracy, ..."
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,Cagrcoal shimmer fun I aded this in my basket ...,"[Cagrcoal, shimmer, fun, I, aded, this, in, my..."


In [68]:
# Step 2: Train the Word2Vec model
# Each review's token list is one training "sentence".
sentences = data['Tokenized Text'].tolist()
# A fixed seed together with a single worker thread makes training repeatable:
# with several workers, thread scheduling changes the order of updates, so the
# embeddings (and every similarity derived from them) differ from run to run.
# Per the gensim docs, reproducibility across interpreter launches additionally
# requires setting the PYTHONHASHSEED environment variable.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # embedding dimensionality
    window=5,         # context window size
    min_count=1,      # keep every token, so every token has a vector
    workers=1,
    seed=42,
)


In [69]:
# Preview the frame again; training the Word2Vec model does not assign to `data`,
# so the columns are the same as after preprocessing.
data.head()

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Text,Tokenized Text
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,3,Some major design flaws I had such high hopes ...,"[Some, major, design, flaws, I, had, such, hig..."
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,"My favorite buy! I love, love, love this jumps...","[My, favorite, buy, !, I, love, ,, love, ,, lo..."
2,847,Flattering shirt,This shirt is very flattering to all due to th...,5,Flattering shirt This shirt is very flattering...,"[Flattering, shirt, This, shirt, is, very, fla..."
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,Not for the very petite I love tracy reese dre...,"[Not, for, the, very, petite, I, love, tracy, ..."
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,Cagrcoal shimmer fun I aded this in my basket ...,"[Cagrcoal, shimmer, fun, I, aded, this, in, my..."


In [70]:
# Step 3: Calculate the average embeddings for each item
print("Calculating average embeddings...")
# Per-token vectors for each review. Tokens without a vector (possible if the model
# is trained with min_count > 1) are skipped instead of raising KeyError.
# Note: despite the column name, these cover the full 'Tokenized Text' (title + review).
data['Title Embeddings'] = data['Tokenized Text'].apply(
    lambda tokens: [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
)
# Mean over tokens gives one fixed-length vector per review. A review with no usable
# tokens gets a zero vector: np.mean of an empty list would yield a NaN scalar and
# break the fixed-width matrix needed for the similarity computation.
data['Title Average Embeddings'] = data['Title Embeddings'].apply(
    lambda vectors: np.mean(vectors, axis=0)
    if len(vectors) > 0
    else np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
)

Calculating average embeddings...


In [71]:
# Preview the frame with the new 'Title Embeddings' and 'Title Average Embeddings' columns.
data.head()

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Text,Tokenized Text,Title Embeddings,Title Average Embeddings
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,3,Some major design flaws I had such high hopes ...,"[Some, major, design, flaws, I, had, such, hig...","[[0.010133444, 0.09302715, 0.02481847, 0.10016...","[-0.72239244, 0.11369373, 0.181142, 0.12728694..."
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,"My favorite buy! I love, love, love this jumps...","[My, favorite, buy, !, I, love, ,, love, ,, lo...","[[0.9915871, 0.845416, 0.7309338, 0.35235518, ...","[-0.24818459, 0.25702408, 0.08568845, 0.356133..."
2,847,Flattering shirt,This shirt is very flattering to all due to th...,5,Flattering shirt This shirt is very flattering...,"[Flattering, shirt, This, shirt, is, very, fla...","[[0.21602146, 0.34328246, 0.45109534, 0.146753...","[-1.0911458, 0.6699375, 0.13972206, 0.3084092,..."
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,Not for the very petite I love tracy reese dre...,"[Not, for, the, very, petite, I, love, tracy, ...","[[-0.24303421, 2.2687676, 0.020348907, 0.71986...","[-0.74685556, 0.4202889, -0.04262956, 0.345292..."
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,Cagrcoal shimmer fun I aded this in my basket ...,"[Cagrcoal, shimmer, fun, I, aded, this, in, my...","[[-0.004629471, 0.01068616, 0.0040068827, 0.01...","[-0.8385299, 0.23148893, 0.022635305, 0.253173..."


In [72]:
# Step 4: Compute cosine similarity between all item embeddings
# One averaged vector per row of `data`, kept in row order.
item_embeddings = list(data['Title Average Embeddings'])
# Square matrix: entry [i, j] is the similarity between row i and row j of `data`.
cosine_similarities = cosine_similarity(X=item_embeddings, Y=item_embeddings)

In [73]:
# Preview the frame again; computing the similarity matrix does not assign to `data`.
data.head()

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Text,Tokenized Text,Title Embeddings,Title Average Embeddings
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,3,Some major design flaws I had such high hopes ...,"[Some, major, design, flaws, I, had, such, hig...","[[0.010133444, 0.09302715, 0.02481847, 0.10016...","[-0.72239244, 0.11369373, 0.181142, 0.12728694..."
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,"My favorite buy! I love, love, love this jumps...","[My, favorite, buy, !, I, love, ,, love, ,, lo...","[[0.9915871, 0.845416, 0.7309338, 0.35235518, ...","[-0.24818459, 0.25702408, 0.08568845, 0.356133..."
2,847,Flattering shirt,This shirt is very flattering to all due to th...,5,Flattering shirt This shirt is very flattering...,"[Flattering, shirt, This, shirt, is, very, fla...","[[0.21602146, 0.34328246, 0.45109534, 0.146753...","[-1.0911458, 0.6699375, 0.13972206, 0.3084092,..."
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,Not for the very petite I love tracy reese dre...,"[Not, for, the, very, petite, I, love, tracy, ...","[[-0.24303421, 2.2687676, 0.020348907, 0.71986...","[-0.74685556, 0.4202889, -0.04262956, 0.345292..."
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,Cagrcoal shimmer fun I aded this in my basket ...,"[Cagrcoal, shimmer, fun, I, aded, this, in, my...","[[-0.004629471, 0.01068616, 0.0040068827, 0.01...","[-0.8385299, 0.23148893, 0.022635305, 0.253173..."


In [60]:
# Step 5: Recommendation function based on cosine similarity
def recommend_similar_items(item_id, n=5):
    """Return the rows of `data` most similar to the first review of `item_id`.

    Similarity is read from the module-level `cosine_similarities` matrix, whose
    rows/columns are assumed to be in the same order as the rows of `data`.

    Parameters
    ----------
    item_id : value compared against the 'Clothing ID' column.
    n : int, default 5
        Maximum number of rows to return; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to `n` rows of `data`, most similar first. Rows that share the
        queried 'Clothing ID' (including the query row itself) are excluded.

    Raises
    ------
    IndexError
        If no row of `data` has the given 'Clothing ID'.
    """
    # Boolean mask over row *positions*; index labels are deliberately not used,
    # because the similarity matrix and .iloc are positional and the DataFrame
    # index may have gaps (e.g. after dropna).
    same_item = (data['Clothing ID'] == item_id).to_numpy()
    matching_positions = np.flatnonzero(same_item)
    if matching_positions.size == 0:
        raise IndexError(f"No row found with Clothing ID {item_id!r}")
    item_position = matching_positions[0]

    # Similarity of the query row to every row.
    similarity_scores = cosine_similarities[item_position]

    # Most similar first; a stable sort keeps ties in row order so results are
    # deterministic.
    ranked_positions = np.argsort(-similarity_scores, kind='stable')

    # Drop the query row and other reviews of the same item explicitly, rather
    # than assuming the query row is the unique top score.
    ranked_positions = ranked_positions[~same_item[ranked_positions]]

    # Clamp n so a negative value cannot slice from the end.
    top_similar_items = data.iloc[ranked_positions[:max(n, 0)]]

    return top_similar_items

In [75]:
# Example: Recommend 5 similar items to the item with 'Clothing ID' 941
recommended_items = recommend_similar_items(941, n=5)
# Show only the identifying columns of the recommended rows.
print(recommended_items[['Clothing ID', 'Title', 'Rating']])

       Clothing ID                                         Title  Rating
17635         1083                              Gorgeous dress!!       5
7776           940                               Staple cardigan       4
14977          933                                 Love, love!!!       5
2478           862  Transformed into a crop top with one washing       1
1618          1016                                   So bizarre!       5
