In [173]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise, pairwise_distances
from tqdm import tqdm

In [119]:
# Read the training split, using the first CSV column as the row index,
# then preview the first rows.
train_csv_path = "data/dataset_split/train.csv"
dataset = pd.read_csv(train_csv_path, index_col=0)
dataset.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,class_rating
46628,fit,345809,36b,326784,150lbs,5.0,wedding,I wore this dress for my bridal shower this pa...,athletic,Great Bridal Shower Dress,dress,"5' 6""",16,30.0,"May 3, 2016",1
18399,fit,45235,,2766308,128lbs,5.0,everyday,I really liked this sweater. I wore it on a fl...,athletic,"Great sweater, perfect for travel!",cardigan,"5' 6""",8,34.0,"March 7, 2017",1
12853,fit,508677,36b,254960,145lbs,3.0,wedding,I ordered this dress and one other for a weddi...,athletic,Seeing as I didn't wear this dress....not this...,sheath,"5' 8""",16,39.0,"January 18, 2017",0
17290,fit,117290,34a,1687082,110lbs,5.0,wedding,This dress is so fun! Was a little tight aroun...,petite,The COLOR!!!!,gown,"5' 4""",4,33.0,"March 28, 2016",1
9628,fit,144767,34b,135459,,4.0,party,"I get nervous about gold, especially gold lace...",pear,Opulent but Chic and Sexy,dress,"5' 5""",16,33.0,"January 11, 2016",1


In [120]:
# Load the pretrained sentence-embedding checkpoint used for all text fields.
encoder_checkpoint = "stsb-distilbert-base"
model = SentenceTransformer(encoder_checkpoint)

In [121]:
# Coerce every review body and summary to str so the encoder only receives
# plain Python strings (any NaN entry becomes the literal text "nan").
reviews = [str(text) for text in dataset["review_text"].values]
review_summaries = [str(summary) for summary in dataset["review_summary"].values]

In [123]:
# Embed every review body and store one vector (as a plain list) per row.
# No explicit device is passed: per the sentence-transformers docs the model
# then encodes on the device it was loaded on (GPU when available, else CPU),
# so this cell no longer fails on machines without CUDA.
dataset["review_vector"] = model.encode(reviews, show_progress_bar=True).tolist()

Batches:   0%|          | 0/1326 [00:00<?, ?it/s]

In [125]:
# Embed every review summary and store one vector (as a plain list) per row.
# No explicit device is passed: per the sentence-transformers docs the model
# then encodes on the device it was loaded on (GPU when available, else CPU),
# so this cell no longer fails on machines without CUDA.
dataset['review_summary_vector'] = model.encode(review_summaries, show_progress_bar=True).tolist()

Batches:   0%|          | 0/1326 [00:00<?, ?it/s]

In [126]:
def _mean_vector_per_item(frame, vector_column):
    """Average the embedding lists in ``vector_column`` over each ``item_id``.

    Returns a DataFrame indexed by ``item_id`` with one column per embedding
    dimension.
    """
    per_item = frame[['item_id', vector_column]].groupby('item_id').agg(
        lambda vectors: np.array(vectors.values.tolist()).mean(0).tolist()
    )
    # Expand the single list-valued column into one numeric column per dimension.
    return pd.DataFrame(per_item[vector_column].to_list(), index=per_item.index)


items_review_embeddings = _mean_vector_per_item(dataset, 'review_vector')
items_review_summary_embeddings = _mean_vector_per_item(dataset, 'review_summary_vector')

In [127]:
def _most_common_value(values):
    """Return one most-frequent non-null value of a Series, or NaN if none exists.

    ``Series.mode`` returns every tied value; passing it straight to ``agg``
    yields array-valued cells for ties, which ``get_dummies`` cannot encode.
    Taking the first entry of the (sorted) mode result keeps the choice
    deterministic.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# One category label per item, then one indicator column per category.
items_categories = dataset[['item_id', 'category']].groupby('item_id').agg(_most_common_value)
items_categories_onehot = pd.get_dummies(items_categories)

In [128]:
# Assemble the per-item feature table: review embeddings, review-summary
# embeddings (overlapping column labels are disambiguated by the suffixes),
# and the one-hot category columns, all aligned on item_id.
features = (
    items_review_embeddings
    .join(items_review_summary_embeddings, how='left', lsuffix='_review', rsuffix='_review_summary')
    .join(items_categories_onehot, how='left', rsuffix='_category')
)
features.head()

Unnamed: 0_level_0,0_review,1_review,2_review,3_review,4_review,5_review,6_review,7_review,8_review,9_review,...,category_tank,category_tee,category_tight,category_top,category_trench,category_trouser,category_trousers,category_tunic,category_turtleneck,category_vest
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,0.229172,-0.322426,0.466535,-0.118174,-0.21943,0.325732,0.010137,-0.829683,0.319148,-0.147905,...,0,0,0,0,0,0,0,0,0,0
123793,0.147573,-0.198897,0.305321,-0.05634,-0.185331,0.320737,0.021465,-0.909637,0.194083,-0.013917,...,0,0,0,0,0,0,0,0,0,0
124204,0.22193,-0.18124,0.292231,-0.145675,-0.305753,0.319419,-0.055652,-0.847778,0.279464,-0.003183,...,0,0,0,0,0,0,0,0,0,0
124553,0.074109,-0.252585,0.301475,-0.219387,-0.091041,0.336473,0.011513,-0.786276,0.047805,-0.10061,...,0,0,0,0,0,0,0,0,0,0
125424,0.071431,-0.227778,0.303384,-0.211697,-0.205015,0.234111,-0.002537,-0.831392,0.126363,-0.056895,...,0,0,0,0,0,0,0,0,0,0


In [167]:
# Project the item features onto their leading principal components.
# A fixed seed keeps the projection (and every prediction derived from it)
# reproducible: with the default svd_solver='auto', scikit-learn may select the
# randomized solver depending on the data shape, which is otherwise unseeded.
n_pca_components = 100
pca_seed = 0
pca = PCA(n_components=n_pca_components, random_state=pca_seed)
features_pca = pd.DataFrame(pca.fit_transform(features), index=features.index)


In [165]:
# Scree plot of the fitted PCA. The x axis is sized from the fitted model
# rather than a hard-coded count, so it always matches the number of
# components actually computed.
explained_ratio = pca.explained_variance_ratio_
ax = sns.lineplot(x=np.arange(len(explained_ratio)), y=explained_ratio)
ax.set(xlabel="principal component index", ylabel="explained variance ratio")
ax

<matplotlib.axes._subplots.AxesSubplot at 0x7fa0ffad55e0>

In [166]:
# Share of the total variance captured by the first 100 principal components.
pca.explained_variance_ratio_[:100].sum()

0.8446105870515508

In [None]:
# Distance metrics evaluated by the nearest-neighbour recommender below.
metrics = ['euclidean', 'cosine', 'manhattan']

In [178]:
def _similarity_from_distances(distances):
    """Min-max scale distances and flip them so the closest item scores 1.

    Parameters
    ----------
    distances : array-like of float
        Distances from one user profile to every item.

    Returns
    -------
    numpy.ndarray
        ``1 - (d - min) / (max - min)`` for each distance. When all distances
        are equal the scaling is undefined (0 / 0), so every item gets 1.0
        instead of NaN.
    """
    distances = np.asarray(distances, dtype=float)
    nearest, farthest = distances.min(), distances.max()
    spread = farthest - nearest
    if spread == 0:
        return np.ones_like(distances)
    return 1 - (distances - nearest) / spread


os.makedirs('data/predictions', exist_ok=True)

# A user's profile is the mean feature vector of the distinct items they
# rented. It does not depend on the distance metric, so it is built once
# instead of once per metric.
user_keys = []
profile_rows = []
for user_id, user_rows in tqdm(dataset.groupby('user_id'), desc='user profiles'):
    # .unique() de-duplicates like the former set(), but yields an array,
    # which .loc accepts (newer pandas rejects set indexers).
    rented_item_ids = user_rows['item_id'].unique()
    # json.dump stringifies integer keys anyway; doing it here also covers
    # numpy integer keys, which json refuses.
    user_keys.append(str(user_id))
    profile_rows.append(features.loc[rented_item_ids].mean(axis=0).to_numpy(dtype=float))

# Keep the original column labels so the PCA sees the same feature layout it
# was fitted on, and project all users in a single call.
user_profiles = pd.DataFrame(np.vstack(profile_rows), index=user_keys, columns=features.columns)
user_profiles_pca = pca.transform(user_profiles)

item_keys = [str(item_id) for item_id in features_pca.index]
item_vectors = features_pca.values

for metric in ['euclidean', 'cosine', 'manhattan']:
    # One batched call per metric: rows are users, columns are items.
    distance_matrix = pairwise_distances(user_profiles_pca, item_vectors, metric=metric, n_jobs=-1)

    recomendations = {}
    for user_key, distances in zip(tqdm(user_keys, desc=metric), distance_matrix):
        scores = _similarity_from_distances(distances)
        # .tolist() converts numpy scalars to plain floats for JSON.
        recomendations[user_key] = dict(zip(item_keys, scores.tolist()))

    with open(f'data/predictions/knn_{metric}.json', 'w') as f:
        json.dump(recomendations, f)

    

  0%|          | 25/5631 [00:09<35:42,  2.62it/s]


KeyboardInterrupt: 