In [91]:
# Librerías importadas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

In [9]:
# Importamos el DataSet

my_dt = pd.read_json('renttherunway_final_data.json', lines=True)
my_dt.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


Preprocesamiento y limpieza

In [10]:
print(my_dt.shape)

(192544, 15)


In [11]:
my_dt.dtypes

fit                object
user_id             int64
bust size          object
item_id             int64
weight             object
rating            float64
rented for         object
review_text        object
body type          object
review_summary     object
category           object
height             object
size                int64
age               float64
review_date        object
dtype: object

In [12]:
my_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             192544 non-null  object 
 1   user_id         192544 non-null  int64  
 2   bust size       174133 non-null  object 
 3   item_id         192544 non-null  int64  
 4   weight          162562 non-null  object 
 5   rating          192462 non-null  float64
 6   rented for      192534 non-null  object 
 7   review_text     192544 non-null  object 
 8   body type       177907 non-null  object 
 9   review_summary  192544 non-null  object 
 10  category        192544 non-null  object 
 11  height          191867 non-null  object 
 12  size            192544 non-null  int64  
 13  age             191584 non-null  float64
 14  review_date     192544 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 22.0+ MB


In [15]:
my_dt.drop(['bust size','weight','height'], axis=1, inplace=True)

In [42]:
body_type_mode = my_dt['body type'].mode()[0]
my_dt['body type'].fillna(body_type_mode,inplace=True)

In [43]:
my_dt['body type'].unique()

array(['hourglass', 'straight & narrow', 'pear', 'athletic', 'full bust',
       'petite', 'apple'], dtype=object)

In [56]:
print(my_dt['body type'].value_counts())

hourglass            69986
athletic             43667
pear                 22135
petite               22131
full bust            15006
straight & narrow    14742
apple                 4877
Name: body type, dtype: int64


In [67]:
clean_dt = my_dt[my_dt['body type'].str.contains('athletic')]
print(clean_dt['body type'].unique())
clean_dt.shape

['athletic']


(43667, 12)

In [68]:
clean_dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43667 entries, 4 to 192543
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fit             43667 non-null  object 
 1   user_id         43667 non-null  int64  
 2   item_id         43667 non-null  int64  
 3   rating          43652 non-null  float64
 4   rented for      43665 non-null  object 
 5   review_text     43667 non-null  object 
 6   body type       43667 non-null  object 
 7   review_summary  43667 non-null  object 
 8   category        43667 non-null  object 
 9   size            43667 non-null  int64  
 10  age             43579 non-null  float64
 11  review_date     43667 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 4.3+ MB


In [72]:
categoria_total = clean_dt['category'].unique()
ocasiones_total = clean_dt['rented for'].unique()
usuarios_total = clean_dt['user_id'].unique()

print('Categorias totales: ', len(categoria_total))
print('Razones de compra totales: ', len(ocasiones_total))
print('Usuarios totales: ', len(usuarios_total))

Categorias totales:  62
Razones de compra totales:  9
Usuarios totales:  23862


In [73]:
print("Categorias que se compran")
pd.DataFrame(sorted(clean_dt['category'].unique()))

Categorias que se compran


Unnamed: 0,0
0,ballgown
1,blazer
2,blouse
3,bomber
4,caftan
...,...
57,trouser
58,trousers
59,tunic
60,turtleneck


In [158]:
# Eliminamos categorías duplicadas

df_names = clean_dt.drop_duplicates(subset='category', ignore_index=True)

# Obtenemos una lista de categirías únicas
unique_category = sorted(df_names['category'].unique())

# Contar el numero de categorias
counts = df_names.groupby('category')['item_id'].nunique()

# Creamos un diccionario de categorías unicas
unique_counts = {}
for value in unique_category:
    count = counts.get(value, 0)
    unique_counts[value] = count
    
sorted_data = dict(sorted(unique_counts.items(), key=lambda item: item[1], reverse=True))

df_unique = pd.DataFrame({'Categoria':sorted_data.keys(),
                        'Prendas':sorted_data.values()
                       })

print('Numero de prendas por categoria:')
df_unique

Numero de prendas por categoria:


Unnamed: 0,Categoria,Prendas
0,ballgown,1
1,blazer,1
2,blouse,1
3,bomber,1
4,caftan,1
...,...,...
57,trouser,1
58,trousers,1
59,tunic,1
60,turtleneck,1


Para nuestro sistema de recomendación seleccionaremos las columnas para nuestro algoritmo, en este caso (user_id, category, rating y item_id)

In [83]:
df_recsys = clean_dt[['user_id', 'category', 'rating', 'item_id']]

print(df_recsys.shape)
df_recsys.head()

(43667, 4)


Unnamed: 0,user_id,category,rating,item_id
4,151944,gown,10.0,616682
5,734848,dress,8.0,364092
10,185966,dress,8.0,1077123
14,721308,gown,10.0,123793
17,339899,dress,10.0,1622747


In [86]:
# Seleccionamos un usuario random para nuestro análisis

idx = (df_recsys['user_id'] == 339899)

# Imprimimos el total de calificaciones del usuario

print('Total de restaurantes calificados por el usuario: ',idx.sum())
df_recsys[idx]

Total de restaurantes calificados por el usuario:  8


Unnamed: 0,user_id,category,rating,item_id
17,339899,dress,10.0,1622747
10718,339899,romper,10.0,2567610
29462,339899,dress,10.0,1574548
51350,339899,top,10.0,2301588
97608,339899,top,8.0,2341911
101968,339899,dress,10.0,1839582
177894,339899,dress,10.0,1979533
186791,339899,dress,8.0,1203092


In [87]:
# Creamos nuestra matriz de utilidad

UtlMtrx = df_recsys.pivot_table(values='rating', index='user_id', 
                                columns='category', fill_value=0
                                )
UtlMtrx.head() 

category,ballgown,blazer,blouse,bomber,caftan,cami,cape,cardigan,coat,combo,...,tank,tee,tight,top,trench,trouser,trousers,tunic,turtleneck,vest
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
302,0,0.0,0.0,0,0,0,0,0,0.0,0,...,0,0,0,0.0,0,0,0,0,0,0
318,0,0.0,0.0,0,0,0,0,0,0.0,0,...,0,0,0,0.0,0,0,0,0,0,0
442,0,0.0,0.0,0,0,0,0,0,0.0,0,...,0,0,0,0.0,0,0,0,0,0,0
464,0,0.0,0.0,0,0,0,0,0,0.0,0,...,0,0,0,0.0,0,0,0,0,0,0
474,0,0.0,0.0,0,0,0,0,0,0.0,0,...,0,0,0,0.0,0,0,0,0,0,0


In [88]:
print('Total de elementos de la matriz de utilidad: %d' % (UtlMtrx.size))
print('Total de elementos diferentes de cero: %d' % (np.count_nonzero(UtlMtrx)))
print('Porcentaje de elementos diferentes de cero: %.1f%%' % (100 * np.count_nonzero(UtlMtrx) / UtlMtrx.size))

Total de elementos de la matriz de utilidad: 1478700
Total de elementos diferentes de cero: 32959
Porcentaje de elementos diferentes de cero: 2.2%


Vamos a desarrollar el sistema de recomendación utilizando el método de reducción de dimensionalidad SVD (Singular Value Decomposition) o Descomposición de Valores Singulares.
Esta solución es comúnmente empleada como un primer sistema de recomendación, basado en las calificaciones que los usuarios han otorgado en el pasado.


In [89]:
# Transponer la matriz de utilidad

X = UtlMtrx.T

print(X.shape)
X.head()

(62, 23850)


user_id,302,318,442,464,474,487,526,663,678,703,...,999606,999621,999666,999686,999710,999834,999865,999892,999914,999952
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ballgown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blazer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blouse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bomber,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
caftan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
SVD = TruncatedSVD(n_components=658, random_state=42)  
SVD.fit(X)

# Vamos a validar con 7 valores singulares

num_sv = 7

print('Cantidad de información simplificada con los primeros %d vectores singulares:' % num_sv)
print('%.1f%%' %  (100 * (1- (SVD.singular_values_[0:num_sv]).sum() / (SVD.singular_values_).sum())))

Cantidad de información simplificada con los primeros 7 vectores singulares:
43.2%


In [105]:
num_sv = 7

SVD = TruncatedSVD(n_components=num_sv, random_state=42)

clothes_matrix = SVD.fit_transform(X)
clothes_matrix.shape

(62, 7)

Utilizando la nueva matriz, calculamos el coeficiente de correlación de Pearson.

Este será el indicador que nos mostrará la similitud entre categorías y, por ende, si es o no una opción recomendable.

In [106]:
# Matriz de correlación Pearson

corrMtx = np.corrcoef(clothes_matrix)
 
corrMtx[0:5,0:5]

array([[1.        , 0.11734568, 0.06352402, 0.08279376, 0.08856077],
       [0.11734568, 1.        , 0.99183639, 0.97821323, 0.91453735],
       [0.06352402, 0.99183639, 1.        , 0.9924488 , 0.95187131],
       [0.08279376, 0.97821323, 0.9924488 , 1.        , 0.9760327 ],
       [0.08856077, 0.91453735, 0.95187131, 0.9760327 , 1.        ]])

In [120]:
liked = 'dress'

names = UtlMtrx.columns 
names_list = list(names)
id_liked = names_list.index(liked)

id_liked

13

In [121]:
# Revisamos la matriz de correlación con algunas otras categorías

corr_recom = corrMtx[id_liked]
print(corr_recom.shape)
corr_recom[0:10]

(62,)


array([0.97152554, 0.29974868, 0.23865196, 0.24930346, 0.22194159,
       0.1712353 , 0.35565556, 0.29833755, 0.23785135, 0.11320672])

Obtenemos las recomendaciones de acuerdo al la categoría seleccionada
Ejemplo 1:
limitamos la salida de la lista especificando un valor alto del coeficiente de correlación, en este caso, entre 0.80 y 0.99:

In [124]:
# Categoría gown
print('Recomendaciones: ')

# Seleccionamos categorías entre el 80 y el 99 de correlación
list(names[(corr_recom > 0.80) & (corr_recom < 0.99)]) 

Recomendaciones: 


['ballgown', 'frock', 'mini', 'suit']

In [126]:
ids = (corr_recom > .80) & (corr_recom < 0.99)
tmp = list()

for i in range(len(names[ids])):
    tmp.append((corr_recom[ids][i], names[ids][i]))

sorted(tmp, key=lambda x:x[0], reverse=True) 

[(0.9715255443159208, 'ballgown'),
 (0.8914046099689586, 'frock'),
 (0.876546702570356, 'suit'),
 (0.813429409648354, 'mini')]

Como podemos observar, obtenemos una recomendación basada en la correlación de nuestra prenda seleccionada, podemos hacer esto mismo con las calificaciones de cliente