In [1]:
import gzip
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield eval(l, {"__builtins__": None}, {"null": None})

In [3]:
allRatings = [item for item in parse("../data/renttherunway_final_data.json.gz")]

In [4]:
allRatings = pd.DataFrame(allRatings)

In [5]:
allRatings.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36,"June 18, 2013"
2,fit,360448,,1063761,,10,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27,"September 26, 2016"


In [6]:
allRatings.iloc[0]

fit                                                             fit
user_id                                                      420272
bust size                                                       34d
item_id                                                     2260466
weight                                                       137lbs
rating                                                           10
rented for                                                 vacation
review_text       An adorable romper! Belt and zipper were a lit...
body type                                                 hourglass
review_summary                                 So many compliments!
category                                                     romper
height                                                        5' 8"
size                                                             14
age                                                              28
review_date                                     

In [7]:
allRatings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   fit             192544 non-null  object
 1   user_id         192544 non-null  object
 2   bust size       174133 non-null  object
 3   item_id         192544 non-null  object
 4   weight          162562 non-null  object
 5   rating          192462 non-null  object
 6   rented for      192534 non-null  object
 7   review_text     192544 non-null  object
 8   body type       177907 non-null  object
 9   review_summary  192544 non-null  object
 10  category        192544 non-null  object
 11  height          191867 non-null  object
 12  size            192544 non-null  int64 
 13  age             191584 non-null  object
 14  review_date     192544 non-null  object
dtypes: int64(1), object(14)
memory usage: 22.0+ MB


In [8]:
allRatings['item_id'].describe()

count     192544
unique      5850
top       126335
freq        2241
Name: item_id, dtype: object

In [9]:
allRatings.isnull().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight            29982
rating               82
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height              677
size                  0
age                 960
review_date           0
dtype: int64

In [10]:
allRatings = allRatings.dropna(subset=['bust size', 'weight', 'rating', 'rented for', 'body type','height', 'age'])

In [11]:
allRatings.isnull().sum()

fit               0
user_id           0
bust size         0
item_id           0
weight            0
rating            0
rented for        0
review_text       0
body type         0
review_summary    0
category          0
height            0
size              0
age               0
review_date       0
dtype: int64

In [12]:
allRatings['height'][0]

'5\' 8"'

In [13]:
# Converting height to inches
allRatings['height'] = (
    allRatings['height']
    .str.extract(r"(\d+)' ?(\d+)?")
    .fillna(0) 
    .astype(int)
    .apply(lambda x: x[0] * 12 + x[1], axis=1)
)

In [14]:
# Convert 'weight' to numeric by removing 'lbs'
allRatings['weight'] = pd.to_numeric(allRatings['weight'].str.replace('lbs', ''), errors='coerce')

In [15]:
# Extract numeric bust size from 'bust size' and handle missing values gracefully
allRatings['bust_numeric'] = allRatings['bust size'].str.extract(r'(\d+)').astype(int)

In [16]:
allRatings['cup_size'] = allRatings['bust size'].str.extract(r'([A-Za-z]+)')

# Encode the cup size alphabetically (A = 1, B = 2, ..., AA = 27)
allRatings['cup_size_encoded'] = allRatings['cup_size'].apply(lambda x: sum([(ord(char) - 64) for char in x.upper()]) if pd.notna(x) else 0)

In [17]:
# combine bust numeric and cup size as a single feature
allRatings['bust_combined'] = allRatings['bust_numeric'] + allRatings['cup_size_encoded'] * 0.1

In [18]:
allRatings.iloc[0]

fit                                                               fit
user_id                                                        420272
bust size                                                         34d
item_id                                                       2260466
weight                                                            137
rating                                                             10
rented for                                                   vacation
review_text         An adorable romper! Belt and zipper were a lit...
body type                                                   hourglass
review_summary                                   So many compliments!
category                                                       romper
height                                                             68
size                                                               14
age                                                                28
review_date         

In [19]:
allRatings['rating'] = allRatings['rating'].astype(int)

In [20]:
# allRatings = pd.get_dummies(allRatings, columns=['body type'], drop_first=True)

In [21]:
allRatings.drop(columns=['bust size', 'review_date', 'cup_size', 'cup_size_encoded', 'review_text', 'review_summary', 'bust_numeric'], inplace = True)

In [22]:
allRatings.head()

Unnamed: 0,fit,user_id,item_id,weight,rating,rented for,body type,category,height,size,age,bust_combined
0,fit,420272,2260466,137,10,vacation,hourglass,romper,68,14,28,34.4
1,fit,273551,153475,132,10,other,straight & narrow,gown,66,12,36,34.2
3,fit,909926,126335,135,8,formal affair,pear,dress,65,8,34,34.3
4,fit,151944,616682,145,10,wedding,athletic,gown,69,12,27,34.2
5,fit,734848,364092,138,8,date,athletic,dress,68,8,45,32.2


In [23]:
allRatings['user_id'] = allRatings['user_id'].astype(str)
allRatings['item_id'] = allRatings['item_id'].astype(str)

In [24]:
# Create a user-item interaction matrix
user_item_matrix = allRatings.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)


In [25]:
pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [26]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Convert the DataFrame to a Surprise Dataset
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(allRatings[['user_id', 'item_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

# Apply SVD
svd = SVD()
svd.fit(trainset)

# Predictions
predictions = svd.test(testset)
accuracy.rmse(predictions)

# Generate top recommendations for a user
user_id = '420272'  # Example user
recommendations = []
for item_id in user_item_matrix.columns:
    pred = svd.predict(user_id, item_id)
    recommendations.append((item_id, pred.est))

# Sort recommendations by predicted rating
recommendations.sort(key=lambda x: x[1], reverse=True)
top_recommendations = recommendations[:5]  # Top 5 recommended items


RMSE: 1.4073


In [27]:
top_recommendations

[('1312996', 9.984366796209635),
 ('1215281', 9.959987573213724),
 ('1335648', 9.946120491873826),
 ('278878', 9.9377352910563),
 ('1584094', 9.916303991075198)]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example of using the 'category' and 'body type' as features for item profiles
item_features = allRatings[['category', 'body type']].apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Use TF-IDF to vectorize these features
vectorizer = TfidfVectorizer(stop_words='english')
item_profiles = vectorizer.fit_transform(item_features)

# Compute similarity between items
item_similarity = cosine_similarity(item_profiles, item_profiles)

# Convert to DataFrame for easier access
item_similarity_df = pd.DataFrame(item_similarity, index=allRatings['item_id'], columns=allRatings['item_id'])

# Example: Get similar items for a given item
item_id = '2260466'  # Example item
similar_items = item_similarity_df[item_id].sort_values(ascending=False).head(5)
