In [639]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Load Data

In [640]:
df = pd.read_csv("fashion3.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,review_summary,review_text,polarity,reviewLen,wordCount
0,0,8490,159891,1,26.0,1,5.0,2.0,38.0,32.0,64.0,5.0,3,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,0.3925,301,58
1,1,8493,159891,1,31.0,8,3.0,5.0,41.0,36.0,62.0,5.0,3,This dress looks great on,This dress looks great on. I'm a pretty curren...,0.18625,273,57
2,2,8521,160625,1,32.0,5,5.0,2.0,,32.0,65.0,5.0,1,Used as my wedding dress,Used as my wedding dress and I LOVED it!PROS ...,0.172436,452,85
3,3,8523,160625,1,32.0,11,4.0,5.0,43.0,38.0,67.0,5.0,2,"I love this dress, it's b","I love this dress, it's beautiful ! Wore this ...",0.413839,581,125
4,4,8524,160625,1,28.0,7,5.0,3.0,32.0,34.0,65.0,5.0,3,I love this dress!!! I bo,I love this dress!!! I bought it specifically ...,0.400769,627,117


In [641]:
df.iloc[:,2:13]
col_lst = list(range(2,13))
col_lst.extend([15,17])
df = df.iloc[:,col_lst]
df_clean = df.dropna()

In [642]:
df_clean.head()

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
0,159891,1,26.0,1,5.0,2.0,38.0,32.0,64.0,5.0,3,0.3925,58
1,159891,1,31.0,8,3.0,5.0,41.0,36.0,62.0,5.0,3,0.18625,57
3,160625,1,32.0,11,4.0,5.0,43.0,38.0,67.0,5.0,2,0.413839,125
4,160625,1,28.0,7,5.0,3.0,32.0,34.0,65.0,5.0,3,0.400769,117
7,160625,1,27.0,5,5.0,3.0,38.0,36.0,68.0,5.0,2,0.098889,42


# SVD

User's rating scores:

* `quality`, and `polarity`

Because both features have different scaling, we need to make sure that they have same scaling to combine them together

In [643]:
# Scaling range: [0,1]
# After scaling, take the average to get the overall score
col_lst = [4,11]
score = df_clean.iloc[:,col_lst].apply(lambda iterator: ((iterator - iterator.min())/(iterator.max() - iterator.min())).round(2))
score["avg"] = score.mean(axis=1)
score.head()

Unnamed: 0,quality,polarity,avg
0,1.0,0.69,0.845
1,0.5,0.59,0.545
3,0.75,0.7,0.725
4,1.0,0.7,0.85
7,1.0,0.54,0.77


In [644]:
 # Add user id and category
score.insert(0, "user_id", score.index + 1)
score.insert(1, "item_id", df_clean["category"])

In [645]:
# only keep the first three columns
score = score.iloc[:,0:3]
score.head()

Unnamed: 0,user_id,item_id,quality
0,1,1,1.0
1,2,1,0.5
3,4,1,0.75
4,5,1,1.0
7,8,1,1.0


In [646]:
score2 = score["quality"]

In [647]:
score = score.pivot(index='user_id', columns='item_id', values='quality') 

In [648]:
score.head()

item_id,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,,,
2,0.5,,,
4,0.75,,,
5,1.0,,,
8,1.0,,,


In [649]:
df_top = df_clean[df_clean['category'] == 2]
df_top

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
899,423314,2,50.0,38,2.0,5.0,60.0,44.0,69.0,3.0,2,0.063500,74
900,423572,2,28.0,8,5.0,3.0,39.0,34.0,66.0,5.0,3,0.328175,51
901,423572,2,33.0,15,4.0,5.0,40.0,36.0,67.0,4.0,3,0.354894,56
902,423572,2,33.0,15,5.0,6.0,43.0,36.0,64.0,5.0,1,0.318429,32
903,423572,2,30.0,8,3.0,4.0,40.0,36.0,64.0,5.0,2,0.015625,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,650890,2,36.0,20,5.0,5.0,42.0,40.0,65.0,5.0,3,0.712500,14
1546,650890,2,46.0,20,5.0,5.0,46.0,42.0,64.0,5.0,3,0.225000,68
1548,652823,2,29.0,12,5.0,8.0,41.0,34.0,66.0,5.0,3,0.009722,40
1549,652823,2,31.0,8,2.0,5.0,31.0,36.0,64.0,5.0,3,0.358854,32


In [650]:
df_top = df_clean[['waist','height_inches','cup size', 'bra size','fit']].copy()
df_top

Unnamed: 0,waist,height_inches,cup size,bra size,fit
0,26.0,64.0,2.0,32.0,3
1,31.0,62.0,5.0,36.0,3
3,32.0,67.0,5.0,38.0,2
4,28.0,65.0,3.0,34.0,3
7,27.0,68.0,3.0,36.0,2
...,...,...,...,...,...
2089,48.0,66.0,6.0,44.0,2
2090,30.0,63.0,5.0,36.0,1
2093,28.0,66.0,5.0,34.0,3
2094,30.0,64.0,3.0,42.0,3


In [651]:
score.fillna(0)

item_id,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.00,0.0,0.0,0.00
2,0.50,0.0,0.0,0.00
4,0.75,0.0,0.0,0.00
5,1.00,0.0,0.0,0.00
8,1.00,0.0,0.0,0.00
...,...,...,...,...
2090,0.00,0.0,0.0,0.00
2091,0.00,0.0,0.0,1.00
2094,0.00,0.0,0.0,1.00
2095,0.00,0.0,0.0,1.00


In [652]:
X = df_top
y = score2

In [653]:
scaler = StandardScaler()

In [654]:
scaler.fit(X)

StandardScaler()

In [655]:
X_scaled = scaler.transform(X)

In [656]:
reg = KNeighborsRegressor(n_neighbors=15)

In [657]:
reg.fit(X_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [658]:
score["top pred"] = reg.predict(X_scaled)

In [659]:
score

item_id,1,2,3,4,top pred
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.00,,,,0.783333
2,0.50,,,,0.783333
4,0.75,,,,0.700000
5,1.00,,,,0.933333
8,1.00,,,,0.666667
...,...,...,...,...,...
2090,,,,0.00,0.700000
2091,,,,1.00,0.800000
2094,,,,1.00,0.800000
2095,,,,1.00,0.883333


In [660]:
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled,y,test_size=0.2)

In [661]:
reg.fit(X_scaled_train,y_train)

KNeighborsRegressor(n_neighbors=15)

In [662]:
mean_squared_error(reg.predict(X_scaled_test),y_test)

0.0632626956853761

In [663]:
mean_squared_error(reg.predict(X_scaled_train),y_train)

0.05307727185948836

In [664]:
def get_scores(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(X_scaled_train, y_train)
    train_error = mean_absolute_error(reg.predict(X_scaled_train), y_train)
    test_error = mean_absolute_error(reg.predict(X_scaled_test), y_test)
    return (train_error, test_error)

df_scores = pd.DataFrame({"k":range(1,150),"train_error":np.nan,"test_error":np.nan})

for i in df_scores.index:
    df_scores.loc[i,["train_error","test_error"]] = get_scores(df_scores.loc[i,"k"])

df_scores["kinv"] = 1/df_scores.k

In [665]:
ctrain = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [666]:
ctest = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [667]:
# plot for mean squared error for tops
ctrain+ctest

In [668]:
df_bottoms = df_clean[df_clean['category'] == 3]
df_bottoms

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
1552,654079,3,32.0,12,4.0,6.0,42.0,38.0,67.0,5.0,3,0.500000,35
1553,654079,3,32.0,12,5.0,5.0,43.0,38.0,67.0,5.0,3,0.447619,78
1554,654079,3,37.0,15,5.0,5.0,43.0,36.0,64.0,5.0,3,0.625000,11
1557,654585,3,35.0,12,3.0,8.0,42.0,34.0,62.0,4.0,3,0.543452,23
1558,654585,3,32.0,12,3.0,6.0,43.0,36.0,65.0,4.0,3,0.225926,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,765620,3,28.0,8,3.0,5.0,39.0,34.0,66.0,5.0,3,0.115720,203
1999,766618,3,27.0,7,5.0,3.0,35.0,30.0,66.0,3.0,1,0.240833,30
2000,766618,3,30.0,11,5.0,5.0,40.0,36.0,69.0,5.0,1,0.241926,100
2001,768242,3,32.0,10,3.0,6.0,43.0,36.0,65.0,5.0,3,0.800000,29


In [669]:
df_bottoms = df_clean[['waist','height_inches','fit']].copy()
df_bottoms

Unnamed: 0,waist,height_inches,fit
0,26.0,64.0,3
1,31.0,62.0,3
3,32.0,67.0,2
4,28.0,65.0,3
7,27.0,68.0,2
...,...,...,...
2089,48.0,66.0,2
2090,30.0,63.0,1
2093,28.0,66.0,3
2094,30.0,64.0,3


In [670]:
Z = df_bottoms

In [671]:
scaler.fit(Z)

StandardScaler()

In [672]:
Z_scaled = scaler.transform(Z)

In [673]:
Z_scaled_train, Z_scaled_test, y_train, y_test = train_test_split(Z_scaled,y,test_size=0.2)

In [674]:
reg.fit(Z_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [675]:
score["bottom pred"] = reg.predict(Z_scaled)

In [676]:
mean_squared_error(reg.predict(Z_scaled_test),y_test)

0.05187285223367697

In [677]:
mean_squared_error(reg.predict(Z_scaled_train),y_train)

0.05215301641848034

In [678]:
def get_scores_b(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(Z_scaled_train, y_train)
    train_error_b = mean_absolute_error(reg.predict(Z_scaled_train), y_train)
    test_error_b = mean_absolute_error(reg.predict(Z_scaled_test), y_test)
    return (train_error_b, test_error_b)

df_scores_b = pd.DataFrame({"k":range(1,150),"train_error_b":np.nan,"test_error_b":np.nan})

for i in df_scores.index:
    df_scores_b.loc[i,["train_error_b","test_error_b"]] = get_scores_b(df_scores_b.loc[i,"k"])

df_scores_b["kinv"] = 1/df_scores_b.k

In [679]:
ctrain_b = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [680]:
ctest_b = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [681]:
ctrain_b+ctest_b

In [682]:
df_ow = df_clean[df_clean['category'] == 4]
df_ow

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
2005,769971,4,40.0,26,5.0,4.0,50.0,40.0,66.0,5.0,3,0.229211,107
2007,771389,4,38.0,12,4.0,5.0,40.0,36.0,65.0,5.0,3,0.281944,37
2008,780724,4,40.0,15,5.0,3.0,42.0,38.0,69.0,5.0,2,0.227083,36
2012,780724,4,31.0,12,4.0,4.0,41.0,36.0,66.0,5.0,3,0.321759,138
2013,780724,4,27.0,4,5.0,3.0,35.0,34.0,69.0,5.0,3,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089,803464,4,48.0,32,1.0,6.0,58.0,44.0,66.0,3.0,2,0.218254,24
2090,806406,4,30.0,12,5.0,5.0,40.0,36.0,63.0,5.0,1,0.362963,23
2093,806856,4,28.0,8,5.0,5.0,36.0,34.0,66.0,5.0,3,0.302696,127
2094,807252,4,30.0,20,5.0,3.0,39.0,42.0,64.0,5.0,3,0.373418,42


In [683]:
df_ow = df_clean[['waist','height_inches','cup size', 'bra size','fit']].copy()
df_ow

Unnamed: 0,waist,height_inches,cup size,bra size,fit
0,26.0,64.0,2.0,32.0,3
1,31.0,62.0,5.0,36.0,3
3,32.0,67.0,5.0,38.0,2
4,28.0,65.0,3.0,34.0,3
7,27.0,68.0,3.0,36.0,2
...,...,...,...,...,...
2089,48.0,66.0,6.0,44.0,2
2090,30.0,63.0,5.0,36.0,1
2093,28.0,66.0,5.0,34.0,3
2094,30.0,64.0,3.0,42.0,3


In [684]:
W = df_ow

In [685]:
scaler.fit(W)

StandardScaler()

In [686]:
W_scaled = scaler.transform(W)

In [687]:
W_scaled_train, W_scaled_test, y_train,_test = train_test_split(W_scaled,y,test_size=0.2)

In [688]:
reg.fit(W_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [689]:
score["outerwear pred"] = reg.predict(W_scaled)

In [690]:
mean_squared_error(reg.predict(W_scaled_test),y_test)

0.0717955326460481

In [691]:
mean_squared_error(reg.predict(W_scaled_train),y_train)

0.05330732149675448

In [692]:
def get_scores_o(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(W_scaled_train, y_train)
    train_error_o = mean_absolute_error(reg.predict(W_scaled_train), y_train)
    test_error_o = mean_absolute_error(reg.predict(W_scaled_test), y_test)
    return (train_error_o, test_error_o)

df_scores_o = pd.DataFrame({"k":range(1,150),"train_error_o":np.nan,"test_error_o":np.nan})

for i in df_scores.index:
    df_scores_o.loc[i,["train_error_o","test_error_o"]] = get_scores_o(df_scores_o.loc[i,"k"])

df_scores_o["kinv"] = 1/df_scores_o.k

In [693]:
ctrain_o = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [694]:
ctest_o = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [695]:
ctrain_o+ctest_o

In [696]:
df_d = df_clean[df_clean['category'] == 1]
df_d

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
0,159891,1,26.0,1,5.0,2.0,38.0,32.0,64.0,5.0,3,0.392500,58
1,159891,1,31.0,8,3.0,5.0,41.0,36.0,62.0,5.0,3,0.186250,57
3,160625,1,32.0,11,4.0,5.0,43.0,38.0,67.0,5.0,2,0.413839,125
4,160625,1,28.0,7,5.0,3.0,32.0,34.0,65.0,5.0,3,0.400769,117
7,160625,1,27.0,5,5.0,3.0,38.0,36.0,68.0,5.0,2,0.098889,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,397642,1,29.0,11,3.0,6.0,41.0,34.0,66.0,5.0,3,0.136592,79
895,397642,1,35.0,15,4.0,4.0,41.0,38.0,64.0,5.0,1,0.130824,126
896,400486,1,27.0,5,4.0,2.0,37.0,36.0,62.0,5.0,3,0.550000,21
897,400541,1,34.0,13,5.0,6.0,42.0,36.0,64.0,5.0,3,0.540000,32


In [697]:
df_d = df_clean[['waist','height_inches','cup size', 'bra size','fit']].copy()
df_d

Unnamed: 0,waist,height_inches,cup size,bra size,fit
0,26.0,64.0,2.0,32.0,3
1,31.0,62.0,5.0,36.0,3
3,32.0,67.0,5.0,38.0,2
4,28.0,65.0,3.0,34.0,3
7,27.0,68.0,3.0,36.0,2
...,...,...,...,...,...
2089,48.0,66.0,6.0,44.0,2
2090,30.0,63.0,5.0,36.0,1
2093,28.0,66.0,5.0,34.0,3
2094,30.0,64.0,3.0,42.0,3


In [698]:
U = df_d

In [699]:
scaler.fit(U)

StandardScaler()

In [700]:
U_scaled = scaler.transform(U)

In [701]:
U_scaled_train, U_scaled_test, y_train,_test = train_test_split(U_scaled,y,test_size=0.2)

In [702]:
reg.fit(U_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [703]:
score["dress pred"] = reg.predict(U_scaled)

In [704]:
mean_squared_error(reg.predict(U_scaled_test),y_test)

0.07050782741504391

In [705]:
mean_squared_error(reg.predict(U_scaled_train),y_train)

0.0525159889270714

In [706]:
def get_scores_d(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(U_scaled_train, y_train)
    train_error_d = mean_absolute_error(reg.predict(U_scaled_train), y_train)
    test_error_d = mean_absolute_error(reg.predict(U_scaled_test), y_test)
    return (train_error_d, test_error_d)

df_scores_d = pd.DataFrame({"k":range(1,150),"train_error_d":np.nan,"test_error_d":np.nan})

for i in df_scores.index:
    df_scores_d.loc[i,["train_error_d","test_error_d"]] = get_scores_d(df_scores_d.loc[i,"k"])

df_scores_d["kinv"] = 1/df_scores_d.k

In [707]:
ctrain_d = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [708]:
ctest_d = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [709]:
ctrain_d+ctest_d