In [299]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Load Data

In [300]:
df = pd.read_csv("fashion3.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,review_summary,review_text,polarity,reviewLen,wordCount
0,0,8490,159891,1,26.0,1,5.0,2.0,38.0,32.0,64.0,5.0,3,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,0.3925,301,58
1,1,8493,159891,1,31.0,8,3.0,5.0,41.0,36.0,62.0,5.0,3,This dress looks great on,This dress looks great on. I'm a pretty curren...,0.18625,273,57
2,2,8521,160625,1,32.0,5,5.0,2.0,,32.0,65.0,5.0,1,Used as my wedding dress,Used as my wedding dress and I LOVED it!PROS ...,0.172436,452,85
3,3,8523,160625,1,32.0,11,4.0,5.0,43.0,38.0,67.0,5.0,2,"I love this dress, it's b","I love this dress, it's beautiful ! Wore this ...",0.413839,581,125
4,4,8524,160625,1,28.0,7,5.0,3.0,32.0,34.0,65.0,5.0,3,I love this dress!!! I bo,I love this dress!!! I bought it specifically ...,0.400769,627,117


In [301]:
df.iloc[:,2:13]
col_lst = list(range(2,13))
col_lst.extend([15,17])
df = df.iloc[:,col_lst]
df_clean = df.dropna()

In [302]:
df_clean.head()

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,polarity,wordCount
0,159891,1,26.0,1,5.0,2.0,38.0,32.0,64.0,5.0,3,0.3925,58
1,159891,1,31.0,8,3.0,5.0,41.0,36.0,62.0,5.0,3,0.18625,57
3,160625,1,32.0,11,4.0,5.0,43.0,38.0,67.0,5.0,2,0.413839,125
4,160625,1,28.0,7,5.0,3.0,32.0,34.0,65.0,5.0,3,0.400769,117
7,160625,1,27.0,5,5.0,3.0,38.0,36.0,68.0,5.0,2,0.098889,42


# SVD

User's rating scores:

* `quality`, and `polarity`

Because both features have different scaling, we need to make sure that they have same scaling to combine them together

In [303]:
# Scaling range: [0,1]
# After scaling, take the average to get the overall score
col_lst = [4,11]
score = df_clean.iloc[:,col_lst].apply(lambda iterator: ((iterator - iterator.min())/(iterator.max() - iterator.min())).round(2))
score["avg"] = score.mean(axis=1)
score.head()

Unnamed: 0,quality,polarity,avg
0,1.0,0.69,0.845
1,0.5,0.59,0.545
3,0.75,0.7,0.725
4,1.0,0.7,0.85
7,1.0,0.54,0.77


In [304]:
 # Add user id and category
score.insert(0, "user_id", score.index + 1)
score.insert(1, "item_id", df_clean["category"])

In [305]:
# only keep the first three columns
score = score.iloc[:,0:3]
score.head()

Unnamed: 0,user_id,item_id,quality
0,1,1,1.0
1,2,1,0.5
3,4,1,0.75
4,5,1,1.0
7,8,1,1.0


In [306]:
score2 = score["quality"]

In [307]:
score = score.pivot(index='user_id', columns='item_id', values='quality') 

In [308]:
score.head()

item_id,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,,,
2,0.5,,,
4,0.75,,,
5,1.0,,,
8,1.0,,,


In [309]:
df_top = df_clean[['waist','height_inches','cup size', 'bra size']].copy()

In [310]:
score.fillna(0)

item_id,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.00,0.0,0.0,0.00
2,0.50,0.0,0.0,0.00
4,0.75,0.0,0.0,0.00
5,1.00,0.0,0.0,0.00
8,1.00,0.0,0.0,0.00
...,...,...,...,...
2090,0.00,0.0,0.0,0.00
2091,0.00,0.0,0.0,1.00
2094,0.00,0.0,0.0,1.00
2095,0.00,0.0,0.0,1.00


In [311]:
X = df_top
y = score2

In [312]:
scaler = StandardScaler()

In [313]:
scaler.fit(X)

StandardScaler()

In [314]:
X_scaled = scaler.transform(X)

In [315]:
reg = KNeighborsRegressor(n_neighbors=15)

In [316]:
reg.fit(X_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [317]:
score["top pred"] = reg.predict(X_scaled)

In [318]:
score

item_id,1,2,3,4,top pred
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.00,,,,0.700000
2,0.50,,,,0.866667
4,0.75,,,,0.866667
5,1.00,,,,0.783333
8,1.00,,,,0.816667
...,...,...,...,...,...
2090,,,,0.00,0.800000
2091,,,,1.00,0.750000
2094,,,,1.00,0.750000
2095,,,,1.00,0.783333


In [319]:
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled,y,test_size=0.2)

In [320]:
reg.fit(X_scaled_train,y_train)

KNeighborsRegressor(n_neighbors=15)

In [321]:
mean_squared_error(reg.predict(X_scaled_test),y_test)

0.0648663612065674

In [322]:
mean_squared_error(reg.predict(X_scaled_train),y_train)

0.05556796487208858

In [323]:
def get_scores(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(X_scaled_train, y_train)
    train_error = mean_absolute_error(reg.predict(X_scaled_train), y_train)
    test_error = mean_absolute_error(reg.predict(X_scaled_test), y_test)
    return (train_error, test_error)

df_scores = pd.DataFrame({"k":range(1,150),"train_error":np.nan,"test_error":np.nan})

for i in df_scores.index:
    df_scores.loc[i,["train_error","test_error"]] = get_scores(df_scores.loc[i,"k"])

df_scores["kinv"] = 1/df_scores.k

In [324]:
ctrain = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [325]:
ctest = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [326]:
# plot for mean squared error for tops
ctrain+ctest

In [327]:
df_bottoms = df_clean[['waist','height_inches']].copy()

In [328]:
Z = df_bottoms

In [329]:
scaler.fit(Z)

StandardScaler()

In [330]:
Z_scaled = scaler.transform(Z)

In [331]:
Z_scaled_train, Z_scaled_test, y_train, y_test = train_test_split(Z_scaled,y,test_size=0.2)

In [332]:
reg.fit(Z_scaled,y)

KNeighborsRegressor(n_neighbors=15)

In [333]:
mean_squared_error(reg.predict(Z_scaled_test),y_test)

0.0597012218403971

In [334]:
mean_squared_error(reg.predict(Z_scaled_train),y_train)

0.054334192439862546

In [339]:
def get_scores_b(k):
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(Z_scaled_train, y_train)
    train_error_b = mean_absolute_error(reg.predict(Z_scaled_train), y_train)
    test_error_b = mean_absolute_error(reg.predict(Z_scaled_test), y_test)
    return (train_error_b, test_error_b)

df_scores_b = pd.DataFrame({"k":range(1,150),"train_error_b":np.nan,"test_error_b":np.nan})

for i in df_scores.index:
    df_scores_b.loc[i,["train_error_b","test_error_b"]] = get_scores_b(df_scores_b.loc[i,"k"])

df_scores_b["kinv"] = 1/df_scores_b.k

In [340]:
ctrain_b = alt.Chart(df_scores).mark_line(color="#FF0000").encode(
    x = "kinv",
    y = "train_error"
)

In [341]:
ctest_b = alt.Chart(df_scores).mark_line(color="#045FB4").encode(
    x = alt.X("kinv",title='k-inverse'),
    y = "test_error"
)

In [342]:
ctrain_b+ctest_b

In [None]:
df_dress = df_clean[['waist','height_inches','cup size', 'bra size']].copy()

NameError: name 'df_ow' is not defined