In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz



In [2]:
outfits = pd.read_csv('outfit_combinations.csv')

In [3]:
outfits.head()

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


In [4]:
# DataFrame.shape[0] is the number of rows
print(f'There are {outfits.shape[0]} no of rows')

There are 5291 no of rows


In [5]:
# count the distinct outfit ids
print(f"There are {outfits['outfit_id'].unique().size} unique Outfits")

There are 1137 unique Outfits


In [6]:
# count the distinct product ids
print(f"There are {outfits['product_id'].unique().size} unique products")

There are 804 unique products


1. Input is a product ID.
2. Grab the list of outfit IDs that contain that product.
3. Iterate through each outfit ID:
    a. For each product in the outfit, return `outfit_item_type + brand + product_full_name`, one product per line.

# test process for building function

In [7]:
test_wrong = 'something random that maybe I would want (23049nnvlsjerou293094kljd)'

In [8]:
test1 = 'bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)'

In [9]:
test2 = 'shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)'

In [10]:
# we see here that this product id is part of two different outfits
outfits[outfits['product_id'] == '01DMBRYVA2P5H24WK0HTK4R0A1']

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


In [11]:
# the first outfit has four products in it
outfits[outfits['outfit_id'] == '01DDBHC62ES5K80P0KYJ56AM2T']

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump


In [12]:
# Isolate the first outfit id from the rows pulled by product id.
# .iloc[0] selects by position; a plain [0] is a *label* lookup and only works
# when the first matching row happens to carry index label 0.
outfit = outfits.loc[outfits['product_id'] == '01DMBRYVA2P5H24WK0HTK4R0A1', 'outfit_id'].iloc[0]

In [13]:
outfit

'01DDBHC62ES5K80P0KYJ56AM2T'

In [14]:
outfits[outfits['outfit_id'] == outfit].iloc[0,2]

'bottom'

In [15]:
outfits['outfit_id'][outfits['product_id'] == '01DMBRYVA2P5H24WK0HTK4R0A1']

0    01DDBHC62ES5K80P0KYJ56AM2T
4    01DMHCX50CFX5YNG99F3Y65GQW
Name: outfit_id, dtype: object

In [16]:
test = ''

In [17]:
testappend = 'hello'

In [18]:
test += testappend

In [19]:
test

'hello'

In [20]:
re.search(r'\(([^)]+)', test1).group(1)
#https://stackoverflow.com/questions/38999344/extract-string-within-parentheses-python
# user Maroun

'01DMBRYVA2P5H24WK0HTK4R0A1'

In [21]:
# Outfit compiler if product ID is in outfit df
# starting point - add cosine similarity/fuzzy matching to this
def outfit_compiler_v2(text):
    """Print every outfit that contains the product named in ``text``.

    The product ID is taken from the first parenthesised group in ``text``
    (e.g. ``'bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)'``)
    and matched exactly against ``product_id`` in the module-level ``outfits``
    DataFrame. For each outfit found, a numbered header is printed, then
    ``text`` itself, then the outfit's other pieces formatted as
    ``'<outfit_item_type>: <brand> <product_full_name> (<product_id>)'``.
    Nothing is printed when the product ID does not appear in ``outfits``.

    Parameters
    ----------
    text : str
        Product description containing ``(<product_id>)``.

    Raises
    ------
    ValueError
        If ``text`` contains no parenthesised product ID.
    """
    # pull product_id from input
    id_match = re.search(r'\(([^)]+)', text)
    if id_match is None:
        raise ValueError(
            "No product ID found in input; expected text like 'type: brand name (PRODUCT_ID)'"
        )
    product_id = id_match.group(1)

    # one outfit id per row in which the product appears
    outfit_ids = outfits.loc[outfits['product_id'] == product_id, 'outfit_id']
    piece_columns = ['outfit_item_type', 'brand', 'product_full_name', 'product_id']

    for outfit_num, outfit_id in enumerate(outfit_ids, start=1):
        # filter the catalogue once per outfit instead of once per printed field
        pieces = outfits[outfits['outfit_id'] == outfit_id]
        print('Outfit Recommendation', outfit_num)
        print(text)  # the input is always shown as the first product in the outfit
        # the remaining products; columns are addressed by name, not position
        others = pieces.loc[pieces['product_id'] != product_id, piece_columns]
        for item_type, brand, name, pid in others.itertuples(index=False, name=None):
            print(f'{item_type}: {brand} {name} ({pid})')
        print('\n')

# Fuzzywuzzy

In [22]:
query = 'something129490random'

In [23]:
outfits['fuzzy'] = outfits['product_id'].apply(lambda x:fuzz.partial_ratio(x, query))

In [24]:
outfits[outfits['fuzzy'] == outfits['fuzzy'].max()]

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,fuzzy
729,01DVBQN0A1NQVQY805R202YW8H,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26
739,01DVBQN0A2140HYFMQ04YHKCPD,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26
743,01DVBQN0A21NSDFGJC5VZVS7S2,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26
769,01DVBQN0A2EK6QV31FEJHNTYZ2,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26
773,01DVBQN0A2G9FJWYSJKV22K2EH,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26
783,01DVBQN0A2SCBC7953HFK47P18,01DT0DKEZ2DQZ9SXHE34790QGC,top,ATM Anthony Thomas Melillo,Woven Boyfriend V-Neck Tee,26


In [25]:
outfits[outfits['fuzzy'] == outfits['fuzzy'].max()].iloc[0,1]

'01DT0DKEZ2DQZ9SXHE34790QGC'

In [26]:
# Outfit compiler for fuzzywuzzy
def outfit_compiler_fuzzy(text):
    """Print the outfits for the catalogue product that best fuzzy-matches ``text``.

    Every ``product_id`` in the module-level ``outfits`` DataFrame is scored
    against the whole of ``text`` with ``fuzz.partial_ratio``; the first row
    with the highest score is taken as the intended product. If the ID written
    in parentheses in ``text`` differs from that product (or ``text`` has no
    parenthesised ID at all), a "did you mean" line is printed first. Each
    outfit containing the matched product is then printed: a numbered header,
    the matched product, and the outfit's other pieces, all formatted as
    ``'<outfit_item_type>: <brand> <product_full_name> (<product_id>)'``.

    The scores are kept local; ``outfits`` is not modified.

    Parameters
    ----------
    text : str
        Product description, normally containing ``(<product_id>)``.
    """
    # pull product_id from input; None means "no ID given", which can never
    # equal a catalogue ID and therefore triggers the "did you mean" message
    id_match = re.search(r'\(([^)]+)', text)
    product_id = id_match.group(1) if id_match else None

    if outfits.empty:
        print('No outfits available to search')
        return

    # fuzzy score between the input text and every product id, held in a local
    # Series rather than written back onto the shared DataFrame
    scores = outfits['product_id'].apply(lambda pid: fuzz.partial_ratio(pid, text))
    # np.argmax returns the first position holding the maximum score
    best_row = outfits.iloc[int(np.argmax(scores.to_numpy()))]
    found_product = best_row['product_id']
    # the matched product in the same format as the input
    found_product_desc = (
        f"{best_row['outfit_item_type']}: {best_row['brand']} "
        f"{best_row['product_full_name']} ({found_product})"
    )

    if product_id != found_product:  # the input id isn't in the outfits df
        print('Product ID not found--did you mean', found_product_desc, '?\n')

    # one outfit id per row in which the matched product appears
    outfit_ids = outfits.loc[outfits['product_id'] == found_product, 'outfit_id']
    piece_columns = ['outfit_item_type', 'brand', 'product_full_name', 'product_id']

    for outfit_num, outfit_id in enumerate(outfit_ids, start=1):
        # filter the catalogue once per outfit instead of once per printed field
        pieces = outfits[outfits['outfit_id'] == outfit_id]
        print('Outfit Recommendation', outfit_num)
        print(found_product_desc)  # matched product is shown first
        others = pieces.loc[pieces['product_id'] != found_product, piece_columns]
        for item_type, brand, name, pid in others.itertuples(index=False, name=None):
            print(f'{item_type}: {brand} {name} ({pid})')
        print('\n')

# starting to explore tfidf/similarity

In [27]:
tokens = outfits['product_id']

In [28]:
vectorizer = TfidfVectorizer()

In [29]:
X = vectorizer.fit_transform(tokens)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older installs. list(...) keeps `terms` a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [30]:
# I've been changing this to try to figure out different things
query = '01DMBRYVA2PEPWFTT7RMP5ATTT'

In [31]:
# vectorize the query with the vocabulary learned from the product ids
Q = vectorizer.transform([query])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older installs. list(...) keeps `terms` a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [32]:
results=cosine_similarity(X,Q).reshape((-1,1))

In [33]:
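# np.argmax returns the position of the first maximum in results: the matching row when results contains a 1, but position 0 when every similarity is 0 (i.e. no match at all)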
outfits.iloc[np.argmax(results),0]

'01DDBHC62ES5K80P0KYJ56AM2T'

In [34]:
terms

['01dmbryva2p5h24wk0htk4r0a1',
 '01dmbryva2pepwftt7rmp5aa1t',
 '01dmbryva2q2st7mnyr6eey4tk',
 '01dmbryva2s5t9w793f4cy41he',
 '01dmbryva2zfdyryy5trqzjtbd',
 '01dmhcnt41e14qwp503v7ct9g6',
 '01dpc6z5k3zb8kz0m6yw7z4jx8',
 '01dpc8ampe4czhmjetf1q8ven0',
 '01dpc9xnr3xbh3haaqygykrj5h',
 '01dpcb2keavxxkfvm7fxbne4vy',
 '01dpcdef6syx2e1nt5x7hjbfgy',
 '01dpchnew5f2rhjq3njmvpk6se',
 '01dpchnqm0pa0sxzzzx85pf2zj',
 '01dpcrzwx4s2z8q5hydfm4hneg',
 '01dpcwejrbvwze397fmf9qxbby',
 '01dpcwryqmhrag7nj3q5jjmt42',
 '01dpcye6akjfkjsncbdtqfg52y',
 '01dpcz5dte59vbyd9nwnszkc40',
 '01dpd1r0f6z4ejcpndkpag5dpf',
 '01dpd2fga0fwfxjckmt5qbjyhk',
 '01dpd2fyrjg5wkxx8hekrn9j9b',
 '01dpd2gdftfsppdsnpfq0w3pq7',
 '01dpd2jcxh4nyqtrpja6t6vga9',
 '01dpd350msfxmw7s524jken2ty',
 '01dpd3agtnrnaknp8th9wyg35x',
 '01dpd3j095srerspggyx9ffrkx',
 '01dpd4r5x5tqcwtvtc2aeafc10',
 '01dpd7kn5438z6vnzkfttgy34g',
 '01dpd8c9xn7mqshsstktczzn1n',
 '01dpd9mvzapgvx5p3ay7y53x9v',
 '01dpdvgjabjvq3s3da3tjrd1r7',
 '01dpeb8hx0gk7a07qadnb52404',
 '01dpee

#### trying to use the results vector to return the product id most similar to the query input

In [35]:
# dataframe of all product ids
tok = pd.DataFrame(terms)

In [36]:
tok.head()

Unnamed: 0,0
0,01dmbryva2p5h24wk0htk4r0a1
1,01dmbryva2pepwftt7rmp5aa1t
2,01dmbryva2q2st7mnyr6eey4tk
3,01dmbryva2s5t9w793f4cy41he
4,01dmbryva2zfdyryy5trqzjtbd


In [37]:
# res is the results converted into a boolean dataframe (one row per row of `outfits`)
res = pd.DataFrame(results.astype(bool))
# `results` has one entry per row of `outfits`, whereas `tok` has one row per
# vocabulary term, so `res` cannot be used to mask `tok` (the lengths differ and
# pandas would silently reindex the mask). The query vector Q is expressed over
# the vocabulary itself, so its non-zero entries mark exactly the terms that
# the query matched; ress built this way can be used to index tok.
ress = pd.Series(Q.toarray().ravel() > 0, index=tok.index)

In [38]:
tok[ress]

  """Entry point for launching an IPython kernel.


Unnamed: 0,0


In [39]:
# this is how I realized that the results array doesn't  assign 1 to anything unless the product id is in the outfits df
results.sum()

0.0

In [40]:
outfits.iloc[np.argmax(results),0]

'01DDBHC62ES5K80P0KYJ56AM2T'

In [41]:
outfits.iloc[np.argmax(results),1]

'01DMBRYVA2P5H24WK0HTK4R0A1'

In [42]:
# Tfidf outfit compiler
def outfit_compiler_tfidf(text):
    """Print the outfits for the product named in ``text``, with a fuzzy fallback.

    The product ID is taken from the first parenthesised group in ``text`` and
    compared with every ``product_id`` in the module-level ``outfits``
    DataFrame via TF-IDF vectors and cosine similarity. If some row has a
    similarity of 1, each outfit containing that product is printed: a numbered
    header, the matched product, and the outfit's other pieces, all formatted
    as ``'<outfit_item_type>: <brand> <product_full_name> (<product_id>)'``.
    Otherwise (no exact match, or no parenthesised ID in ``text``) the call is
    delegated to ``outfit_compiler_fuzzy(text)``.

    Parameters
    ----------
    text : str
        Product description, normally containing ``(<product_id>)``.
    """
    # pull product_id from text
    id_match = re.search(r'\(([^)]+)', text)
    if id_match is None:
        # nothing to match exactly; let the fuzzy matcher deal with the raw text
        outfit_compiler_fuzzy(text)
        return
    product_id = id_match.group(1)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(outfits['product_id'])
    Q = vectorizer.transform([product_id])  # vectorize the query
    similarity = cosine_similarity(X, Q).ravel()

    # Rows whose similarity is 1 are direct matches. isclose guards against
    # floating-point round-off that an exact `== 1` comparison would miss.
    exact_rows = np.flatnonzero(np.isclose(similarity, 1.0))
    if exact_rows.size == 0:
        # covers both "no overlap at all" and "partial overlap but no exact row"
        outfit_compiler_fuzzy(text)
        return

    found_row = outfits.iloc[exact_rows[0]]
    found_product = found_row['product_id']
    # the matched product in the same format as the input
    found_product_desc = (
        f"{found_row['outfit_item_type']}: {found_row['brand']} "
        f"{found_row['product_full_name']} ({found_product})"
    )

    # one outfit id per row in which the matched product appears
    outfit_ids = outfits.loc[outfits['product_id'] == found_product, 'outfit_id']
    piece_columns = ['outfit_item_type', 'brand', 'product_full_name', 'product_id']

    for outfit_num, outfit_id in enumerate(outfit_ids, start=1):
        # filter the catalogue once per outfit instead of once per printed field
        pieces = outfits[outfits['outfit_id'] == outfit_id]
        print('Outfit Recommendation', outfit_num)
        print(found_product_desc)  # matched product is shown first
        others = pieces.loc[pieces['product_id'] != found_product, piece_columns]
        for item_type, brand, name, pid in others.itertuples(index=False, name=None):
            print(f'{item_type}: {brand} {name} ({pid})')
        print('\n')
    

# Test the outfit compilers

In [43]:
test_wrong = 'something random that maybe I would want (23049nnvlsjerou293094kljd)'

In [44]:
test1 = 'bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)'

In [45]:
test2 = 'shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)'

In [46]:
outfit_compiler_tfidf(test_wrong)

Product ID not found--did you mean accessory2: Loewe Woven Fringe Open-Knit Wool Cardigan (01DVRYN4B93DG09MF4YS8P5PSJ) ?

Outfit Recommendation 1
accessory2: Loewe Woven Fringe Open-Knit Wool Cardigan (01DVRYN4B93DG09MF4YS8P5PSJ)
accessory1: BY FAR Jamie Mini Duchesse-Satin and Croc-Effect Leather Shoulder Bag (01DT2DA9XY52F7XQH552ZJW43K)
onepiece: Ulla Johnson Rosalie Tiered Cotton-Silk Blend Dress (01DVRYJHEHWV94XKY2F9HFE601)
shoe: Prada Mary Jane Mules (01DVS0C77R86TSCW59G1KNXP72)


Outfit Recommendation 2
accessory2: Loewe Woven Fringe Open-Knit Wool Cardigan (01DVRYN4B93DG09MF4YS8P5PSJ)
accessory1: BY FAR Jamie Mini Duchesse-Satin and Croc-Effect Leather Shoulder Bag (01DT2DA9XY52F7XQH552ZJW43K)
shoe: Jimmy Choo Romy 100 suede pumps (01DVA80YRD3A05GQDNZ93TVVVX)
onepiece: Ulla Johnson Rosalie Tiered Cotton-Silk Blend Dress (01DVRYJHEHWV94XKY2F9HFE601)


Outfit Recommendation 3
accessory2: Loewe Woven Fringe Open-Knit Wool Cardigan (01DVRYN4B93DG09MF4YS8P5PSJ)
accessory1: BY FAR Jam

In [47]:
outfit_compiler_tfidf(test1)

Outfit Recommendation 1
bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Eileen Fisher Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory1: kate spade new york medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


Outfit Recommendation 2
bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Eileen Fisher Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
accessory1: Nina Crystal Clutch (01DMHCNT41E14QWP503V7CT9G6)




In [48]:
outfit_compiler_tfidf(test2)

Outfit Recommendation 1
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Eileen Fisher Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory1: kate spade new york medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)


Outfit Recommendation 2
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
bottom: Eileen Fisher Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Eileen Fisher Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory1: Nina Crystal Clutch (01DMHCNT41E14QWP503V7CT9G6)


Outfit Recommendation 3
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
onepiece: Equipment Chemelle Midi Dress (01DMBRYVA2Q2ST7MNYR6EEY4TK)
accessory1: Nina Crystal Clutch (01DMHCNT41E14QWP503V7CT9G6)


Outfit Recommendation 4
shoe: Tory Burch Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
onepiece: Equipment Chemelle Midi Dress (01DMBRYVA2Q2ST7MNY