In [1]:
from gensim import corpora, models
import pandas as pd
import numpy as np
import sys
from collections import Counter
import itertools
from lsi_tagger.model import TagExtractor

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the product catalogue and keep only rows that have a value in every column.
df = pd.read_csv('data/h&m_kaggle_products.csv').dropna(axis=0, how='any')
print(df.shape)
df.head(2)

(105126, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [3]:
# Each distinct product description is used as one training document.
documents = df['detail_desc'].drop_duplicates().to_numpy()

# NOTE(review): these keys are passed through as `bigram_kwargs`; presumably a PMI
# threshold and a minimum bigram frequency — confirm against TagExtractor.
bigram_settings = {'bigrams_pmi_min_value': 1, 'bigrams_min_freq': 200}

te = TagExtractor(
    word_count_min=5,
    word_length_min=2,
    num_lsi_topics=200,
    bigram_kwargs=bigram_settings,
)
te.fit(documents)

Preprocessing text: 100%|█████████████████████████████████████████████████████| 43404/43404 [00:01<00:00, 40663.94it/s]
Filtering tokens: 100%|████████████████████████████████████████████████████████| 43404/43404 [00:05<00:00, 8473.37it/s]
Making bigrams: 43404it [00:00, 109070.50it/s]
Filtering by word count: 100%|███████████████████████████████████████████████| 43404/43404 [00:00<00:00, 140194.50it/s]
2022-04-16 11:20:09 INFO     Training TF-IDF...
2022-04-16 11:20:09 INFO     adding document #0 to Dictionary(0 unique tokens: [])
2022-04-16 11:20:09 INFO     adding document #10000 to Dictionary(1635 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-16 11:20:09 INFO     adding document #20000 to Dictionary(1773 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-16 11:20:10 INFO     adding document #30000 to Dictionary(1863 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder'

In [4]:
# Number of entries in the fitted extractor's bigram dictionary.
len(te.tc.bigrams_dict)

200

In [5]:
# Smoke test: tag one input description against a handful of candidates, including
# an exact duplicate of the input and a nonsense string.
demo_candidates = [
    'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
    'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
    'Jersey top with narrow shoulder straps.',
    'uygnhjkl',
]

input_tags, candidate_tags = te.transform(
    input_document='Jersey top with narrow shoulder straps.',
    candidate_documents=demo_candidates,
    n_input_tags=10,
    n_candidate_tags=10,
)

input_tags, candidate_tags

([('jersey top', 1),
  ('narrow shoulder', 1),
  ('shoulder', 1),
  ('shoulder straps', 1),
  ('straps', 1),
  ('narrow', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'jersey top': 0.8234521965577675,
   'narrow shoulder': 0.7826849124903947,
   'shoulder': 0.6071230138053313,
   'shoulder straps': 0.606222931645604,
   'straps': 0.5936206410229897,
   'narrow': 0.5876212913989638,
   'top': 0.40098475426395075,
   'jersey': 0.353774562951922},
  {}])

In [6]:
# Ask the extractor to order the candidates for the selected tag; the result is a
# list of indices into `candidate_tags`.
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [7]:
# Persistence round trip: save the fitted extractor, replace it with a fresh
# instance, and load the saved state back into that instance.
# NOTE(review): save()/load() are called without arguments, so the storage
# location is whatever TagExtractor uses by default — confirm.
te.save()
te = TagExtractor()
te.load()

In [8]:
# Repeat the tagging smoke test with the current `te` so the output can be
# compared with the earlier run on the same inputs.
reloaded_candidates = [
    'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
    'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
    'Jersey top with narrow shoulder straps.',
    'uygnhjkl',
]

input_tags, candidate_tags = te.transform(
    input_document='Jersey top with narrow shoulder straps.',
    candidate_documents=reloaded_candidates,
    n_input_tags=10,
    n_candidate_tags=10,
)

input_tags, candidate_tags

([('jersey top', 1),
  ('narrow shoulder', 1),
  ('shoulder', 1),
  ('shoulder straps', 1),
  ('straps', 1),
  ('narrow', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'jersey top': 0.8234521965577675,
   'narrow shoulder': 0.7826849124903947,
   'shoulder': 0.6071230138053313,
   'shoulder straps': 0.606222931645604,
   'straps': 0.5936206410229897,
   'narrow': 0.5876212913989638,
   'top': 0.40098475426395075,
   'jersey': 0.353774562951922},
  {}])

In [9]:
# Repeat the ranking call with the current `te`; the result is a list of indices
# into `candidate_tags`.
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [10]:
from tqdm import tqdm

class ContentRecommender:
    """Content-based recommender built on a fitted tag extractor's LSI space.

    Documents are compared by cosine similarity of their LSI vectors. For every
    query the recommender also reports the extractor's tags for the input and
    for each candidate, plus "adjacent" tags: vocabulary terms closest to the
    input tags that do not textually overlap with them.

    The extractor and the document collection are captured when the instance is
    created, so rebinding the notebook-level ``te`` / ``documents`` afterwards
    cannot desynchronise them from the precomputed vectors.
    """

    def __init__(self, tag_extractor=None, corpus_documents=None):
        """Precompute unit-length LSI vectors for all documents and terms.

        Args:
            tag_extractor: Fitted extractor providing ``corpus_lsi``,
                ``lsi_topic_matrix``, ``num_lsi_topics``, ``dictionary``,
                ``tfidf``, ``lsi_model``, ``transform``, ``rank`` and
                ``_get_vector_representations``. Defaults to the notebook-level
                ``te`` (the original behaviour).
            corpus_documents: Indexable collection of document texts. It must be
                index-aligned with ``tag_extractor.corpus_lsi`` because row ``i``
                of the similarity scores is reported as document ``i``.
                Defaults to the notebook-level ``documents``.
        """
        # The globals are only looked up when no explicit object is supplied.
        self._te = te if tag_extractor is None else tag_extractor
        self._documents = documents if corpus_documents is None else corpus_documents

        lsi_vecs = np.array([self._lsi_corpus2vec(cl)
                             for cl in tqdm(self._te.corpus_lsi, desc='Creating LSI Vectors')])
        self.lsi_vecs_normed = self._normalize_rows(lsi_vecs)
        # The transposed topic matrix is used with one row per dictionary term
        # (its row index is later passed to ``dictionary[...]``).
        self.lsi_topic_matrix_T_normed = self._normalize_rows(self._te.lsi_topic_matrix.T)

    @staticmethod
    def _normalize_rows(matrix):
        """Scale each row of a 2-D array to unit L2 norm.

        All-zero rows are returned as zeros instead of NaN. This matters for
        ranking: NaN similarities sort last in ``np.argsort`` and would therefore
        come first once the order is reversed.
        """
        matrix = np.asarray(matrix, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return np.divide(matrix, norms, out=np.zeros_like(matrix), where=norms > 0)

    def _lsi_corpus2vec(self, lsi_corpus_vec):
        """Convert sparse ``(topic_index, value)`` pairs into a dense vector.

        Returns:
            Array of length ``num_lsi_topics``; topics not listed are 0.
        """
        vec = np.zeros(self._te.num_lsi_topics)
        for ind, value in lsi_corpus_vec:
            vec[ind] = value
        return vec

    def _adjacent_tags(self, input_tags):
        """Return up to ``self.n_adjacent_tags`` terms closest to the input tags.

        The tags are projected into LSI space as a single bag of words and
        compared with every dictionary term. Terms that contain, or are
        contained in, any input tag are skipped so that only new vocabulary is
        suggested.

        Args:
            input_tags: List of tag strings.

        Returns:
            List of tag strings, most similar first. Empty when no tags are
            requested or when the input tags have no LSI representation.
        """
        if self.n_adjacent_tags <= 0:
            return []

        bow = self._te.dictionary.doc2bow(input_tags)
        bow_vec = self._lsi_corpus2vec(self._te.lsi_model[self._te.tfidf[bow]])
        norm = np.linalg.norm(bow_vec)
        if norm == 0:
            # A zero vector has no direction, so there is nothing to compare.
            return []

        sims = (bow_vec / norm).dot(self.lsi_topic_matrix_T_normed.T)

        adj_tags = []
        for t in np.argsort(sims)[::-1]:
            tag = self._te.dictionary[t]
            if any((input_tag in tag) or (tag in input_tag) for input_tag in input_tags):
                continue
            adj_tags.append(tag)
            if len(adj_tags) == self.n_adjacent_tags:
                break
        return adj_tags

    def _show(self, candidate_documents, candidate_tags):
        """Print the current query and at most ``self.show_n_recs`` candidates.

        Args:
            candidate_documents: Sliceable sequence of candidate texts, best first.
            candidate_tags: Tag dictionaries aligned with ``candidate_documents``.

        Note:
            Uses ``display``, which IPython/Jupyter injects into the namespace.
        """
        print("#########################")
        print("######### INPUTS ########")
        print("#########################\n")
        print(f"Input Document: {self.input_document}")
        print(f"\nAdjacent Tags: {self.input_adjacent_tags}")
        print(f"\nInput Tags: {self.input_tags}")

        print("\n#########################")
        print("#### RECOMMENDATIONS ####")
        print("#########################\n")
        # Slicing (instead of a `break` on equality) also handles show_n_recs == 0.
        n_shown = max(self.show_n_recs, 0)
        for n, candidate_doc in enumerate(candidate_documents[:n_shown]):
            print(f"{n+1}. {candidate_doc}")
            display(candidate_tags[n])
            print('-'*150)

    def _get_content_recommendations(self, input_document):
        """Retrieve the nearest documents and compute tags for the query.

        Sets ``candidate_documents``, ``input_tags``, ``candidate_tags`` and
        ``input_adjacent_tags`` on the instance.

        Args:
            input_document: Query text, either a corpus document or free text.
        """
        _, input_corpus_lsi = self._te._get_vector_representations(input_document)
        input_lsi_vec = self._lsi_corpus2vec(input_corpus_lsi)
        norm = np.linalg.norm(input_lsi_vec)

        if norm == 0:
            # The query has no LSI representation, so there is no similarity
            # signal. Return nothing rather than NaN-ordered arbitrary documents.
            self.candidate_documents = []
            self.input_tags, self.candidate_tags = [], []
        else:
            sims = (input_lsi_vec / norm).dot(self.lsi_vecs_normed.T)
            # One extra slot is fetched because the query itself may be in the corpus...
            top_inds = np.argsort(sims)[::-1][:(self.internal_n_recs + 1)]
            matches = [self._documents[ti] for ti in top_inds
                       if self._documents[ti] != input_document]
            # ...and trimmed again so free-text queries do not return one too many.
            self.candidate_documents = matches[:self.internal_n_recs]
            self.input_tags, self.candidate_tags = self._te.transform(
                input_document=input_document,
                candidate_documents=self.candidate_documents,
                n_input_tags=self.n_input_tags,
                n_candidate_tags=self.n_candidate_tags)

        self.input_adjacent_tags = self._adjacent_tags(input_tags=[t[0] for t in self.input_tags])

    def initial_recommendations(self, input_document,
                                internal_n_recs=50, show_n_recs=5,
                                n_input_tags=10, n_candidate_tags=5,
                                n_adjacent_tags=10):
        """Run a fresh query and print the top recommendations.

        Args:
            input_document: Query text (corpus document or free text).
            internal_n_recs: Number of nearest documents kept for re-ranking.
            show_n_recs: Number of recommendations printed.
            n_input_tags: Passed to the extractor's ``transform`` for the input.
            n_candidate_tags: Passed to ``transform`` for each candidate.
            n_adjacent_tags: Maximum number of adjacent tags to report.
        """
        # Explicit assignments instead of `self.__dict__.update(locals())`, which
        # also stored a `self.self` reference cycle.
        self.input_document = input_document
        self.internal_n_recs = internal_n_recs
        self.show_n_recs = show_n_recs
        self.n_input_tags = n_input_tags
        self.n_candidate_tags = n_candidate_tags
        self.n_adjacent_tags = n_adjacent_tags

        self._get_content_recommendations(self.input_document)
        self._show(self.candidate_documents, self.candidate_tags)

    def rerank(self, selected_tags):
        """Re-order the candidates of the last query for the selected tags and print them.

        Args:
            selected_tags: List of tag strings passed to the extractor's ``rank``.
        """
        ranking = self._te.rank(candidate_tags=self.candidate_tags,
                                selected_tags=selected_tags)
        # Plain indexing keeps the original str/dict objects and works for any
        # number of candidates.
        candidate_documents = [self.candidate_documents[i] for i in ranking]
        candidate_tags = [self.candidate_tags[i] for i in ranking]
        self._show(candidate_documents, candidate_tags)

In [11]:
# Build the recommender; construction converts every entry of te.corpus_lsi into
# a dense vector, which is what the progress bar below tracks.
recommender = ContentRecommender()

Creating LSI Vectors: 100%|████████████████████████████████████████████████████| 43404/43404 [00:07<00:00, 5875.28it/s]


In [12]:
# Use an existing product description as the input, to find similar products.
# The nearest documents are retrieved internally (internal_n_recs) and the
# first show_n_recs of them are printed.
recommender.initial_recommendations(input_document=documents[10000], 
                                    internal_n_recs=50, show_n_recs=5,
                                    n_input_tags=10, n_candidate_tags=5, 
                                    n_adjacent_tags=10)

#########################
######### INPUTS ########
#########################

Input Document: Straight-cut, linen-blend jacket in a patterned weave with a shawl collar, padded shoulders and long sleeves with decorative buttons at the cuffs. Buttons at the front and front pockets. Lined.

Adjacent Tags: ['two-button', 'single-breasted', 'lapel', 'herringbone', 'stitching', 'taper', 'buttonhole', 'peak', 'narrower', 'resort']

Input Tags: [('buttons', 49), ('decorative', 45), ('jacket', 45), ('decorative buttons', 42), ('straight-cut', 26), ('front', 19), ('front pockets', 11), ('long sleeves', 6), ('patterned', 3), ('weave', 2)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Straight-cut, double-breasted jacket in a patterned weave with notch lapels and covered buttons at the front. Long sleeves with decorative buttons at the cuffs and welt front pockets with a flap. Lined.


{'buttons': 0.6040084576442799,
 'straight-cut': 0.5165327590197821,
 'decorative': 0.45251703358402384,
 'jacket': 0.3992945718140283,
 'decorative buttons': 0.37273231138306073}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Single-breasted jacket in a viscose weave with peak lapels, jetted front pockets and decorative buttons at the cuffs. Lined.


{'buttons': 0.5159429351454702,
 'decorative': 0.4890461578310454,
 'jacket': 0.4098731304478827,
 'decorative buttons': 0.37317545183368306,
 'front': 0.30614840658358006}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Straight-cut jacket in a patterned linen and viscose weave with notch lapels, buttons at the front, welt front pockets and decorative buttons at the cuffs. Partly lined.


{'buttons': 0.6033670506258286,
 'straight-cut': 0.5207388398787527,
 'decorative': 0.4616967960328113,
 'jacket': 0.42845213258734566,
 'decorative buttons': 0.38323580399025403}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Straight-cut, double-breasted jacket in woven fabric with peak lapels, welt front pockets and decorative buttons at the cuffs. Lined.


{'straight-cut': 0.5401074499530998,
 'buttons': 0.5027620009613111,
 'decorative': 0.47169966798603746,
 'jacket': 0.4029527881709758,
 'decorative buttons': 0.36343541787636996}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Double-breasted jacket in a striped weave with peak lapels, jetted front pockets and decorative buttons at the cuffs. Lined.


{'buttons': 0.49783490212270237,
 'decorative': 0.4759950236261371,
 'jacket': 0.3891374730972165,
 'decorative buttons': 0.3489875038498884,
 'front': 0.29364625298233726}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [13]:
# Re-rank product recommendations with emphasis on a few input tags.
# rerank() reuses the candidates stored by the most recent
# initial_recommendations() call, so it must run after that call.
recommender.rerank(selected_tags=['patterned'])

#########################
######### INPUTS ########
#########################

Input Document: Straight-cut, linen-blend jacket in a patterned weave with a shawl collar, padded shoulders and long sleeves with decorative buttons at the cuffs. Buttons at the front and front pockets. Lined.

Adjacent Tags: ['two-button', 'single-breasted', 'lapel', 'herringbone', 'stitching', 'taper', 'buttonhole', 'peak', 'narrower', 'resort']

Input Tags: [('buttons', 49), ('decorative', 45), ('jacket', 45), ('decorative buttons', 42), ('straight-cut', 26), ('front', 19), ('front pockets', 11), ('long sleeves', 6), ('patterned', 3), ('weave', 2)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Straight-cut shirt in a soft, patterned viscose weave with a collar and buttons down the front. Long sleeves with narrow, buttoned cuffs, and a rounded hem.


{'straight-cut': 0.6369120049209104,
 'buttons': 0.524776461659095,
 'patterned': 0.46429118032507855,
 'long sleeves': 0.36747888514634175,
 'weave': 0.3384893294060679}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Short, straight-cut coat in a patterned weave with a zip down the front, long sleeves and zipped front pockets with a fake flap. Lined.


{'straight-cut': 0.6199036373391155,
 'patterned': 0.42808317578043054,
 'front pockets': 0.32558433956226557,
 'long sleeves': 0.3208542024336082,
 'front': 0.3136423010228962}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Jacket in a linen and cotton weave with notch lapels, buttons at the front, a chest pocket and flap front pockets. Decorative buttons at the cuffs and a single back vent. Patterned lining.


{'buttons': 0.5984048713340762,
 'decorative': 0.4568742921278206,
 'decorative buttons': 0.40165018632983246,
 'patterned': 0.3404270331869367,
 'jacket': 0.3392924495797817}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Two-button jacket in a herringbone linen and cotton weave with a decorative chest pocket, flap front pockets, decorative buttons at the cuffs and a single back vent. Partly lined.


{'decorative': 0.5460576502248433,
 'buttons': 0.47909871104270363,
 'decorative buttons': 0.3947374169768017,
 'jacket': 0.30440142486444155,
 'front pockets': 0.27640206630344405}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Jacket in a viscose weave with a sheen. Shawl collar, covered buttons at the front and long sleeves with wide cuffs. Detachable belt at the waist, and decorative piping at the front and cuffs. Unlined.


{'buttons': 0.4748476734428302,
 'decorative': 0.40745794844088895,
 'jacket': 0.3321357038470315,
 'front': 0.281336075756626,
 'long sleeves': 0.27738396346991795}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [14]:
# Use free-text to find products that match the description best.
# The query does not have to be a document from the corpus.
recommender.initial_recommendations(input_document='leather gloves', 
                                    internal_n_recs=50, show_n_recs=5,
                                    n_input_tags=10, n_candidate_tags=5, 
                                    n_adjacent_tags=10)

#########################
######### INPUTS ########
#########################

Input Document: leather gloves

Adjacent Tags: ['fingerless', 'uppers', 'separately', 'touchscreen-compatible', 'index', 'finger', 'reveal', 'thumb', 'palms', 'material']

Input Tags: [('leather', 41), ('gloves', 26)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Gloves in soft, supple leather. Lined.


{'leather': 0.2697697269627539, 'gloves': 0.16263032272489955}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Gloves in soft, supple imitation leather. Lined.


{'leather': 0.407979670937061, 'gloves': 0.15655984554677976}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Knitted gloves with imitation leather uppers and palms, and foldover cuffs. Fleece lining.


{'leather': 0.3269884723326767, 'gloves': 0.19394106644352957}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Gloves in soft leather with a wool-blend lining.


{'leather': 0.2569530068776335, 'gloves': 0.1676026789596508}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Gloves in soft, supple leather with a zip at the top. Lined.


{'leather': 0.26862292643613206, 'gloves': 0.16104456185196314}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [15]:
# Use adjacent tags to make a different query: 'touchscreen-compatible' was one
# of the adjacent tags suggested for the 'leather gloves' query.
recommender.initial_recommendations(input_document='touchscreen-compatible leather gloves', 
                                    internal_n_recs=50, show_n_recs=5,
                                    n_input_tags=10, n_candidate_tags=5, 
                                    n_adjacent_tags=10)

#########################
######### INPUTS ########
#########################

Input Document: touchscreen-compatible leather gloves

Adjacent Tags: ['fingerless', 'uppers', 'index', 'separately', 'finger', 'thumb', 'reveal', 'palms', 'material', 'wrists']

Input Tags: [('leather', 36), ('gloves', 31), ('touchscreen-compatible', 4)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Gloves in soft, supple leather. Lined.


{'leather': 0.2108169906886095, 'gloves': 0.13305906973152337}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Knitted gloves with imitation leather uppers and palms, and foldover cuffs. Fleece lining.


{'leather': 0.26803573605853226, 'gloves': 0.1643698134501534}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Gloves in soft, supple imitation leather. Lined.


{'leather': 0.3490269346629167, 'gloves': 0.12698859255340358}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Gloves in soft leather with a wool-blend lining.


{'leather': 0.19800027060348913, 'gloves': 0.13803142596627463}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Gloves in soft, supple leather with a zip at the top. Lined.


{'leather': 0.20967019016198774, 'gloves': 0.13147330885858696}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [16]:
# Re-rank product recommendations based on an input tag.
# This operates on the candidates stored by the preceding query.
recommender.rerank(selected_tags=['touchscreen-compatible'])

#########################
######### INPUTS ########
#########################

Input Document: touchscreen-compatible leather gloves

Adjacent Tags: ['fingerless', 'uppers', 'index', 'separately', 'finger', 'thumb', 'reveal', 'palms', 'material', 'wrists']

Input Tags: [('leather', 36), ('gloves', 31), ('touchscreen-compatible', 4)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Water-repellent, softshell gloves with a soft fleece lining and reinforced palms in imitation leather. Elastication at the wrists, reflective details and a plastic hook-and-eye fastening to keep the gloves together. The tops of the thumb and index finger are made from touchscreen-compatible material.


{'leather': 0.195319265840494,
 'gloves': 0.1487566578707491,
 'touchscreen-compatible': 0.01263168234071435}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Leather gloves with the tops of the thumb and index finger in touchscreen-compatible material. Lined.


{'leather': 0.15712906056563583,
 'gloves': 0.11228477478436617,
 'touchscreen-compatible': 0.01070671719353608}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Leather gloves. The tops of the thumb, index finger and middle finger have been treated to make them touchscreen-compatible. Lined.


{'leather': 0.14667848273180378,
 'gloves': 0.10407852766372036,
 'touchscreen-compatible': 0.00958206773754007}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Fine-knit gloves with touchscreen-compatible material on the thumb, index and middle finger.


{'gloves': 0.10867224682309784, 'touchscreen-compatible': 0.007508300795305566}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Gloves in soft, supple leather. Lined.


{'leather': 0.2108169906886095, 'gloves': 0.13305906973152337}

------------------------------------------------------------------------------------------------------------------------------------------------------
