In [1]:
from gensim import corpora, models
import pandas as pd
import numpy as np
import sys
from collections import Counter
import itertools
from lsi_tagger.model import TagExtractor

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the product catalogue and keep only rows with no missing value in any column.
df = pd.read_csv('data/h&m_kaggle_products.csv').dropna()
print(df.shape)
df.head(2)

(105126, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [3]:
# Train on each distinct product description exactly once.
documents = df['detail_desc'].drop_duplicates().values

te = TagExtractor(
    word_count_min=2,
    word_length_min=2,
    num_lsi_topics=200,
    bigram_kwargs={'bigrams_pmi_min_value': 1, 'bigrams_min_freq': 200},
)
te.fit(documents)

Preprocessing text: 100%|█████████████████████████████████████████████████████| 43404/43404 [00:03<00:00, 12068.38it/s]
Filtering tokens: 100%|████████████████████████████████████████████████████████| 43404/43404 [00:15<00:00, 2851.35it/s]
Making bigrams: 43404it [00:01, 38158.96it/s]
Filtering by word count: 100%|████████████████████████████████████████████████| 43404/43404 [00:00<00:00, 56310.01it/s]
2022-04-10 13:11:50 INFO     Training TF-IDF...
2022-04-10 13:11:50 INFO     adding document #0 to Dictionary(0 unique tokens: [])
2022-04-10 13:11:51 INFO     adding document #10000 to Dictionary(1921 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-10 13:11:51 INFO     adding document #20000 to Dictionary(2174 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-10 13:11:52 INFO     adding document #30000 to Dictionary(2348 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']

In [4]:
# Number of entries in the fitted extractor's bigram dictionary.
len(te.tc.bigrams_dict)

203

In [5]:
# Extract tags for one input description and score four candidates against it
# (two unrelated products, an exact copy of the input, and a nonsense string).
input_tags, candidate_tags = te.transform(
    input_document='Jersey top with narrow shoulder straps.',
    candidate_documents=[
        'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
        'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
        'Jersey top with narrow shoulder straps.',
        'uygnhjkl',
    ],
    n_input_tags=10,
    n_candidate_tags=10,
)

input_tags, candidate_tags

([('narrow shoulder', 1),
  ('jersey top', 1),
  ('narrow', 1),
  ('shoulder', 1),
  ('shoulder straps', 1),
  ('straps', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'narrow shoulder': 0.7617086871211831,
   'jersey top': 0.717147241740793,
   'narrow': 0.6157702550842733,
   'shoulder': 0.614215284901219,
   'shoulder straps': 0.5938417396995828,
   'straps': 0.5739733121003467,
   'top': 0.4158298945549237,
   'jersey': 0.32933640613636406},
  {}])

In [6]:
# Re-order the candidates with 'jersey' as the selected tag; the result is used as a
# list of positions into `candidate_tags` (best match first).
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [7]:
# Persistence round-trip: save the fitted extractor, then rebind `te` to a fresh
# instance and restore it via load(). No path is passed to either call, so both rely
# on the library's default location (NOTE(review): confirm where that is).
te.save()
te = TagExtractor()
te.load()

In [8]:
# Repeat the transform demo on the reloaded extractor to check that a saved model
# still produces tags for the same input and candidates.
input_tags, candidate_tags = te.transform(
    input_document='Jersey top with narrow shoulder straps.',
    candidate_documents=[
        'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
        'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
        'Jersey top with narrow shoulder straps.',
        'uygnhjkl',
    ],
    n_input_tags=10,
    n_candidate_tags=10,
)

input_tags, candidate_tags

([('jersey top', 1),
  ('narrow shoulder', 1),
  ('shoulder straps', 1),
  ('straps', 1),
  ('narrow', 1),
  ('shoulder', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'jersey top': 0.8772317763823081,
   'narrow shoulder': 0.7785568909779876,
   'shoulder straps': 0.6355535130824466,
   'straps': 0.6059983392044233,
   'narrow': 0.6053735593033779,
   'shoulder': 0.5555633523654915,
   'top': 0.36434324889827213,
   'jersey': 0.35461624727365776},
  {}])

In [9]:
# Same ranking call as before, now against the reloaded extractor and the
# `candidate_tags` produced by the cell above.
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [98]:
from tqdm import tqdm

class ContentRecommender:
    """Content-based recommender working in the LSI space of a fitted tag extractor.

    Documents are compared by cosine similarity of their dense LSI vectors. The
    extractor is then used to attach tags to the input and to each candidate, and
    candidates can be re-ranked around a set of selected tags.

    The class needs ``tqdm`` in scope (imported in the same notebook cell) and
    IPython's ``display`` for rendering the candidate tag dictionaries.
    """

    def __init__(self, tag_extractor=None, corpus_documents=None):
        """Pre-compute the normalised matrices used for similarity look-ups.

        Args:
            tag_extractor: fitted extractor exposing ``corpus_lsi``,
                ``lsi_topic_matrix``, ``num_lsi_topics``, ``dictionary``, ``tfidf``,
                ``lsi_model``, ``transform``, ``rank`` and
                ``_get_vector_representations``. Defaults to the notebook global ``te``.
            corpus_documents: indexable collection of the documents the extractor was
                fitted on. Defaults to the notebook global ``documents``.
                NOTE(review): row ``i`` of ``corpus_lsi`` is assumed to describe
                ``corpus_documents[i]`` - confirm against the extractor.
        """
        # Capture the collaborators once so the pre-computed matrices and the models
        # used at query time always come from the same extractor, even if the notebook
        # later rebinds the global names.
        self.te = te if tag_extractor is None else tag_extractor
        self.documents = documents if corpus_documents is None else corpus_documents

        lsi_vecs = np.array([
            self._lsi_corpus2vec(cl)
            for cl in tqdm(self.te.corpus_lsi, desc='Creating LSI Vectors')
        ])
        self.lsi_vecs_normed = self._l2_normalize(lsi_vecs)
        self.lsi_topic_matrix_T_normed = self._l2_normalize(self.te.lsi_topic_matrix.T)

    @staticmethod
    def _l2_normalize(arr):
        """Scale vectors along the last axis to unit L2 length.

        All-zero vectors are returned unchanged instead of being divided by zero.
        A NaN similarity would otherwise sort to the front of
        ``np.argsort(sims)[::-1]`` and surface as the best match.
        """
        arr = np.asarray(arr, dtype=float)
        norms = np.linalg.norm(arr, axis=-1, keepdims=True)
        return arr / np.where(norms == 0, 1.0, norms)

    def _lsi_corpus2vec(self, lsi_corpus_vec):
        """Expand sparse ``(topic_index, value)`` pairs into a dense topic vector."""
        vec = np.zeros(self.te.num_lsi_topics)
        for ind, value in lsi_corpus_vec:
            vec[ind] = value
        return vec

    def _adjacent_tags(self, input_tags):
        """Return up to ``self.n_adjacent_tags`` dictionary tags closest to the input tags.

        Tags that contain, or are contained in, any input tag are skipped so the
        result only suggests new vocabulary. Returns an empty list when no tags are
        requested or when none of the input tags maps to a non-zero LSI vector.
        """
        if self.n_adjacent_tags <= 0:
            return []

        bow = self.te.dictionary.doc2bow(input_tags)
        bow_vec = self._lsi_corpus2vec(self.te.lsi_model[self.te.tfidf[bow]])
        if not bow_vec.any():
            # Nothing known about the input: every similarity would be meaningless.
            return []

        sims = self._l2_normalize(bow_vec).dot(self.lsi_topic_matrix_T_normed.T)

        adj_tags = []
        for t in np.argsort(sims)[::-1]:
            tag = self.te.dictionary[t]
            overlaps_input = any(
                (input_tag in tag) or (tag in input_tag) for input_tag in input_tags
            )
            if overlaps_input:
                continue
            adj_tags.append(tag)
            if len(adj_tags) == self.n_adjacent_tags:
                break
        return adj_tags

    def _show(self, candidate_documents, candidate_tags):
        """Print the stored input summary followed by the first ``show_n_recs`` candidates."""
        print("#########################")
        print("######### INPUTS ########")
        print("#########################\n")
        print(f"Input Document: {self.input_document}")
        print(f"\nAdjacent Tags: {self.input_adjacent_tags}")
        print(f"\nInput Tags: {self.input_tags}")

        print("\n#########################")
        print("#### RECOMMENDATIONS ####")
        print("#########################\n")
        # Slice up front so that show_n_recs=0 shows nothing rather than everything.
        limit = max(self.show_n_recs, 0)
        shown = list(zip(candidate_documents, candidate_tags))[:limit]
        for position, (candidate_doc, tags) in enumerate(shown, start=1):
            print(f"{position}. {candidate_doc}")
            display(tags)
            print('-'*150)

    def _get_content_recommendations(self, input_document):
        """Find the nearest corpus documents and compute tags for input and candidates.

        Stores ``candidate_documents``, ``input_tags``, ``candidate_tags`` and
        ``input_adjacent_tags`` on the instance.
        """
        _, input_corpus_lsi = self.te._get_vector_representations(input_document)
        input_lsi_vec_normed = self._l2_normalize(self._lsi_corpus2vec(input_corpus_lsi))
        sims = input_lsi_vec_normed.dot(self.lsi_vecs_normed.T)

        # One extra neighbour is fetched because the input itself may be part of the
        # corpus and is dropped below. For free-text inputs nothing is dropped, so the
        # list is trimmed back to the requested size afterwards.
        top_inds = np.argsort(sims)[::-1][:(self.internal_n_recs + 1)]
        neighbours = [self.documents[ti] for ti in top_inds
                      if self.documents[ti] != input_document]
        self.candidate_documents = neighbours[:self.internal_n_recs]

        self.input_tags, self.candidate_tags = self.te.transform(
            input_document=input_document,
            candidate_documents=self.candidate_documents,
            n_input_tags=self.n_input_tags,
            n_candidate_tags=self.n_candidate_tags)
        self.input_adjacent_tags = self._adjacent_tags(
            input_tags=[t[0] for t in self.input_tags])

    def initial_recommendations(self, input_document,
                                internal_n_recs=50, show_n_recs=5,
                                n_input_tags=10, n_candidate_tags=5,
                                n_adjacent_tags=10):
        """Compute and print recommendations for ``input_document``.

        Args:
            input_document: a corpus document or free text to search with.
            internal_n_recs: number of nearest documents kept as candidates.
            show_n_recs: number of candidates printed.
            n_input_tags: forwarded to the extractor's ``transform``.
            n_candidate_tags: forwarded to the extractor's ``transform``.
            n_adjacent_tags: maximum number of adjacent tags suggested.

        The settings are stored on the instance so that ``rerank`` can reuse them.
        """
        # Explicit assignments instead of ``self.__dict__.update(locals())``, which
        # also stored a ``self.self`` reference cycle and hid the attribute set.
        self.input_document = input_document
        self.internal_n_recs = internal_n_recs
        self.show_n_recs = show_n_recs
        self.n_input_tags = n_input_tags
        self.n_candidate_tags = n_candidate_tags
        self.n_adjacent_tags = n_adjacent_tags

        self._get_content_recommendations(self.input_document)
        self._show(self.candidate_documents, self.candidate_tags)

    def rerank(self, selected_tags):
        """Re-order the stored candidates around ``selected_tags`` and print them.

        Requires a previous ``initial_recommendations`` call, which provides the
        candidates and display settings.
        """
        ranking = self.te.rank(candidate_tags=self.candidate_tags,
                               selected_tags=selected_tags)
        # Plain list indexing keeps the tag dictionaries as they are instead of
        # round-tripping them through an object ndarray.
        candidate_documents = [self.candidate_documents[i] for i in ranking]
        candidate_tags = [self.candidate_tags[i] for i in ranking]
        self._show(candidate_documents, candidate_tags)

In [99]:
# Instantiation turns every entry of te.corpus_lsi into a dense, normalised vector
# (progress bar below), so this cell must run after `te` has been fitted or loaded.
recommender = ContentRecommender()

Creating LSI Vectors: 100%|████████████████████████████████████████████████████| 43404/43404 [00:20<00:00, 2142.66it/s]


In [100]:
# Query with a description taken from the corpus to surface similar products.
recommender.initial_recommendations(
    input_document=documents[779],
    internal_n_recs=50,
    show_n_recs=5,
    n_input_tags=10,
    n_candidate_tags=5,
    n_adjacent_tags=10,
)

#########################
######### INPUTS ########
#########################

Input Document: 5-pocket jeans in washed stretch denim with a regular waist, zip fly and skinny legs.

Adjacent Tags: ['bootcut', 'ultra-flexible', 'keeps', 'technology', 'lycra®', 'super-generous', 'freefit®', 'unravelling', 'shuttle', 'looms']

Input Tags: [('regular', 50), ('regular waist', 50), ('jeans', 49), ('5-pocket', 46), ('stretch denim', 26), ('denim', 21), ('waist zip', 4), ('stretch', 4)]

#########################
#### RECOMMENDATIONS ####
#########################

1. 5-pocket jeans in washed stretch denim with a regular waist, zip fly and button, and skinny legs.


{'regular': 0.5911284319584238,
 'regular waist': 0.5893544921107612,
 'jeans': 0.5717551074038302,
 '5-pocket': 0.5294803168665573,
 'stretch denim': 0.4848088368012736}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. 5-pocket jeans in washed, stretch denim with a regular waist, zip fly and button, and skinny legs.


{'regular': 0.5911284319584238,
 'regular waist': 0.5893544921107612,
 'jeans': 0.5717551074038302,
 '5-pocket': 0.5294803168665573,
 'stretch denim': 0.4848088368012736}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. 5-pocket jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular': 0.6036233006829017,
 'regular waist': 0.6011137184658021,
 'jeans': 0.5610622652433217,
 '5-pocket': 0.508718900308461,
 'stretch denim': 0.47564454987207627}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. 5-pocket jeans in washed stretch denim with a regular waist, zip fly and button at the waist, and skinny legs.


{'regular': 0.59426701706062,
 'regular waist': 0.5914629477222434,
 'jeans': 0.5699366854049266,
 '5-pocket': 0.5286759432939807,
 'stretch denim': 0.48281298463507855}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. 5-pocket superskinny-fit jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular': 0.6039744413440691,
 'regular waist': 0.600292452634867,
 'jeans': 0.5485893394304046,
 '5-pocket': 0.49702116192905654,
 'stretch denim': 0.47541787473173436}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [101]:
# Re-rank product recommendations with emphasis on a few input tags.
# rerank() reuses the candidates stored by the most recent initial_recommendations()
# call, so the cell above must have been run first.
recommender.rerank(selected_tags=['stretch','skinny','jeans'])

#########################
######### INPUTS ########
#########################

Input Document: 5-pocket jeans in washed stretch denim with a regular waist, zip fly and skinny legs.

Adjacent Tags: ['bootcut', 'ultra-flexible', 'keeps', 'technology', 'lycra®', 'super-generous', 'freefit®', 'unravelling', 'shuttle', 'looms']

Input Tags: [('regular', 50), ('regular waist', 50), ('jeans', 49), ('5-pocket', 46), ('stretch denim', 26), ('denim', 21), ('waist zip', 4), ('stretch', 4)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Jeans in washed stretch denim with decorative beads on the front, a regular waist, zip fly and skinny legs.


{'regular': 0.5722032355972715,
 'regular waist': 0.5678917619811685,
 'jeans': 0.5281926092461704,
 'stretch denim': 0.4658813704099082,
 'stretch': 0.4497593337982089}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. 5-pocket jeans in stretch denim with a regular waist, zip fly and super-skinny legs.


{'regular': 0.5733190144248591,
 'regular waist': 0.5675452094093354,
 'jeans': 0.5143820964919901,
 '5-pocket': 0.4999301567741487,
 'stretch': 0.4396199262869022}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Jeans in stretch denim with a regular waist, zip fly and button, and skinny legs with studs down the sides.


{'regular': 0.5662564971551864,
 'regular waist': 0.563118354591619,
 'jeans': 0.5020184540309384,
 'stretch denim': 0.4498837090466628,
 'stretch': 0.4488447404906962}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Jeans in washed stretch denim with a regular waist, zip fly and button, front and back pockets and skinny legs with sewn-in creases.


{'regular': 0.5652099341769957,
 'regular waist': 0.5605775616157098,
 'jeans': 0.5079441527121181,
 'stretch denim': 0.4578720533635183,
 'stretch': 0.4338855148206045}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. 5-pocket jeans in washed, stretch denim with a regular waist, zip fly and button and skinny legs. The cotton content of the jeans is partly recycled.


{'jeans': 0.6219320166127076,
 'regular waist': 0.5386016133540117,
 'regular': 0.5373351869799562,
 '5-pocket': 0.5015702187384821,
 'denim': 0.45778842999833286}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [102]:
# Query with free text instead of a corpus document to find the best-matching products.
recommender.initial_recommendations(
    input_document='leather gloves',
    internal_n_recs=50,
    show_n_recs=5,
    n_input_tags=10,
    n_candidate_tags=5,
    n_adjacent_tags=10,
)

#########################
######### INPUTS ########
#########################

Input Document: leather gloves

Adjacent Tags: ['fingerless', 'uppers', 'separately', 'finger', 'index', 'touchscreen-compatible', 'thumb', 'reveal', 'wool-blend', 'palms']

Input Tags: [('leather', 42), ('gloves', 25)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Gloves in soft, supple leather. Lined.


{'leather': 0.27643131582752917, 'gloves': 0.1535748451330823}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Gloves in soft, supple imitation leather. Lined.


{'leather': 0.4145587981608296, 'gloves': 0.15045473642571913}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Knitted gloves with imitation leather uppers and palms, and foldover cuffs. Fleece lining.


{'leather': 0.33506722809435135, 'gloves': 0.1895911370179059}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Gloves in soft leather with a wool-blend lining.


{'leather': 0.2638160794248091, 'gloves': 0.1607620878808983}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Gloves in soft, supple leather with a zip at the top. Lined.


{'leather': 0.27478579408301984, 'gloves': 0.15253287116631659}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [103]:
# Build a new free-text query out of adjacent tags suggested by the previous search.
recommender.initial_recommendations(
    input_document='fingerless touchscreen-compatible gloves',
    internal_n_recs=50,
    show_n_recs=5,
    n_input_tags=10,
    n_candidate_tags=5,
    n_adjacent_tags=10,
)

#########################
######### INPUTS ########
#########################

Input Document: fingerless touchscreen-compatible gloves

Adjacent Tags: ['uppers', 'index', 'thumb', 'separately', 'reveal', 'palms', 'material', 'wool-blend', 'glow-in-the-dark', 'wrists']

Input Tags: [('gloves', 50), ('fingerless', 11), ('touchscreen-compatible', 10)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Fine-knit gloves with matching fingerless gloves. The fingerless gloves have a motif on the uppers. The gloves can be worn over each other or separately.


{'gloves': 0.13394297409377554, 'fingerless': 0.016118883459633644}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Fine-knit gloves with matching fingerless gloves. The gloves can be worn over each other or separately.


{'gloves': 0.1365587109422879, 'fingerless': 0.016366525541119073}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Fine-knit gloves with matching fingerless gloves. The fingerless gloves have a stud appliqué on the uppers. The gloves can be worn over each other or separately.


{'gloves': 0.1335222271922923, 'fingerless': 0.015931937857539546}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Fine-knit gloves with matching fingerless gloves with a glow-in-the-dark print. The gloves can be worn over each other or separately.


{'gloves': 0.13474248422901913, 'fingerless': 0.016193304042143232}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Fine-knit gloves with matching fingerless gloves. The fingerless gloves have a print motif on the uppers. The gloves can be worn over each other or separately.


{'gloves': 0.13585175539697783, 'fingerless': 0.016445591023873163}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [104]:
# Re-rank product recommendations based on an input tag.
# Operates on the candidates stored by the preceding initial_recommendations() call
# ('fingerless touchscreen-compatible gloves'); no new similarity search is run.
recommender.rerank(selected_tags=['touchscreen-compatible'])

#########################
######### INPUTS ########
#########################

Input Document: fingerless touchscreen-compatible gloves

Adjacent Tags: ['uppers', 'index', 'thumb', 'separately', 'reveal', 'palms', 'material', 'wool-blend', 'glow-in-the-dark', 'wrists']

Input Tags: [('gloves', 50), ('fingerless', 11), ('touchscreen-compatible', 10)]

#########################
#### RECOMMENDATIONS ####
#########################

1. Water-repellent, softshell gloves with a soft fleece lining and reinforced palms in imitation leather. Elastication at the wrists, reflective details and a plastic hook-and-eye fastening to keep the gloves together. The tops of the thumb and index finger are made from touchscreen-compatible material.


{'gloves': 0.1411970845547555, 'touchscreen-compatible': 0.011883351785491714}

------------------------------------------------------------------------------------------------------------------------------------------------------
2. Running set with a hat, gloves and tube scarf in thermal fleece. Gloves with a reflective print and touchscreen-compatible material at the tops of the thumb and index finger. Hat and tube scarf with reflective appliqués.


{'gloves': 0.13866283223719567, 'touchscreen-compatible': 0.011762339254019604}

------------------------------------------------------------------------------------------------------------------------------------------------------
3. Running gloves with uppers in windproof, functional fabric and ribbed cuffs. The tops of the thumb and index finger are made from touchscreen-compatible material.


{'gloves': 0.11653100956126772, 'touchscreen-compatible': 0.010451807623936002}

------------------------------------------------------------------------------------------------------------------------------------------------------
4. Windproof gloves in jersey and fleece with reinforced palms and an adjustable hook and loop tab at the wrists. The tops of the thumb and index finger are in touchscreen-compatible material. D-ring and hook to clip the gloves together.


{'gloves': 0.12437026346927627, 'touchscreen-compatible': 0.0103827449208123}

------------------------------------------------------------------------------------------------------------------------------------------------------
5. Leather gloves with the tops of the thumb and index finger in touchscreen-compatible material. Lined.


{'gloves': 0.10056264026440623, 'touchscreen-compatible': 0.009384742612469379}

------------------------------------------------------------------------------------------------------------------------------------------------------
