In [1]:
from gensim import corpora, models
import pandas as pd
import numpy as np
import sys
from collections import Counter
import itertools
from lsi_tagger.model import TagExtractor

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_csv('data/h&m_kaggle_products.csv')
df = df[~df.isnull().any(axis=1)]
print(df.shape)
df.head(2)

(105126, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [3]:
documents = df['detail_desc'].drop_duplicates().values

te = TagExtractor(word_count_min=2, word_length_min=2, num_lsi_topics=200, 
                  bigram_kwargs={'bigrams_pmi_min_value':1, 'bigrams_min_freq':200})
te.fit(documents)

Preprocessing text: 100%|█████████████████████████████████████████████████████| 43404/43404 [00:01<00:00, 40715.10it/s]
Filtering tokens: 100%|████████████████████████████████████████████████████████| 43404/43404 [00:05<00:00, 7967.22it/s]
Making bigrams: 43404it [00:00, 100072.27it/s]
Filtering by word count: 100%|███████████████████████████████████████████████| 43404/43404 [00:00<00:00, 130571.59it/s]
2022-04-10 10:15:06 INFO     Training TF-IDF...
2022-04-10 10:15:06 INFO     adding document #0 to Dictionary(0 unique tokens: [])
2022-04-10 10:15:06 INFO     adding document #10000 to Dictionary(1921 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-10 10:15:06 INFO     adding document #20000 to Dictionary(2174 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder']...)
2022-04-10 10:15:07 INFO     adding document #30000 to Dictionary(2348 unique tokens: ['jersey', 'jersey top', 'narrow', 'narrow shoulder', 'shoulder'

In [4]:
len(te.tc.bigrams_dict)

203

In [5]:
input_tags, candidate_tags = te.transform(input_document='Jersey top with narrow shoulder straps.',
                                          candidate_documents=[
                                              'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
                                              'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
                                              'Jersey top with narrow shoulder straps.',
                                              'uygnhjkl'
                                          ], 
                                          n_input_tags=10, n_candidate_tags=10)

input_tags, candidate_tags

([('jersey top', 1),
  ('narrow shoulder', 1),
  ('shoulder straps', 1),
  ('shoulder', 1),
  ('narrow', 1),
  ('straps', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'jersey top': 0.7799327616866192,
   'narrow shoulder': 0.7538538564770323,
   'shoulder straps': 0.6133740289496458,
   'shoulder': 0.612452844686896,
   'narrow': 0.595187858132562,
   'straps': 0.5852465015919929,
   'top': 0.39719267371105943,
   'jersey': 0.3619619206415009},
  {}])

In [6]:
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [7]:
te.save()
te = TagExtractor()
te.load()

In [8]:
input_tags, candidate_tags = te.transform(input_document='Jersey top with narrow shoulder straps.',
                                          candidate_documents=[
                                              'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
                                              'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
                                              'Jersey top with narrow shoulder straps.',
                                              'uygnhjkl'
                                          ], 
                                          n_input_tags=10, n_candidate_tags=10)

input_tags, candidate_tags

([('jersey top', 1),
  ('narrow shoulder', 1),
  ('shoulder straps', 1),
  ('straps', 1),
  ('narrow', 1),
  ('shoulder', 1),
  ('top', 1),
  ('jersey', 1)],
 [{},
  {},
  {'jersey top': 0.8772317763823081,
   'narrow shoulder': 0.7785568909779876,
   'shoulder straps': 0.6355535130824466,
   'straps': 0.6059983392044233,
   'narrow': 0.6053735593033779,
   'shoulder': 0.5555633523654915,
   'top': 0.36434324889827213,
   'jersey': 0.35461624727365776},
  {}])

In [9]:
new_candidate_ranking = te.rank(candidate_tags=candidate_tags, selected_tags=['jersey'])
new_candidate_ranking

[2, 0, 1, 3]

In [134]:
from tqdm import tqdm

class ContentRecommender:
    def __init__(self):
        lsi_vecs = np.array([self._lsi_corpus2vec(cl) for cl in tqdm(te.corpus_lsi)])
        self.lsi_vecs_normed = lsi_vecs/np.linalg.norm(lsi_vecs, axis=1)[:,None]
        self.lsi_topic_matrix_T_normed = te.lsi_topic_matrix.T/np.linalg.norm(te.lsi_topic_matrix.T, axis=1)[:,None]
        
    def _lsi_corpus2vec(self, lsi_corpus_vec):
        vec = np.zeros(te.num_lsi_topics)
        for ind, value in lsi_corpus_vec:
            vec[ind] = value
        return vec
    
    def _adjacent_tags(self, input_tags):
        bow = te.dictionary.doc2bow(input_tags)
        bow_vec = self._lsi_corpus2vec(te.lsi_model[te.tfidf[bow]])
        bow_vec_normed = bow_vec/np.linalg.norm(bow_vec)
        sims = bow_vec_normed.dot(self.lsi_topic_matrix_T_normed.T)
        top_inds = np.argsort(sims)[::-1]

        adj_tags = []
        count = 0
        for t in top_inds:
            tag = te.dictionary[t]
            if tag not in input_tags:
                count += 1
                adj_tags.append(tag)
            if count == self.n_adjacent_tags:
                break
        return adj_tags
    
    def _show(self, candidate_documents, candidate_tags, 
              new_input_document='',
              removed_tags=[], added_tags=[]):
        print(f"Input Document: {self.input_document}")
        print(f"Adjacent Tags: {self.input_adjacent_tags}")
        if len(new_input_document) > 0:
            print(f"\nNew Input Document: {new_input_document}")
        print(f"Input Tags: {self.input_tags}")
        if len(removed_tags) > 0:
            print(f"Removed Tags: {removed_tags}")
        if len(added_tags) > 0:
            print(f"Added Tags: {added_tags}")
        print('='*150)
        print()

        for n, candidate_doc in enumerate(candidate_documents):
            print(candidate_doc)
            display(candidate_tags[n])
            print('-'*150)
            if (n+1) == self.show_n_recs:
                break
                
    def _get_content_recommendations(self, input_document):
        _, input_corpus_lsi = te._get_vector_representations(input_document)
        input_lsi_vec = self._lsi_corpus2vec(input_corpus_lsi)
        input_lsi_vec_normed = input_lsi_vec/np.linalg.norm(input_lsi_vec)
        sims = input_lsi_vec_normed.dot(self.lsi_vecs_normed.T)
        top_inds = np.argsort(sims)[::-1][:(self.internal_n_recs+1)]
        self.candidate_documents = [documents[ti] for ti in top_inds if documents[ti]!=input_document]
        self.input_tags, self.candidate_tags = te.transform(input_document=input_document,
                                                            candidate_documents=self.candidate_documents, 
                                                            n_input_tags=self.n_input_tags, 
                                                            n_candidate_tags=self.n_candidate_tags)
        self.input_adjacent_tags = self._adjacent_tags(input_tags=[t[0] for t in self.input_tags])
    
    def initial_recommendations(self, input_document, 
                                internal_n_recs=50, show_n_recs=5,
                                n_input_tags=10, n_candidate_tags=5,
                                n_adjacent_tags=10):
        self.__dict__.update(locals())
        self._get_content_recommendations(self.input_document)
        self._show(self.candidate_documents, self.candidate_tags)
        
    def rerank(self, selected_tags):
        ranking = te.rank(candidate_tags=self.candidate_tags, 
                          selected_tags=selected_tags)
        candidate_documents = np.array(self.candidate_documents)[ranking]
        candidate_tags = np.array(self.candidate_tags)[ranking]
        self._show(candidate_documents, candidate_tags)
        
    def replace_tags(self, tags_to_remove=[], tags_to_add=[]):
        input_tags = list(set(te.tc.transform([self.input_document])[0]))
        print(input_tags)
        # input_tags = [tag for tag in input_tags if tag not in tags_to_remove]
        new_input_tags = []
        for tag in input_tags:
            flag = True
            for remove_tag in tags_to_remove:
                if (remove_tag not in tag) and (tag not in remove_tag):
                    flag = False
            if flag:
                new_input_tags.append(tag)
        new_input_tags += tags_to_add
        print(new_input_tags)
        input_document = ' '.join(new_input_tags)
        self._get_content_recommendations(input_document)
        self._show(self.candidate_documents, self.candidate_tags, 
                   removed_tags=tags_to_remove, added_tags=tags_to_add,
                   new_input_document=input_document)

In [135]:
recommender = ContentRecommender()

100%|██████████████████████████████████████████████████████████████████████████| 43404/43404 [00:22<00:00, 1958.36it/s]


In [136]:
recommender.initial_recommendations(input_document=documents[779], 
                                    internal_n_recs=50, show_n_recs=5,
                                    n_input_tags=10, n_candidate_tags=5, 
                                    n_adjacent_tags=10)

Input Document: 5-pocket jeans in washed stretch denim with a regular waist, zip fly and skinny legs.
Adjacent Tags: ['bootcut', 'ultra-flexible', '5-pocket jeans', 'keeps', 'technology', 'lycra®', 'freefit®', 'super-generous', 'function', 'white']
Input Tags: [('regular waist', 50), ('regular', 50), ('jeans', 49), ('5-pocket', 47), ('denim', 35), ('stretch', 10), ('waist zip', 5), ('stretch denim', 4)]

5-pocket jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular waist': 0.6028207223327273,
 'regular': 0.6011850999552139,
 'jeans': 0.5487427672609305,
 '5-pocket': 0.5025385498471577,
 'stretch': 0.4573499936719452}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in washed, stretch denim with a regular waist, zip fly and button, and skinny legs.


{'regular waist': 0.5909443795231419,
 'regular': 0.5884097379357883,
 'jeans': 0.5595405533048357,
 '5-pocket': 0.5223657454774103,
 'denim': 0.4725822954614911}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in washed stretch denim with a regular waist, zip fly and button, and skinny legs.


{'regular waist': 0.5909443795231419,
 'regular': 0.5884097379357883,
 'jeans': 0.5595405533048357,
 '5-pocket': 0.5223657454774103,
 'denim': 0.4725822954614911}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in washed stretch denim with a regular waist, zip fly and button at the waist, and skinny legs.


{'regular waist': 0.5931458520947404,
 'regular': 0.5916117406622277,
 'jeans': 0.5583473148655085,
 '5-pocket': 0.5216162785086719,
 'denim': 0.470882043014526}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket superskinny-fit jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular': 0.6038269979262616,
 'regular waist': 0.603794636815649,
 'jeans': 0.5362180144674937,
 '5-pocket': 0.4920809781546825,
 'stretch': 0.4741421375355526}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [137]:
recommender.rerank(selected_tags=['stretch','skinny','jeans'])

Input Document: 5-pocket jeans in washed stretch denim with a regular waist, zip fly and skinny legs.
Adjacent Tags: ['bootcut', 'ultra-flexible', '5-pocket jeans', 'keeps', 'technology', 'lycra®', 'freefit®', 'super-generous', 'function', 'white']
Input Tags: [('regular waist', 50), ('regular', 50), ('jeans', 49), ('5-pocket', 47), ('denim', 35), ('stretch', 10), ('waist zip', 5), ('stretch denim', 4)]

5-pocket superskinny-fit jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular': 0.6038269979262616,
 'regular waist': 0.603794636815649,
 'jeans': 0.5362180144674937,
 '5-pocket': 0.4920809781546825,
 'stretch': 0.4741421375355526}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in stretch denim with a regular waist, zip fly and skinny legs.


{'regular waist': 0.6028207223327273,
 'regular': 0.6011850999552139,
 'jeans': 0.5487427672609305,
 '5-pocket': 0.5025385498471577,
 'stretch': 0.4573499936719452}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in stretch denim with a regular waist, zip fly and button, and skinny legs.


{'regular waist': 0.5973427135374775,
 'regular': 0.594806260377072,
 'jeans': 0.5463888810800022,
 '5-pocket': 0.5098727280380733,
 'stretch': 0.4528149332318748}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in stretch denim with a regular waist, button fly and skinny legs.


{'regular waist': 0.589496907817101,
 'regular': 0.5855901614627093,
 'jeans': 0.5381789572414468,
 '5-pocket': 0.49592194136756756,
 'stretch': 0.4443003575832288}

------------------------------------------------------------------------------------------------------------------------------------------------------
Jeans in washed stretch denim with decorative beads on the front, a regular waist, zip fly and skinny legs.


{'regular': 0.5738765013947391,
 'regular waist': 0.5727754456700839,
 'jeans': 0.5169642127555306,
 'stretch': 0.4515667988206658,
 'stretch denim': 0.4466956881912558}

------------------------------------------------------------------------------------------------------------------------------------------------------


In [138]:
# TODO: This is not working correctly
recommender.replace_tags(tags_to_remove=['5-pocket','skinny'], tags_to_add=['bootcut'])

['skinny legs', 'stretch', 'waist', 'washed', 'denim', 'zip', 'stretch denim', 'jeans', 'legs', 'regular', 'skinny', '5-pocket jeans', 'fly', 'regular waist', 'waist zip', '5-pocket', 'zip fly']
['bootcut']
Input Document: 5-pocket jeans in washed stretch denim with a regular waist, zip fly and skinny legs.
Adjacent Tags: ['stretch denim', 'thermolite®', 'jeans', '5-pocket jeans', 'keeping', 'denim', 'holds', 'technical', 'function', '5-pocket']

New Input Document: bootcut
Input Tags: [('bootcut', 5)]
Removed Tags: ['5-pocket', 'skinny']
Added Tags: ['bootcut']

5-pocket bootcut jeans in washed stretch denim with a regular waist.


{'bootcut': 0.0052936658037313795}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket slim-fit jeans in washed stretch denim with an adjustable elasticated waist and zip fly.


{}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket skinny-fit jeans in washed stretch denim with a button fly. The jeans are made from thermal Thermolite® fabric.


{}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket skinny-fit jeans in washed stretch denim with an adjustable elasticated waist and zip fly and press-stud. The jeans are made from thermal Thermolite® fabric.


{}

------------------------------------------------------------------------------------------------------------------------------------------------------
5-pocket jeans in washed stretch denim with a button fly and skinny legs. The jeans are made of COOLMAX® material that is designed to keep you cool in hot weather.


{}

------------------------------------------------------------------------------------------------------------------------------------------------------
