In [2]:
import pandas as pd
import numpy as np
import re
from gensim import corpora, models, similarities

In [5]:
product_descriptions = pd.read_csv('D:/_Barq/HDPSR/product_descriptions.csv')
descriptions = product_descriptions.product_description

# Split joint capital character words
descriptions = [re.sub(r'([a-z])([A-Z])',r'\1 \2',description) 
                for description in descriptions]
# Remove some unnecessary chars
all_terms = (re.findall('([a-z]+)', description.lower()) for description in descriptions)

# remove common words and tokenize
stoplist = set('for a of the and to in at from or on with '
                'be am is are was were '
                'it he she they you we ' 
                'will would should shall may might must '
                'do does ' 
                'my her his them our us '
                'mine hers his yours theirs its '
                'not only also which that this these those '
                '1 2 3 4 5 6 7 8 9 0 '
                'a b c d e f g h i j k l m n o p q r s t u v w x y z xx'
                'where when who why'.split())

terms = [[term for term in all_term if term not in stoplist] 
        for all_term in all_terms]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for term in terms:
    for token in term:
        frequency[token] += 1

terms = [[token for token in term if frequency[token] > 1]
          for term in terms]

dictionary = corpora.Dictionary(terms)
dictionary.save('D:/_Barq/HDPSR/Discription.dict') # store the dictionary, for future reference

corpus = [dictionary.doc2bow(term) for term in terms]
corpora.MmCorpus.serialize('D:/_Barq/HDPSR/Discription.mm', corpus) # store to disk, for later use
print(dictionary)

Dictionary(36624 unique tokens: ['incandescent', 'subsidiary', 'skills', 'relit', 'sigman']...)


In [6]:
index = similarities.docsim.Similarity(None, corpus, num_features=36624)
index.save('D:/_Barq/HDPSR/Discription.index')
print(1)

1


In [3]:
test = pd.read_csv('D:/_Barq/HDPSR/test.csv',sep=",", encoding="ISO-8859-1")
dictionary = corpora.Dictionary.load('D:/_Barq/HDPSR/Discription.dict')
index = similarities.docsim.Similarity.load('D:/_Barq/HDPSR/Discription.index')

ident = []
relev = []
for idx, search_term in enumerate(test.search_term):
    similarities = index[dictionary.doc2bow(search_term.lower().split())]
    relev.extend([2+similarities[test.product_uid[idx]-100001]])
    ident.extend([test.id[idx]])
    
df = pd.DataFrame({'id': ident, 'relevance': relev})
df.to_csv('D:/_Barq/HDPSR/MyPredDocsimDescription.csv',index=False)
df.describe()

Unnamed: 0,id,relevance
count,166693.0,166693.0
mean,123932.839741,2.173486
std,71518.389174,0.138988
min,1.0,2.0
25%,61669.0,2.063246
50%,124004.0,2.15523
75%,187036.0,2.262613
max,240760.0,2.860073
