In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# English Snowball stemmer, shared through this module-level name.
stemmer = SnowballStemmer('english')

# Read the four input tables. num_train records how many rows the training
# table has at load time.
df_train = pd.read_csv('D:/_Barq/HDPSR/train.csv', encoding="ISO-8859-1")
num_train = len(df_train.index)
df_test = pd.read_csv('D:/_Barq/HDPSR/test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv('D:/_Barq/HDPSR/product_descriptions.csv')
df_attribute = pd.read_csv('D:/_Barq/HDPSR/attributes.csv', sep=",", encoding="ISO-8859-1")

# Fuse each attribute's name and value into a single text field, discard the
# two source columns, then concatenate all attribute texts of one product_uid
# into one space-separated string (one row per product_uid).
df_attribute['attribute'] = df_attribute['name'] + ' ' + df_attribute['value']
df_attribute = df_attribute.drop(['name', 'value'], axis=1)
attribute_by_product = df_attribute.groupby(['product_uid'])['attribute']
df_attribute = attribute_by_product.apply(
    lambda x: ' '.join(x.astype('str'))).reset_index()

# Stop set = NLTK English stop words plus the word "bullet" and the listed
# single letters.
extra_stops = 'bullet b c e f g h j k l m n o p q r s t u v w x y z'.split()
stops = set(stopwords.words("english")).union(extra_stops)
def cleaner(info, stops):
    """Normalise a piece of text into lower-case, space-separated words.

    Steps, in order:
      1. insert a space wherever a lower-case letter is directly followed by
         an upper-case letter (splits run-together words like "fooBar");
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case and split on whitespace;
      4. drop every word contained in ``stops``.

    Parameters:
        info:  the text to clean (a ``str``).
        stops: container of lower-case words to remove.

    Returns:
        The surviving words joined by single spaces ('' when none survive).
    """
    spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', info)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', spaced)
    kept = []
    for token in letters_only.lower().split():
        if token in stops:
            continue
        kept.append(token)
    return " ".join(kept)
    
# Run cleaner() over every free-text column, replacing the column in place.
# Values are cast to str first, so non-string cells are cleaned as their
# string form.
for frame, column in (
    (df_pro_desc, 'product_description'),
    (df_train, 'product_title'),
    (df_test, 'product_title'),
    (df_attribute, 'attribute'),
):
    frame[column] = [cleaner(text, stops) for text in frame[column].astype('str')]

print('Data is ready')

Data is ready


In [2]:

# Stack the training rows on top of the test rows with a fresh 0..n-1 index,
# then left-join the description and attribute tables on product_uid so every
# query row is kept even when a lookup table has no match for it.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
for side_table in (df_pro_desc, df_attribute):
    df_all = df_all.merge(side_table, how='left', on='product_uid')

def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace and stem each word.

    Uses the module-level ``stemmer``. Returns the stemmed words joined by
    single spaces.
    """
    stems = []
    for token in s.lower().split():
        stems.append(stemmer.stem(token))
    return " ".join(stems)

def str_common_word(str1, str2):
    """Count the whitespace-separated words of ``str1`` that occur in ``str2``.

    Matching is by substring (``str2.find``), so a word also counts when it
    appears inside a longer word of ``str2``. Repeated words in ``str1`` are
    counted once per occurrence.

    Returns:
        The number of matching words as an ``int`` (0 when ``str1`` is empty).
    """
    hits = 0
    for token in str1.split():
        if str2.find(token) != -1:
            hits += 1
    return hits

# Stem every text column so the query and the product texts share one
# vocabulary.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)
# The attribute table is left-merged, so rows without a match hold NaN here.
# Passing NaN through str() used to turn it into the literal word "nan",
# which then leaked into 'attribute' and 'product_info'. Replace missing
# values with an empty string before stemming instead.
df_all['attribute'] = df_all['attribute'].fillna('').map(lambda x: str_stemmer(str(x)))

# Number of whitespace-separated words in the stemmed query.
df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64)

# One space-separated document per row: query + title + description + attributes.
df_all['product_info'] = df_all['search_term'] + " " + df_all['product_title'] + \
                        " " + df_all['product_description'] + " " + df_all['attribute']

# Persist the assembled frame, then show its first rows as the cell output.
df_all.to_csv('D:/_Barq/HDPSR/df_all.csv',index=False)
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,attribute,len_of_query,product_info
0,2,simpson strong tie gaug angl,100001,3.0,angl bracket,angl make joint stronger also provid consist s...,versatil connector various connect home repair...,2,angl bracket simpson strong tie gaug angl angl...
1,3,simpson strong tie gaug angl,100001,2.5,l bracket,angl make joint stronger also provid consist s...,versatil connector various connect home repair...,2,l bracket simpson strong tie gaug angl angl ma...
2,9,behr premium textur deck gal sc tugboat wood c...,100002,3.0,deck over,behr premium textur deckov innov solid color c...,applic method brush roller spray assembl depth...,2,deck over behr premium textur deck gal sc tugb...
3,16,delta vero handl shower faucet trim kit chrome...,100005,2.33,rain shower head,updat bathroom delta vero singl handl shower f...,bath faucet type combo tub shower built water ...,3,rain shower head delta vero handl shower fauce...
4,17,delta vero handl shower faucet trim kit chrome...,100005,2.67,shower onli faucet,updat bathroom delta vero singl handl shower f...,bath faucet type combo tub shower built water ...,3,shower onli faucet delta vero handl shower fau...


In [None]:
# Corpus and dictionary
from gensim import corpora, models, similarities, matutils

# Each product_info entry is one space-separated string, so split it on
# whitespace to get word tokens. Iterating over the string itself (as the
# previous version did) yields single characters, which made the dictionary
# a character vocabulary rather than a word vocabulary.
terms = [doc.split() for doc in df_all.product_info]

dictionary = corpora.Dictionary(terms)
dictionary.save('D:/_Barq/HDPSR/all_info.dict') # store the dictionary, for future reference

# Bag-of-words vector for every document, in df_all row order.
corpus = [dictionary.doc2bow(doc_terms) for doc_terms in terms]
corpora.MmCorpus.serialize('D:/_Barq/HDPSR/all_info.mm', corpus) # store to disk, for later use
print(dictionary)


In [None]:
# Reload the dictionary and the serialized corpus from disk.
dictionary = corpora.Dictionary.load('D:/_Barq/HDPSR/all_info.dict')
corpus = corpora.MmCorpus('D:/_Barq/HDPSR/all_info.mm')
print('Discriptions Dictionary and corpus were loaded.')

# Fit a 300-topic LDA model on the whole corpus.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=300)
print('Discriptions LDA model was constructed . ')

# Topic distribution of one sample document. The original line had an empty
# subscript (`product_info[]`), which is a SyntaxError and kept the whole cell
# from running. doc2bow() also needs a list of tokens rather than a raw
# string, hence the split().
# NOTE(review): row 0 is an arbitrary choice; the original did not say which
# row was meant -- confirm.
sample_topics = lda[dictionary.doc2bow(df_all.product_info[0].split())]

lda.save('D:/_Barq/HDPSR/all_info.lda')
print('Discriptions LDA model was saved.')

In [None]:
# Count, for every row, how many query words occur in the title, the
# description and the attribute text.
#
# The previous version recovered those texts with product_info.split('\t'),
# but product_info is joined with spaces, so the split produced a single
# element and indexing [1]/[2]/[3] raised IndexError. Read the source columns
# directly instead.
for feature_name, text_column in (
    ('word_in_title', 'product_title'),
    ('word_in_description', 'product_description'),
    ('word_in_attribute', 'attribute'),
):
    df_all[feature_name] = [
        str_common_word(query, text)
        for query, text in zip(df_all['search_term'], df_all[text_column])
    ]

In [74]:
# Keep only the non-text columns, then split back into train / test using the
# row count recorded when the training file was loaded.
text_columns = ['search_term', 'product_title', 'product_description',
                'product_info', 'attribute']
df_data = df_all.drop(text_columns, axis=1)
df_train = df_data.iloc[:num_train]
df_test = df_data.iloc[num_train:]
id_test = df_test['id']

# Target is 'relevance'; every remaining column except 'id' is a feature.
non_feature_columns = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_columns, axis=1).values
X_test = df_test.drop(non_feature_columns, axis=1).values

# Bagging ensemble of random forests, each fitted on a 10% sample of the
# training rows (max_samples=0.1).
rf = RandomForestRegressor(n_estimators=40, max_depth=15, random_state=0)
clf = BaggingRegressor(rf, n_estimators=100, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Write the predictions as an id / relevance table.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('D:/_Barq/HDPSR/Keyword_Search_RFR.csv', index=False)

In [8]:
# Display the assembled text document of the row labelled 3.
df_all.loc[3, 'product_info']

'rain shower head delta vero handl shower faucet trim kit chrome valv includ updat bathroom delta vero singl handl shower faucet trim kit chrome sleek modern minimalist aesthet multi choic univers valv keep water temperatur within degre fahrenheit help prevent scald california resid see nbsp proposit inform includ trim kit rough kit unbx sold separ includ handl maintain balanc pressur hot cold water even valv turn elsewher system due water sens regul state new york pleas confirm ship zip code restrict use item meet water sens qualif bath faucet type combo tub shower built water filter includ trim kit rough kit unbx sold separ includ handl maintain balanc pressur hot cold water even valv turn elsewher system due water sens regul state new york pleas confirm ship zip code restrict use item meet water sens qualif certif list ada compliant csa certifi iapmo certifi color famili chrome color finish chrome connect size faucet featur addit featur faucet includ compon handl pressur balanc scal