In [1]:
from gensim import corpora, models, similarities, matutils
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

stemmer = SnowballStemmer('english')

df_train = pd.read_csv("D:/_Barq/HDPSR/train.csv", encoding="ISO-8859-1")
df_test = pd.read_csv('D:/_Barq/HDPSR/test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv('D:/_Barq/HDPSR/product_descriptions.csv')
df_attribute = pd.read_csv("D:/_Barq/HDPSR/attributes.csv", sep=",", encoding="ISO-8859-1")
df_attribute['attribute'] = df_attribute.name + ' ' + df_attribute.value
df_attribute = df_attribute.drop(['name', 'value'], axis=1)
df_attribute = df_attribute.groupby(['product_uid'])['attribute'].apply( \
                    lambda x: ' '.join(x.astype('str'))).reset_index()

stops = set(stopwords.words("english")) \
        | set('bullet b c e f g h j k l m n o p q r s t u v w x y z'.split())
def cleaner(info,stops):
        info = re.sub(r'([a-z])([A-Z])',r'\1 \2',info) 
        info = re.sub(r'[^a-zA-Z]',' ',info) 
        info = info.lower().split() 
        info = [word for word in info if not word in stops] 
        return " ".join( info )
    
df_pro_desc.product_description = [cleaner(info,stops) \
                                   for info in df_pro_desc.product_description.astype('str')]
df_train.product_title = [cleaner(info,stops) \
                                   for info in df_train.product_title.astype('str')]
df_test.product_title = [cleaner(info,stops) \
                                   for info in df_test.product_title.astype('str')]
df_attribute.attribute = [cleaner(info,stops) \
                                   for info in df_attribute.attribute.astype('str')]

print('All libraries imported.')

All libraries imported.


In [None]:
num_train = df_train.shape[0]
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_attribute, how='left', on='product_uid')

def str_stemmer(s):
	return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
	return sum(int(str2.find(word)>=0) for word in str1.split())

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))
df_all['attribute'] = df_all['attribute'].map(lambda x:str_stemmer(str(x)))

In [None]:
descriptions = df_all.product_description[:num_train]

terms = [[term for term in all_term if term not in stoplist] 
        for all_term in all_terms]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for term in terms:
    for token in term:
        frequency[token] += 1

terms = [[token for token in term if frequency[token] > 1]
          for term in terms]

dictionary = corpora.Dictionary(terms)
dictionary.save('D:/_Barq/HDPSR/Discription.dict') # store the dictionary, for future reference

corpus = [dictionary.doc2bow(term) for term in terms]
corpora.MmCorpus.serialize('D:/_Barq/HDPSR/Discription.mm', corpus) # store to disk, for later use
print(dictionary)

In [99]:
train = pd.read_csv('D:/_Barq/HDPSR/train.csv',sep=",", encoding="ISO-8859-1")
test = pd.read_csv('D:/_Barq/HDPSR/test.csv',sep=",", encoding="ISO-8859-1")

train_df = train[['product_uid', 'product_title']]
test_df = test[['product_uid', 'product_title']]
titles_df = train_df.append(test_df)
titles_df = titles_df.drop_duplicates()

# Split joint capital character words
titles = [re.sub(r'([a-z])([A-Z])',r'\1 \2',title) 
                for title in titles_df.product_title]
# Remove some unnecessary chars and split titles
all_title_terms = (re.findall('([a-z]+)', title.lower()) for title in titles)

# remove common words and tokenize
stoplist = set('for a of the and to in at from or on with '
                'be am is are was were '
                'it he she they you we ' 
                'will would should shall may might must '
                'do does ' 
                'my her his them our us '
                'mine hers his yours theirs its '
                'not only also which that this these those '
                '1 2 3 4 5 6 7 8 9 0 '
                'a b c d e f g h i j k l m n o p q r s t u v w x y z xx'
                'where when who why'.split())

title_terms = [[term for term in all_term if term not in stoplist] 
        for all_term in all_title_terms]

dictionary = corpora.Dictionary(title_terms)
dictionary.save('D:/_Barq/HDPSR/Titles.dict') # store the dictionary, for future reference

corpus = [dictionary.doc2bow(term) for term in title_terms]
corpora.MmCorpus.serialize('D:/_Barq/HDPSR/Titles.mm', corpus) # store to disk, for later use
print(dictionary)

Dictionary(20514 unique tokens: ['baits', 'condenser', 'lillington', 'sub', 'gala']...)


In [277]:
attributes_df = pd.read_csv('D:/_Barq/HDPSR/attributes.csv',sep=",", encoding="ISO-8859-1")
attributes_df['name_value'] = attributes_df.name + ' ' + attributes_df.value
attributes_df = attributes_df.drop(['name', 'value'], axis=1)
attributes_df = attributes_df.groupby(['product_uid'])['name_value'
                ].apply(lambda x: ' '.join(x.astype('str'))).reset_index()

# Split joint capital character words
attributes = [re.sub(r'([a-z])([A-Z])',r'\1 \2',attribute) 
                for attribute in attributes_df.name_value]
# Remove some unnecessary chars and split titles
all_attribute_terms = (re.findall('([a-z]+)', attribute.lower()) for attribute in attributes)

# remove common words and tokenize
stoplist = set('for a of the and to in at from or on with '
                'be am is are was were '
                'it he she they you we ' 
                'will would should shall may might must '
                'do does ' 
                'my her his them our us '
                'mine hers his yours theirs its '
                'not only also which that this these those '
                '1 2 3 4 5 6 7 8 9 0 '
                'a b c d e f g h i j k l m n o p q r s t u v w x y z xx'
                'where when who why'
                'bullet'.split())

attribute_terms = [[term for term in all_term if term not in stoplist] 
        for all_term in all_attribute_terms]

dictionary = corpora.Dictionary(attribute_terms)
dictionary.save('D:/_Barq/HDPSR/Attributes.dict') # store the dictionary, for future reference

corpus = [dictionary.doc2bow(term) for term in title_terms]
corpora.MmCorpus.serialize('D:/_Barq/HDPSR/Attributes.mm', corpus) # store to disk, for later use
print(dictionary)

Dictionary(29506 unique tokens: ['baits', 'torques', 'sub', 'chiropractic', 'mini']...)
