In [72]:
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# English Snowball stemmer shared by the stemming helper further down.
stemmer = SnowballStemmer('english')

# Folder holding the input CSVs; change this one constant to relocate them.
DATA_DIR = 'D:/_Barq/HDPSR'

df_train = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR + '/test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(DATA_DIR + '/product_descriptions.csv')
df_attribute = pd.read_csv(DATA_DIR + '/attributes.csv', sep=",", encoding="ISO-8859-1")

# Build one "name value" string per attribute row. Missing names/values are
# replaced by '' first: otherwise `name + ' ' + value` is NaN and would later be
# stringified into the literal token "nan" inside the product's attribute text.
# Bracket access is used instead of `df.name` / `df.value` so the lookup can
# never be shadowed by a DataFrame attribute of the same name.
df_attribute['attribute'] = (
    df_attribute['name'].fillna('').astype(str)
    + ' '
    + df_attribute['value'].fillna('').astype(str)
).str.strip()
df_attribute = df_attribute.drop(['name', 'value'], axis=1)

# Collapse to one row per product: all of its attribute strings joined by spaces.
df_attribute = (
    df_attribute.groupby(['product_uid'])['attribute']
    .apply(' '.join)
    .reset_index()
)

# Tokens dropped during cleaning: NLTK's English stop words plus the word
# "bullet" and the single letters listed below.
extra_stops = 'bullet b c e f g h j k l m n o p q r s t u v w x y z'.split()
stops = set(stopwords.words("english")).union(extra_stops)
def cleaner(info,stops):
    """Normalise free text into lower-case, space-separated tokens.

    Steps, in order:
      1. insert a space at every lower-case -> upper-case boundary
         (so "SteelFrame" becomes "Steel Frame");
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case and split on whitespace;
      4. drop every token contained in `stops`.

    Parameters
    ----------
    info : str
        Raw text to clean.
    stops : container of str
        Tokens to remove (tested with `in`).

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    camel_split = re.sub(r'([a-z])([A-Z])',r'\1 \2',info)
    letters_only = re.sub(r'[^a-zA-Z]',' ',camel_split)
    kept = []
    for token in letters_only.lower().split():
        if token in stops:
            continue
        kept.append(token)
    return " ".join(kept)
    
# Apply the same cleaning to every free-text column: each value is cast to str
# and passed through cleaner() with the shared stop-word set.
for frame, column in (
    (df_pro_desc, 'product_description'),
    (df_train, 'product_title'),
    (df_test, 'product_title'),
    (df_attribute, 'attribute'),
):
    frame[column] = frame[column].astype('str').map(lambda text: cleaner(text, stops))



In [73]:
# Remember how many rows belong to the training set so the stacked frame can be
# split back into train/test by position later on.
num_train = df_train.shape[0]

# Stack train on top of test with a fresh 0..n-1 index.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# Attach description and attribute text to every row. `validate='many_to_one'`
# makes pandas raise a MergeError if `product_uid` is duplicated on the right
# side; without it such duplicates would silently add rows and the positional
# train/test split based on `num_train` would be misaligned.
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid',
                  validate='many_to_one')
df_all = pd.merge(df_all, df_attribute, how='left', on='product_uid',
                  validate='many_to_one')

def str_stemmer(s):
    """Lower-case `s`, split it on whitespace and stem each token.

    Uses the module-level `stemmer`. Returns the stemmed tokens joined by
    single spaces.
    """
    stems = map(stemmer.stem, s.lower().split())
    return " ".join(stems)

def str_common_word(str1, str2):
    """Count how many whitespace-separated tokens of `str1` occur in `str2`.

    Matching is by substring, not by whole word: the token "wood" counts as
    found in "wooden". Repeated tokens in `str1` are counted each time.

    Returns an int (0 when `str1` has no tokens).
    """
    hits = 0
    for word in str1.split():
        if word in str2:
            hits += 1
    return hits

# Stem every text column. Rows with no text (e.g. products without attribute
# rows, left as NaN by the left merge) become '' instead of the literal string
# "nan", which would otherwise substring-match query tokens such as "a", "an"
# or "na" in the counts below.
for text_col in ('search_term', 'product_title', 'product_description', 'attribute'):
    df_all[text_col] = df_all[text_col].fillna('').astype(str).map(str_stemmer)

# Number of whitespace-separated tokens in the stemmed query.
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

# Tab-joined view of all four text fields. Kept so df_all exposes the same
# columns as before; the features below no longer parse it.
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+ \
                        "\t"+df_all['product_description']+"\t"+df_all['attribute']

# For each text field, count the query tokens that occur in it. Computed
# straight from the columns rather than by re-splitting `product_info` twice
# per row for every feature.
for feature, text_col in (
    ('word_in_title', 'product_title'),
    ('word_in_description', 'product_description'),
    ('word_in_attribute', 'attribute'),
):
    df_all[feature] = [
        str_common_word(query, text)
        for query, text in zip(df_all['search_term'], df_all[text_col])
    ]

# Keep only the non-text columns for modelling.
df_data = df_all.drop(['search_term','product_title','product_description', \
                      'product_info','attribute'],axis=1)

# Split back by position: the first `num_train` rows are the training set.
df_train = df_data.iloc[:num_train]
df_test = df_data.iloc[num_train:]
id_test = df_test['id']

# Target and feature matrices; `id` and `relevance` are excluded from the features.
y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values



In [74]:
# Base learner: a random forest of 40 trees, each limited to depth 15.
rf = RandomForestRegressor(n_estimators=40, max_depth=15, random_state=0)
# Bagging ensemble of 100 such forests, configured with max_samples=0.1.
clf = BaggingRegressor(rf, n_estimators=100, max_samples=0.1, random_state=25)

# Fit on the training matrix, then score the test matrix.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Write the predictions next to their test ids, without the index column.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('D:/_Barq/HDPSR/Keyword_Search_RFR.csv',index=False)