#### Import packages:

In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import Ridge,Lasso, LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import stop_words
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn import grid_search
import warnings
import re
import nltk
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
from sklearn import pipeline, grid_search
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer

#### Loading train and test data and merging it:

In [3]:
train=pd.read_csv('./Home Depot Product Search Relevance/train.csv', index_col=0, header=0,encoding="ISO-8859-1")
test=pd.read_csv('./Home Depot Product Search Relevance/test.csv', index_col=0, header=0,encoding="ISO-8859-1")
print(train.columns)
print(test.columns)

Index(['product_uid', 'product_title', 'search_term', 'relevance'], dtype='object')
Index(['product_uid', 'product_title', 'search_term'], dtype='object')


In [4]:
data = pd.concat((train, test), axis=0, ignore_index=True)

In [5]:
def spellchecker(word):
    if isinstance(word, str):
        word = word.replace("toliet","toilet")
        word = word.replace("airconditioner","air condition")
        word = word.replace("vinal","vinyl")
        word = word.replace("vynal","vinyl")
        word = word.replace("skill","skil")
        word = word.replace("snowbl","snow bl")
        word = word.replace("plexigla","plexi gla")
        word = word.replace("rustoleum","rust oleum")
        word = word.replace("whirpool","whirlpool")
        word = word.replace("whirlpoolga", "whirlpool ga")
        word = word.replace("whirlpoolstainless","whirlpool stainless")
        word = word.replace("cablrail","CableRail")
        word = word.replace("whellbarrow","wheelbarrow")
        word = word.replace("kolher","kohler")
        word = word.replace("rustolem","rust oleum")
        word = word.replace("hagchet","hatchet")
        word = word.replace("fridgedaire","frigidaire")
        word = word.replace("doorbell","door bell")
        word = word.replace("traficmaster","trafficmaster")
        word = word.replace("freesstanding","freestanding")
        word = word.replace("versatiube","versatube")
        word = word.replace("walmound","wall mount")
        word = word.replace("makersbot","maker bot")
        word = word.replace("pfistersaxton","pfister saxton")
        word = word.replace("flotex","flotec")
        word = word.replace("windowbalance","window balance")
        word = word.replace("meu","med")
        word = word.replace("loadspeaker","loudspeaker")
        word = word.replace("tapecase","tape case")
        word = word.replace("spgot","spigot")
        word = word.replace("whitesilicone","white silicone")
        word = word.replace("bracket","angle")
        word = word.replace("simpon","simpson")
        word = word.replace("papertowels","paper towels")
        word = word.replace("organozers","organizer")
        word = word.replace("miraposa","mariposa")
        if word=="deck over":
            word = "deckover"
    word=re.sub('[,.;:"!@#$-]', ' ', word).strip().lower()
    return word
data['search_term']=data['search_term'].map(lambda x: spellchecker(x))
data['product_title']=data['product_title'].map(lambda x: spellchecker(x))

In [6]:
data.head()

Unnamed: 0,product_title,product_uid,relevance,search_term
0,simpson strong tie 12 gauge angle,100001,3.0,angle angle
1,simpson strong tie 12 gauge angle,100001,2.5,l angle
2,behr premium textured deckover 1 gal sc 141 t...,100002,3.0,deckover
3,delta vero 1 handle shower only faucet trim ki...,100005,2.33,rain shower head
4,delta vero 1 handle shower only faucet trim ki...,100005,2.67,shower only faucet


#### Loading attributes information. I am going to use brand and material information from that (according to instructions):

In [7]:
attributes=pd.read_csv('./Home Depot Product Search Relevance/attributes.csv',header=0, dtype=str)
attributes.fillna('', inplace=True)

In [8]:
%%time
brand=attributes[['product_uid','value']][attributes.name=='MFG Brand Name']
brand.columns=['product_uid','brand']
brand['brand']=brand.brand.apply(lambda x: x.lower())
material=attributes[['product_uid','value']][attributes.name.str.contains("Material")]
dframe=pd.DataFrame(columns=['product_uid','material'])
product_uid=[]
material_lst=[]
for u in material.product_uid.unique():
    lst=""
    m=material[material.product_uid==u]
    for v in m.value.unique():
        lst=lst+v.lower()
    product_uid.append(u)
    material_lst.append(lst)
dframe['product_uid']=product_uid
dframe['material']=material_lst
dframe['material']=dframe.material.apply(lambda x: x.lower())

CPU times: user 6min 21s, sys: 3.64 s, total: 6min 24s
Wall time: 6min 46s


In [9]:
%%time
functionality=attributes[['product_uid','value']][attributes.name.str.contains("Bullet")]
func_frame=pd.DataFrame(columns=['product_uid','functionality'])
product_uid=[]
func_lst=[]
for u in functionality.product_uid.unique():
    lst=""
    f=functionality[functionality.product_uid==u]
    for v in f.value.unique():
        lst=lst+re.sub('[,.;:"!@#$-]', ' ', v).strip().lower()
    product_uid.append(u)
    func_lst.append(lst)
func_frame['product_uid']=product_uid
func_frame['functionality']=func_lst

CPU times: user 1h 23min 24s, sys: 16 s, total: 1h 23min 40s
Wall time: 1h 24min 26s


#### Loading product description information:

In [10]:
proddescription=pd.read_csv('./Home Depot Product Search Relevance/product_descriptions.csv',header=0)

#### Merging all data (train-test set, attributes and product description information):

In [11]:
data= pd.merge(data, proddescription, how='left', on='product_uid')
dframe['product_uid']=dframe.product_uid.astype(int)
data= pd.merge(data, dframe, how='left', on='product_uid')
brand['product_uid']=brand.product_uid.astype(int)
data= pd.merge(data, brand, how='left', on='product_uid')
func_frame['product_uid']=func_frame.product_uid.astype(int)
data= pd.merge(data, func_frame, how='left', on='product_uid')

In [12]:
data[['material','brand','functionality']]=data[['material','brand','functionality']].fillna("")

In [13]:
data['product_title']=data.product_title.apply(lambda x: re.sub('[,.;:"!@#$-]', ' ', x).strip().lower())
data['search_term']=data.search_term.apply(lambda x: re.sub('[,.;:"!@#$-]', ' ', x).strip().lower())
data['product_description']=data.product_description.apply(lambda x: re.sub('[,.;:"!@#$-]', ' ', x).strip().lower())
data['brand']=data.brand.apply(lambda x: re.sub('[,.;:"!@#$-]', ' ', x).strip().lower())

#### I am going to stem words in product_title, search_term, product_description, material, brand and functionality

In [42]:
def stop_word(x,d=1):
    if d==1:
        stop_words = ['for', 'and', 'in', 'on','with','what','from','that','less']
    else:
        stop_words = nltk.corpus.stopwords.words('english')
    for word in x.split(" "):
        if word in stop_words:
            x.replace(word,"")
    return x
data['search_term']=data['search_term'].apply(lambda x: stop_word(x,1))
data['product_description']=data['product_description'].apply(lambda x: stop_word(x,2))
data['functionality']=data['functionality'].apply(lambda x: stop_word(x,2))

In [14]:
%%time
stem=nltk.stem.PorterStemmer()
def stemmer(x):
    if len(x)>0:
        x = (" ").join([stem.stem(str(z)) for z in x.split(" ")])
        return x
    else:
        return 'null'
for col in ['product_title','search_term','product_description','material','brand','functionality']:
    data[col]=data[col].apply(lambda x: stemmer(x))

CPU times: user 19min 2s, sys: 3.35 s, total: 19min 6s
Wall time: 19min 12s


#### Create variables: quantity of word which are in search_term and in product_title, product_description, material, brand, functionality:

In [76]:
variables=['len_st']
for col in ['product_title','product_description','material','brand','functionality']:
    data['st_in_'+col]=data[['search_term', col]].apply(lambda x: len([w for w in x[0].split(" ") if w in x[1].split(" ")]), axis=1)
    #data['rat_'+col]=data[['search_term', col]].apply(lambda x: len([w for w in x[0].split(" ") if w in x[1].split(" ")])/len(x[0].split(" ")), axis=1)
    data['len_'+col]=data[col].apply(lambda x: len(x.split(" ")))
    variables.append('st_in_'+col)
    variables.append('rat_'+col)
    variables.append('len_'+col)
data['len_st']=data.search_term.apply(lambda x: len(x.split(" ")))
data['st_in_brand']=data['st_in_brand'].apply(lambda x:2*x)
data['st_in_functionality']=data['st_in_functionality'].apply(lambda x:2*x)

In [77]:
lentr=train.shape[0]
data_train=data[:lentr]
data_test=data[lentr:]
Y_train=data_train['relevance']

#### Use XGBRegressor

In [78]:
X_train=data_train[variables]
X_test=data_test[variables]
estimator=XGBRegressor(seed=0)
#estimator1 = RandomForestRegressor(random_state = 0, n_jobs=-1, verbose=1)
#estimator=BaggingRegressor(estimator1,random_state=0,n_estimators=15)
estimator.get_params().keys()

dict_keys(['base_score', 'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'nthread', 'objective', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample'])

In [79]:
params_grid = {
    'max_depth':[4,5,6],
    'n_estimators' : [75,80,85,90,95]
}
grid = grid_search.GridSearchCV(estimator, params_grid, scoring = 'neg_mean_squared_error', cv = 5)
grid.fit(X=X_train,y=Y_train)
print(grid.best_score_)
print(grid.best_estimator_)

-0.23531153820586398
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=95, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


#### Save results

In [80]:
estimator=grid.best_estimator_
y_test=estimator.predict(X_test)
test['pred']=np.round(y_test,2)
test['pred'].to_csv('submision.csv',sep=',')

#### Final score 0.48685