In [None]:
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn import pipeline, grid_search
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import mean_squared_error, make_scorer
    from nltk.stem.porter import *
    stemmer = PorterStemmer()
    import re
    import random
    random.seed(2016)


## Importing the dataset

In [None]:
# Folder that holds the four input CSV files. Change this single value to run
# the notebook against a different location.
DATA_DIR = 'C:/Users/sneha/Google Drive/Github/DataSet/Kaggle/data'

# train/test are read as ISO-8859-1; the two product tables use pandas' default encoding.
df_train = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR + '/test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(DATA_DIR + '/product_descriptions.csv')
df_attr = pd.read_csv(DATA_DIR + '/attributes.csv')

## Creating new DataFrames

In [None]:
# One (product_uid, brand) row per "MFG Brand Name" attribute entry.
df_brand = (
    df_attr.loc[df_attr["name"] == "MFG Brand Name", ["product_uid", "value"]]
    .rename(columns={"value": "brand"})
)

# Stack train on top of test, then attach description and brand by product_uid.
# NOTE(review): a product with several brand rows would duplicate its rows in
# the left join — confirm product_uid is unique in df_brand.
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .merge(df_pro_desc, how='left', on='product_uid')
    .merge(df_brand, how='left', on='product_uid')
)

In [None]:
# Number of training rows; df_all holds the train rows first, then the test rows.
num_train = len(df_train)

## Pre-processing

In [None]:
# Spelled-out digits mapped to their integer value (e.g. "two" -> 2).
strNum = {'zero':0,'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9}

# Build the token -> replacement lookup from 'spelling.txt', which holds one
# "token|replacement" pair per line.
zspell = {}
with open('spelling.txt','r') as f:  # context manager closes the file even if a line fails to parse
    for line in f:
        line = line.strip("\n")
        if not line:
            # Tolerate empty lines (e.g. a trailing newline at the end of the file).
            continue
        # Exactly one "|" is expected; anything else raises ValueError so bad rows are noticed.
        a, b = line.split("|")
        zspell[a]=b

In [None]:
import string
from nltk.corpus import stopwords
def text_process(s):
    """
    Normalise a free-text field into a cleaned, stemmed, space-separated string.

    Steps, in order:
    1. Split fused words ("a.A", "aA"), lower-case, and strip/replace punctuation.
    2. Rewrite multiplication markers (" x ", "*", " by ") as the token "xbi".
    3. Collapse measurement units onto their number ("4 inches" -> "4in. ").
    4. Remove English stopwords, convert number words via ``strNum``,
       stem every token with ``stemmer``, and apply ``zspell`` replacements.

    Parameters
    ----------
    s : str
        Raw text. Any non-string value (e.g. NaN from a missing cell) yields "null".

    Returns
    -------
    str
        The cleaned text as a single string (not a list).
    """
    if not isinstance(s, str):
        # Missing values arrive as float NaN; the regex calls below would raise on them.
        return "null"

    # Load the stopword list once per call instead of once per token.
    stop_words = set(stopwords.words('english'))

    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) #Split words with a.A
    s = re.sub(r"([a-z])([A-Z])", r"\1 \2", s) #Split words with a A
    s = s.lower()
    s = s.replace("  "," ")
    s = re.sub(r"([0-9])( *),( *)([0-9])", r"\1\4", s) #drop thousands separators
    s = s.replace(","," ")
    s = s.replace("$"," ")
    s = s.replace("?"," ")
    s = s.replace("-"," ")
    s = s.replace("//","/")
    s = s.replace("..",".")
    s = s.replace(" / "," ")
    s = s.replace(" \\ "," ")
    s = s.replace("."," . ")
    s = s.replace("   "," ")
    s = s.replace("  "," ").strip(" ")
    s = re.sub(r"(.*)\.$", r"\1", s) #end period
    s = re.sub(r"(.*)\/$", r"\1", s) #end slash
    s = re.sub(r"^\.(.*)", r"\1", s) #start period
    s = re.sub(r"^\/(.*)", r"\1", s) #start slash
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = s.replace(" x "," xbi ")
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
    s = s.replace("*"," xbi ")
    s = s.replace(" by "," xbi ")
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s) #re-join decimals split by the "." spacing above
    # Unit normalisation: attach a canonical unit suffix directly to the number.
    s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = s.replace(" v "," volts ")
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("  "," ")
    s = s.replace(" . "," ")
    s = (" ").join([z for z in s.split(" ") if z not in stop_words])
    s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s = s.lower()
    s = (" ").join([str(zspell[z]) if z in zspell else z for z in s.split(" ")])
    return s

In [None]:
# Clean the three free-text columns in place with the same normaliser.
for text_column in ('search_term', 'product_title', 'product_description'):
    df_all[text_column] = df_all[text_column].apply(text_process)

In [None]:
def text_stem(s):
    """
    Lightweight cleaner: strip punctuation, drop stopwords, stem, lower-case.

    Unlike ``text_process`` this applies no unit normalisation or spelling
    replacement.

    Parameters
    ----------
    s : object
        Text to clean. Anything that is not a ``str`` (e.g. NaN for a missing
        value) is mapped to the literal string "null".

    Returns
    -------
    str
        The cleaned, space-separated text, or "null" for non-string input.
    """
    if isinstance(s, str):
        # Load the stopword list once per call instead of once per token.
        stop_words = set(stopwords.words('english'))

        s = s.replace(","," ")
        s = s.replace("$"," ")
        s = s.replace("?"," ")
        s = s.replace("-"," ")
        s = s.replace("//","/")
        s = s.replace("..",".")
        s = s.replace(" / "," ")
        s = s.replace(" \\ "," ")
        s = s.replace("."," . ")
        s = s.replace("   "," ")
        s = s.replace("  "," ").strip(" ")
        # NOTE(review): stopword filtering runs before lower-casing, so tokens are
        # compared in their original case — confirm this is intended.
        s = (" ").join([z for z in s.split(" ") if z not in stop_words])
        s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
        s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
        s = s.lower()
        return s
    else:
        return "null"

In [None]:

# Clean the brand column; non-string entries become "null" inside text_stem.
df_all['brand'] = df_all['brand'].map(text_stem)

In [None]:
# Display the first rows of df_all (head() defaults to 5 rows).
df_all.head()

## Building features

In [None]:
# Whitespace-separated token counts for each text column, stored as int64.
for feature_name, text_column in (
    ('len_of_query', 'search_term'),
    ('len_of_title', 'product_title'),
    ('len_of_description', 'product_description'),
    ('len_of_brand', 'brand'),
):
    df_all[feature_name] = df_all[text_column].map(lambda text: len(text.split())).astype(np.int64)

In [None]:
def isFloat(string):
    """
    Report whether ``float(string)`` would succeed.

    Parameters
    ----------
    string : object
        Usually a token of text; any object is accepted.

    Returns
    -------
    bool
        True when the value converts to a float, False otherwise. Note that
        Python's float() also accepts spellings such as "nan" and "inf", so
        those tokens count as floats here.
    """
    try:
        float(string)
    except (TypeError, ValueError):
        # TypeError covers values float() cannot take at all (None, lists, ...),
        # which previously escaped as an exception instead of returning False.
        return False
    return True

def eval_f(expr):
    """
    Evaluate a string holding a whole number or a simple fraction "a/b".

    Parameters
    ----------
    expr : str
        Text such as "12" or "3/4".

    Returns
    -------
    float or int
        * ``float(a) / float(b)`` for a well-formed fraction, whatever the
          length of the numerator;
        * ``1000`` when the denominator is zero;
        * ``float(expr)`` when ``expr`` consists only of decimal digits;
        * ``1`` for anything else (empty string, text, malformed fraction such
          as "1/" or "1/2/3"), so a product using the value is left unchanged.
    """
    # Split on the first "/" wherever it occurs; testing for index 1 only
    # recognised single-character numerators ("10/4" was treated as text).
    numerator, slash, denominator = expr.partition("/")

    if slash:
        # isdecimal() (not isnumeric()) so that characters like "½" or "²",
        # which float() cannot parse, are rejected here instead of raising.
        if numerator.isdecimal() and denominator.isdecimal():
            if float(denominator) != 0:
                return float(numerator) / float(denominator)
            return 1000
        # Malformed fraction: use the same neutral fallback as non-numeric text
        # rather than falling off the end of the function and returning None.
        return 1

    if expr.isdecimal():
        return float(expr)
    return 1

def multiply_search(exp):
    """
    Replace each "xbi" token with the product of its two neighbouring numbers.

    For every "xbi" that has a token on both sides, lowercase letters and
    periods are stripped from the neighbours, each is evaluated with
    ``eval_f``, and the "xbi" token is overwritten with the product. The
    neighbouring tokens themselves are kept.

    Parameters
    ----------
    exp : str
        Space-separated text, typically the output of ``text_process``.

    Returns
    -------
    str or int
        The rewritten text, or the integer ``0`` when the whole input already
        parses as a float.
        NOTE(review): returning 0 discards purely numeric inputs (e.g. "10");
        confirm whether returning ``exp`` unchanged was intended.
    """
    if isFloat(exp):
        return 0
    a = exp.split()

    last = len(a) - 1
    for i, token in enumerate(a):
        if token != 'xbi':
            continue
        if i == 0 or i == last:
            # No operand on one side. Skipping i == 0 stops a[i-1] from wrapping
            # around to the last token; a trailing "xbi" was already left as-is.
            continue

        # Stripping "." also removes decimal points, so "3.5in." is read as 35.
        try:
            prod = eval_f(re.sub("[a-z.]", "",a[i-1])) * eval_f(re.sub("[a-z.]", "",a[i+1]))
        except (TypeError, ValueError):
            # An operand could not be evaluated: write 0 rather than reusing the
            # product left over from an earlier "xbi" in the same string.
            prod = 0

        a[i] = str(prod)

    return (" ".join(a))

def to_float(exp):
    """
    Keep only the float-like tokens of ``exp``, each rendered via ``str(float(token))``.

    Returns the tokens joined by single spaces, or None when ``exp`` is not a string.
    """
    if not isinstance(exp, str):
        return None
    numeric_tokens = []
    for token in exp.split(" "):
        if isFloat(token):
            numeric_tokens.append(str(float(token)))
    return " ".join(numeric_tokens)
def to_str(exp):
    """
    Keep only the tokens of ``exp`` that do not parse as a float.

    Returns the tokens joined by single spaces, or None when ``exp`` is not a string.
    """
    if not isinstance(exp, str):
        return None
    text_tokens = []
    for token in exp.split(" "):
        if not isFloat(token):
            text_tokens.append(token)
    return " ".join(text_tokens)

In [None]:
# Resolve "xbi" products in the query, then split it into numeric and non-numeric tokens.
df_all['search_term'] = df_all['search_term'].apply(multiply_search)
for derived_column, extractor in (('search_term_num', to_float), ('search_term_str', to_str)):
    df_all[derived_column] = df_all['search_term'].apply(extractor)



In [None]:
# Resolve "xbi" products in the description, then split it into numeric and non-numeric tokens.
df_all['product_description'] = df_all['product_description'].apply(multiply_search)
for derived_column, extractor in (
    ('product_description_num', to_float),
    ('product_description_str', to_str),
):
    df_all[derived_column] = df_all['product_description'].apply(extractor)

In [None]:
def prod_title_score(description, search_term):
    """
    Count how many words of ``search_term`` occur as whole tokens in ``description``.

    Parameters
    ----------
    description : object
        Text to search in; it is converted with ``str()`` and split on whitespace.
    search_term : object
        The query. Anything that is not a ``str`` scores 0.

    Returns
    -------
    int
        Number of query words (repeats counted each time) found among the
        description's tokens; 0 for an empty or non-string query.
    """
    if not isinstance(search_term, str):
        return 0

    # The description tokens were previously built but never consulted, so the
    # score was just the number of query words. A set makes each lookup O(1).
    description_tokens = set(str(description).split())
    return sum(1 for word in search_term.split() if word in description_tokens)

In [None]:
# Row-wise score of each query against its product description.
df_all['prod_desc_score'] = df_all.apply(
    lambda row: prod_title_score(row['product_description'], row['search_term']),
    axis=1,
)

In [None]:
# Display the first 5 rows of df_train.
# NOTE(review): the features built above are stored on df_all, not df_train —
# confirm this is the frame that was meant to be previewed.
df_train.head(5)