# Features

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import graphlab as gl
import sklearn as sk
import time
from util import *
start_time = time.time()

In [24]:
# def convert_data_to_utf8(files):
#     files = ['./data/'+file for file in files]
#     for file in files:
#         df = pd.read_csv(file, encoding='ISO-8859-1')
#         df.to_csv(file[:-4]+'_utf8.csv', encoding='utf-8')
        
# convert_data_to_utf8(['train.csv', 'attributes.csv', 'product_descriptions.csv', 'test.csv'])        

## Load Files

In [3]:
df_train = gl.SFrame.read_csv('./data/train_utf8.csv')
df_test = gl.SFrame.read_csv('./data/test_utf8.csv')
df_attr = gl.SFrame.read_csv('./data/attributes_utf8.csv', column_type_hints=[long,long,str,str])
df_desp = gl.SFrame.read_csv('./data/product_descriptions_utf8.csv')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[long,long,long,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[long,long,long,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[long,long,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
num_train = len(df_train)
df_test['relevance'] = 0.0
df = df_train.append(df_test)
df_brand = df_attr[df_attr['name'] == 'MFG Brand Name'][['product_uid', 'value']].rename({'value': 'brand'})
df = df.join(df_desp, on='product_uid', how='left')
df = df.join(df_brand, on='product_uid', how='left')
# print('--- Files Loaded: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

## Fix Typo

In [5]:
df['search_term'] = df['search_term'].apply(correct_typo)
# print("--- Typo Fixed: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

## Stemming

In [6]:
df['search_term'] = df['search_term'].apply(lambda x: stem_str(x))
df['product_title'] = df['product_title'].apply(lambda x: stem_str(x))
df['product_description'] = df['product_description'].apply(lambda x: stem_str(x))
df['brand'] = df['brand'].apply(lambda x: stem_str(x))
# print('--- Stemming: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

## Length of Texts

In [10]:
df['len_search_term'] = df['search_term'].apply(lambda x: len(x.split()))
df['len_product_title'] = df['product_title'].apply(lambda x: len(x.split()))
df['len_product_description'] = df['product_description'].apply(lambda x: len(x.split()))
df['len_brand'] = df['brand'].apply(lambda x: len(x.split()))
# print('--- Length of Texts: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

## Text Matching

In [None]:
df['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']
# print('--- Prod Info: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

df['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))
# print('--- Query In: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

df['query_last_word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0].split(" ")[-1],x.split('\t')[1]))
df['query_last_word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0].split(" ")[-1],x.split('\t')[2]))
# print(''--- Query Last Word In: %s minutes ---'' % round(((time.time() - start_time) / 60), 2))

## TF-IDF

In [11]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df = 0, stop_words = 'english', encoding='utf-8', decode_error='ignore')
# tfidf_mat_product_title = tfidf.fit_transform(df['product_title'])

In [7]:
df['tfidf_product_title'] = gl.text_analytics.tf_idf(df['product_title'])

In [8]:
df['top20_tfidf_product_title'] = df['tfidf_product_title'].apply(lambda x: zip(*sorted([(v, k) for (k, v) in x.iteritems()], reverse=True)[:20])[1])

In [23]:
df['tfidf_product_description'] = gl.text_analytics.tf_idf(df['product_description'])

In [24]:
df['top20_tfidf_product_description'] = df['tfidf_product_description'].apply(lambda x: zip(*sorted([(v, k) for (k, v) in x.iteritems()], reverse=True)[:20])[1])

## Text Similarities

In [9]:
def sum_value(array):
    return sum(filter(None, array))

def max_value(array):
    try:
        return max(filter(None, array))
    except:
        return 0

def min_value(array):
    try:
        return min(filter(None, array))
    except:
        return 0

### Levenshtein Distance

Lev-Dist Vector -> min/max/sum  
Keep L-D Vector or not?

In [10]:
import Levenshtein as lv
def lev_dist(text1, text2):
    return [lv.distance(x, y) for x in text1 for y in text2]

In [11]:
df['lev_dist_to_product_title_list'] = df.apply(lambda x: lev_dist(x['search_term'].split(), x['top20_tfidf_product_title']))

In [12]:
df['lev_dist_to_product_title_min'] = df['lev_dist_to_product_title_list'].apply(lambda x: min_value(x))
df['lev_dist_to_product_title_max'] = df['lev_dist_to_product_title_list'].apply(lambda x: max_value(x))
df['lev_dist_to_product_title_sum'] = df['lev_dist_to_product_title_list'].apply(lambda x: sum_value(x))

In [25]:
df['lev_dist_to_product_description_list'] = df.apply(lambda x: lev_dist(x['search_term'].split(), x['top20_tfidf_product_description']))

In [26]:
df['lev_dist_to_product_description_min'] = df['lev_dist_to_product_description_list'].apply(lambda x: min_value(x))
df['lev_dist_to_product_description_max'] = df['lev_dist_to_product_description_list'].apply(lambda x: max_value(x))
df['lev_dist_to_product_description_sum'] = df['lev_dist_to_product_description_list'].apply(lambda x: sum_value(x))

### Word2Vec

## Filling NAs

In [48]:
def find_columns_with_na(df):
    ret = []
    for col in df.column_names():
        print 'checking ', col
        num_na = df[col].sketch_summary().num_undefined()
        if num_na > 0:
            ret.append(col)
            print col, 'has missing values!'

na_cols = find_columns_with_na(df)

checking  X1
checking  id
checking  product_uid
checking  product_title
checking  search_term
checking  relevance
checking  product_description
checking  brand
brand
checking  len_search_term
checking  len_product_title
checking  len_product_description
checking  len_brand
checking  lev_dist_to_product_title_min
checking  lev_dist_to_product_title_max
checking  lev_dist_to_product_title_sum


In [12]:
df = df.fillna('brand', 0)

## Export

In [14]:
for col in columns_to_remove:
    try:
        df.remove_column(col)
    except:
        pass

In [13]:
to_append = df[['lev_dist_to_product_title_min', 'lev_dist_to_product_title_max', 'lev_dist_to_product_title_sum']]

In [27]:
original = pd.read_csv('./df_final_lev_dist.csv', encoding='ISO-8859-1', index_col=0)

In [29]:
original['lev_dist_to_product_description_min'] = df['lev_dist_to_product_description_min']

In [30]:
original['lev_dist_to_product_description_max'] = df['lev_dist_to_product_description_max']

In [33]:
original['lev_dist_to_product_description_sum'] = df['lev_dist_to_product_description_sum']

In [16]:
original['lev_dist_to_product_title_min'] = df['lev_dist_to_product_title_min']

In [18]:
original['lev_dist_to_product_title_max'] = df['lev_dist_to_product_title_max']

In [19]:
original['lev_dist_to_product_title_sum'] = df['lev_dist_to_product_title_sum']

In [34]:
original

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand,product_info,len_of_query,len_of_title,...,word_in_brand,ratio_brand,brand_feature,search_term_feature,lev_dist_to_product_title_min,lev_dist_to_product_title_max,lev_dist_to_product_title_sum,lev_dist_to_product_description_min,lev_dist_to_product_description_max,lev_dist_to_product_description_sum
0,2,simpson strong tie 12 gaug angl,100001,3.00,angl bracket,not onli do angl make joint stronger they also...,simpson strong tie,angl bracket\tsimpson strong tie 12 gaug angl\...,2,6,...,0,0.000000,1000,12,3,7,63,3,16,321
1,3,simpson strong tie 12 gaug angl,100001,2.50,l bracket,not onli do angl make joint stronger they also...,simpson strong tie,l bracket\tsimpson strong tie 12 gaug angl\tno...,2,6,...,0,0.000000,1000,9,3,7,67,3,18,337
2,9,behr premium textur deckov 1gal. #sc 141 tugbo...,100002,3.00,deck over,behr premium textur deckov is an innov solid c...,behr premium textur deckov,deck over\tbehr premium textur deckov 1gal. #s...,2,12,...,1,0.250000,1003,9,2,7,120,2,18,337
3,16,delta vero 1 handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero singl ...,delta,rain shower head\tdelta vero 1 handl shower on...,3,14,...,0,0.000000,1006,16,2,9,197,3,14,428
4,17,delta vero 1 handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero singl ...,delta,shower onli faucet\tdelta vero 1 handl shower ...,3,14,...,0,0.000000,1006,18,3,8,206,3,14,440
5,18,whirlpool 1.9cu.ft. over the rang convect micr...,100006,3.00,convect otr,achiev delici result is almost effortless with...,whirlpool,convect otr\twhirlpool 1.9cu.ft. over the rang...,2,13,...,0,0.000000,1009,11,2,9,163,3,16,312
6,20,whirlpool 1.9cu.ft. over the rang convect micr...,100006,2.67,microwav over stove,achiev delici result is almost effortless with...,whirlpool,microwav over stove\twhirlpool 1.9cu.ft. over ...,3,13,...,0,0.000000,1009,19,3,9,262,3,16,477
7,21,whirlpool 1.9cu.ft. over the rang convect micr...,100006,3.00,microwav,achiev delici result is almost effortless with...,whirlpool,microwav\twhirlpool 1.9cu.ft. over the rang co...,1,13,...,0,0.000000,1009,8,6,9,113,6,15,170
8,23,lithonia light quantum 2 light black led emerg...,100007,2.67,emerg light,the quantum adjust 2 light led black emerg lig...,lithonia light,emerg light\tlithonia light quantum 2 light bl...,2,10,...,1,0.500000,1012,11,4,8,84,3,17,297
9,27,hous of fara 34in. xbi 3in. xbi 8ft. mdf flute...,100009,3.00,mdf 34,get the hous of fara 34in. xbi 3in. xbi 8ft. m...,hous of fara,mdf 34\thous of fara 34in. xbi 3in. xbi 8ft. m...,2,11,...,0,0.000000,1015,6,1,5,76,2,11,231


In [35]:
original.to_csv('df_final_lev_dist_more.csv', encoding='ISO-8859-1')