# Data Preparation

## Imports

In [24]:
import pandas as pd
import math
from decimal import Decimal
import numpy as np
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder

## Read CSV

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


In [4]:
test.head()

Unnamed: 0,id,query,product_title,product_description
0,3,electric griddle,Star-Max 48 in Electric Griddle,
1,6,phillips coffee maker,Philips SENSEO HD7810 WHITE Single Serve Pod C...,
2,9,san francisco 49ers,2013 San Francisco 49ers Clock,A 2013 San Francisco 49ers clock is the ultima...
3,11,aveeno shampoo,AVEENO 10.5FLOZ NRSH SHINE SH,"Water, Ammonium Lauryl Sulfate, Dimethicone, S..."
4,12,flea and tick control for dogs,Merial Frontline Plus Flea and Tick Control fo...,


In [5]:
len(train)

10158

In [6]:
train = train.drop(
    ['relevance_variance'
    ], axis=1)

In [7]:
test['median_relevance'] = np.nan

In [8]:
data = pd.concat([train, test])

In [9]:
data['product_description'] = data['product_description'].apply(lambda x: re.sub('\s+', ' ', str(x)))
data['product_title'] = data['product_title'].apply(lambda x: re.sub('\s+', ' ', str(x)))

In [10]:
data['product_description'] = data['product_description'].apply(lambda x: re.sub('<[^<]+?>', ' ', x))
data['product_title'] = data['product_title'].apply(lambda x: re.sub('<[^<]+?>', ' ', x))

In [11]:
data['product_description'] = data['product_description'].str.lower()
data['product_title'] = data['product_title'].str.lower()

In [21]:
data.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,query_in_title
0,1,bridal shower decor,accent pillow with heart design - red/black,red satin accent pillow embroidered with a hea...,1.0,False
1,2,led christmas light,set of 10 battery operated multi led train chr...,set of 10 battery operated train christmas lig...,4.0,False
2,4,projector,viewsonic pro8200 dlp multimedia projector,,4.0,False
3,5,wine rack,concept housewares wr-44526 solid-wood ceiling...,"like a silent and sturdy tree, the southern en...",4.0,False
4,7,light bulb,wintergreen lighting christmas led light bulb ...,"wtgr1011 features nickel base, 60,000 average ...",2.0,False


In [14]:
ps = PorterStemmer()
stem = []
for word in data['product_title']:
    stem.append(ps.stem(word))
data['product_title'] = stem

In [15]:
ps = PorterStemmer()
stem = []
for word in data['product_description']:
    stem.append(ps.stem(word))
data['product_description'] = stem

In [16]:
ps = PorterStemmer()
stem = []
for word in data['query']:
    stem.append(ps.stem(word))
data['query'] = stem

In [None]:
#data.apply(lambda r: r['product_title'.contains(r['query'].split(' ')), axis=1)

In [18]:
data['query_in_title'] = data['query'].isin(data.product_title)

In [22]:
(data['query_in_title']==True).sum()

1895

In [23]:
(data['query_in_title']==False).sum()

30776

# Encode

In [26]:
le = LabelEncoder()
toEncode = data[['query', 'product_title', 'product_description']]
encoded = toEncode.apply(le.fit_transform)

In [28]:
encoded.head()

Unnamed: 0,query,product_title,product_description
0,30,975,12875
1,121,23002,13442
2,170,27237,11025
3,249,5364,10063
4,127,28099,21698


In [31]:
data['query'] = encoded['query']
data['product_title'] = encoded['product_title']
data['product_description'] = encoded['product_description']

In [32]:
data.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,query_in_title
0,1,30,975,12875,1.0,False
1,2,121,23002,13442,4.0,False
2,4,170,27237,11025,4.0,False
3,5,249,5364,10063,4.0,False
4,7,127,28099,21698,2.0,False


# Train Test Split of the data set 

In [33]:
train = data[:10158]
test = data [10159:]

In [34]:
train.tail()

Unnamed: 0,id,query,product_title,product_description,median_relevance,query_in_title
10153,32655,160,20188,21460,4.0,False
10154,32659,211,15304,11025,2.0,False
10155,32663,162,16430,2129,2.0,False
10156,32666,79,12612,12300,1.0,False
10157,32668,163,20296,11025,4.0,True


In [35]:
test.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,query_in_title
1,6,158,19858,11025,,False
2,9,191,407,975,,False
3,11,11,2188,21046,,False
4,12,78,16061,11025,,False
5,14,219,5087,21031,,False


In [None]:
## Scale Features
#from sklearn import preprocessing
#train_columns = train_clean.copy()
#scaler = preprocessing.StandardScaler()
#train_clean = scaler.fit_transform(train_clean)
#train_clean = pd.DataFrame(train_clean, columns=train_columns.columns)

#test_columns = test_clean.copy()
#test_clean = scaler.transform(test_clean)
#test_clean = pd.DataFrame(test_clean, columns=test_columns.columns)

## Save data set for further use

In [None]:
train.to_csv('data/data_clean_save_ML_train_clean.csv', index=False, header=True)
test.to_csv('data/data_clean_save_ML_test_clean.csv', index=False, header=True)