In [1]:
# In this notebook we preprocess the train.csv and finally we save the preprocessed file as df_train_prep.pkl

In [18]:
import os

import pandas as pd

# Show full, untruncated cell contents and up to 500 columns when a DataFrame
# is displayed. `None` means "no width limit"; the former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

In [19]:
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import *

In [21]:
# Load the training set from CSV, decoding the file as latin-1.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# resolves on a machine with this exact directory layout — consider a
# configurable data directory.
df_train = pd.read_csv(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\train.csv', encoding='latin-1')
# The test set is not loaded in this notebook; the line below is left disabled.
#df_test = pd.read_csv('data/test.csv', encoding='latin-1')

## - Preprocess Trainset

In [22]:
# Preview the first rows of the raw training data.
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),shower only faucet,2.67


In [23]:
# (rows, columns) of the raw training data.
df_train.shape

(74067, 5)

#### -- Column product_title - PT

In [24]:
# Store a lower-cased copy of every product title in a new column.
df_train['PT_lower'] = df_train['product_title'].apply(str.lower)

In [25]:
# Tokenize each lower-cased title into a new column. The pattern \w+ keeps only
# runs of word characters, so punctuation is dropped and terms such as
# '12-gauge' are split into '12' and 'gauge'.
tokenizer = RegexpTokenizer(r'\w+')
df_train['PT_tokens'] = df_train['PT_lower'].apply(tokenizer.tokenize)

In [26]:
# Drop every title token that appears in NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))
df_train['PT_tokens_sw'] = df_train['PT_tokens'].apply(
    lambda title_tokens: [tok for tok in title_tokens if tok not in stop_words]
)

In [27]:
# Re-join the stop-word-free tokens into one space-separated string per title.
df_train['PT_text'] = df_train['PT_tokens_sw'].apply(' '.join)

In [28]:
# Porter-stem every remaining (stop-word-free) title token.
stemmer = PorterStemmer()
df_train['PT_stem'] = df_train['PT_tokens_sw'].apply(
    lambda title_tokens: list(map(stemmer.stem, title_tokens))
)

In [29]:
# Helper shared by the product_title and search_term sections: picks out the
# terms of a string that contain a digit.
def hasNumbers(text):
    """Return the terms of ``text`` that contain at least one digit.

    ``text`` is split on single space characters and the matching terms are
    returned as a list, in their original order.
    """
    return [word for word in text.split(' ') if any(ch.isdigit() for ch in word)]

# For each lower-cased product title, list the terms that contain a digit.
df_train['PT_numerics'] = df_train['PT_lower'].apply(hasNumbers)

In [30]:
# Helper shared by the product_title and search_term sections: picks out the
# terms of a string that contain no digits.
def has_not_Numbers(text):
    """Return the non-empty terms of ``text`` that contain no digit.

    ``text`` is split on single space characters and the matching terms are
    returned as a list, in their original order. Empty strings produced by
    leading, trailing or repeated spaces (or by an empty input) are skipped,
    so they never show up as spurious "terms" in the result.
    """
    return [
        term
        for term in text.split(' ')
        # `term` is falsy for the empty fragments that split(' ') can yield
        if term and not any(char.isdigit() for char in term)
    ]

# For each lower-cased product title, list the terms that contain no digit.
df_train['PT_Non_numerics'] = df_train['PT_lower'].apply(has_not_Numbers)

#### -- Column search_term - ST

In [31]:
# Store a lower-cased copy of every search term in a new column.
df_train['ST_lower'] = df_train['search_term'].apply(str.lower)

In [32]:
# Tokenize each lower-cased search term into a new column. The pattern \w+
# keeps only runs of word characters, so punctuation is dropped and a query
# such as '3/4' is split into '3' and '4'.
tokenizer = RegexpTokenizer(r'\w+')
df_train['ST_tokens'] = df_train['ST_lower'].apply(tokenizer.tokenize)

In [33]:
# Drop every search-term token that appears in NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))
df_train['ST_tokens_sw'] = df_train['ST_tokens'].apply(
    lambda query_tokens: [tok for tok in query_tokens if tok not in stop_words]
)

In [34]:
# Re-join the stop-word-free tokens into one space-separated string per query.
df_train['ST_text'] = df_train['ST_tokens_sw'].apply(' '.join)

In [35]:
# Porter-stem every remaining (stop-word-free) search-term token.
stemmer = PorterStemmer()
df_train['ST_stem'] = df_train['ST_tokens_sw'].apply(
    lambda query_tokens: list(map(stemmer.stem, query_tokens))
)

In [36]:
# For each lower-cased search term (the 'ST_lower' column), list the terms
# that contain a digit.
df_train['ST_numerics'] = df_train['ST_lower'].apply(hasNumbers)

In [37]:
# For each lower-cased search term, list the terms that contain no digit.
df_train['ST_Non_numerics'] = df_train['ST_lower'].apply(has_not_Numbers)

In [52]:
# Inspect the first 20 rows, now including every derived PT_* and ST_* column.
df_train.head(20)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,PT_lower,PT_tokens,PT_tokens_sw,PT_text,PT_stem,PT_numerics,PT_Non_numerics,ST_lower,ST_tokens,ST_tokens_sw,ST_text,ST_stem,ST_numerics,ST_Non_numerics
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",angle bracket,"[angle, bracket]","[angle, bracket]",angle bracket,"[angl, bracket]",[],"[angle, bracket]"
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",l bracket,"[l, bracket]","[l, bracket]",l bracket,"[l, bracket]",[],"[l, bracket]"
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating,deck over,3.0,behr premium textured deckover 1-gal. #sc-141 tugboat wood and concrete coating,"[behr, premium, textured, deckover, 1, gal, sc, 141, tugboat, wood, and, concrete, coating]","[behr, premium, textured, deckover, 1, gal, sc, 141, tugboat, wood, concrete, coating]",behr premium textured deckover 1 gal sc 141 tugboat wood concrete coating,"[behr, premium, textur, deckov, 1, gal, sc, 141, tugboat, wood, concret, coat]","[1-gal., #sc-141]","[behr, premium, textured, deckover, tugboat, wood, and, concrete, coating]",deck over,"[deck, over]",[deck],deck,[deck],[],"[deck, over]"
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),rain shower head,2.33,delta vero 1-handle shower only faucet trim kit in chrome (valve not included),"[delta, vero, 1, handle, shower, only, faucet, trim, kit, in, chrome, valve, not, included]","[delta, vero, 1, handle, shower, faucet, trim, kit, chrome, valve, included]",delta vero 1 handle shower faucet trim kit chrome valve included,"[delta, vero, 1, handl, shower, faucet, trim, kit, chrome, valv, includ]",[1-handle],"[delta, vero, shower, only, faucet, trim, kit, in, chrome, (valve, not, included)]",rain shower head,"[rain, shower, head]","[rain, shower, head]",rain shower head,"[rain, shower, head]",[],"[rain, shower, head]"
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),shower only faucet,2.67,delta vero 1-handle shower only faucet trim kit in chrome (valve not included),"[delta, vero, 1, handle, shower, only, faucet, trim, kit, in, chrome, valve, not, included]","[delta, vero, 1, handle, shower, faucet, trim, kit, chrome, valve, included]",delta vero 1 handle shower faucet trim kit chrome valve included,"[delta, vero, 1, handl, shower, faucet, trim, kit, chrome, valv, includ]",[1-handle],"[delta, vero, shower, only, faucet, trim, kit, in, chrome, (valve, not, included)]",shower only faucet,"[shower, only, faucet]","[shower, faucet]",shower faucet,"[shower, faucet]",[],"[shower, only, faucet]"
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convection Microwave in Stainless Steel with Sensor Cooking,convection otr,3.0,whirlpool 1.9 cu. ft. over the range convection microwave in stainless steel with sensor cooking,"[whirlpool, 1, 9, cu, ft, over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]","[whirlpool, 1, 9, cu, ft, range, convection, microwave, stainless, steel, sensor, cooking]",whirlpool 1 9 cu ft range convection microwave stainless steel sensor cooking,"[whirlpool, 1, 9, cu, ft, rang, convect, microwav, stainless, steel, sensor, cook]",[1.9],"[whirlpool, cu., ft., over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]",convection otr,"[convection, otr]","[convection, otr]",convection otr,"[convect, otr]",[],"[convection, otr]"
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convection Microwave in Stainless Steel with Sensor Cooking,microwave over stove,2.67,whirlpool 1.9 cu. ft. over the range convection microwave in stainless steel with sensor cooking,"[whirlpool, 1, 9, cu, ft, over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]","[whirlpool, 1, 9, cu, ft, range, convection, microwave, stainless, steel, sensor, cooking]",whirlpool 1 9 cu ft range convection microwave stainless steel sensor cooking,"[whirlpool, 1, 9, cu, ft, rang, convect, microwav, stainless, steel, sensor, cook]",[1.9],"[whirlpool, cu., ft., over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]",microwave over stove,"[microwave, over, stove]","[microwave, stove]",microwave stove,"[microwav, stove]",[],"[microwave, over, stove]"
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convection Microwave in Stainless Steel with Sensor Cooking,microwaves,3.0,whirlpool 1.9 cu. ft. over the range convection microwave in stainless steel with sensor cooking,"[whirlpool, 1, 9, cu, ft, over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]","[whirlpool, 1, 9, cu, ft, range, convection, microwave, stainless, steel, sensor, cooking]",whirlpool 1 9 cu ft range convection microwave stainless steel sensor cooking,"[whirlpool, 1, 9, cu, ft, rang, convect, microwav, stainless, steel, sensor, cook]",[1.9],"[whirlpool, cu., ft., over, the, range, convection, microwave, in, stainless, steel, with, sensor, cooking]",microwaves,[microwaves],[microwaves],microwaves,[microwav],[],[microwaves]
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Emergency Fixture Unit,emergency light,2.67,lithonia lighting quantum 2-light black led emergency fixture unit,"[lithonia, lighting, quantum, 2, light, black, led, emergency, fixture, unit]","[lithonia, lighting, quantum, 2, light, black, led, emergency, fixture, unit]",lithonia lighting quantum 2 light black led emergency fixture unit,"[lithonia, light, quantum, 2, light, black, led, emerg, fixtur, unit]",[2-light],"[lithonia, lighting, quantum, black, led, emergency, fixture, unit]",emergency light,"[emergency, light]","[emergency, light]",emergency light,"[emerg, light]",[],"[emergency, light]"
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Fluted Casing,mdf 3/4,3.0,house of fara 3/4 in. x 3 in. x 8 ft. mdf fluted casing,"[house, of, fara, 3, 4, in, x, 3, in, x, 8, ft, mdf, fluted, casing]","[house, fara, 3, 4, x, 3, x, 8, ft, mdf, fluted, casing]",house fara 3 4 x 3 x 8 ft mdf fluted casing,"[hous, fara, 3, 4, x, 3, x, 8, ft, mdf, flute, case]","[3/4, 3, 8]","[house, of, fara, in., x, in., x, ft., mdf, fluted, casing]",mdf 3/4,"[mdf, 3, 4]","[mdf, 3, 4]",mdf 3 4,"[mdf, 3, 4]",[3/4],[mdf]


In [40]:
# Persist the preprocessed training frame as a pickle.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory.
output_path = r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed\df_train_prep.pkl'
# to_pickle does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_train.to_pickle(output_path)