In [2]:
# In this notebook we create the features that will be used in the modeling phase
# So, we first read the files that we created in the preprocessing phase
# Finally we save 3 pickle files with features:
# 1) df_train_ptstkwat.pkl - it has several numeric features
# 2) df_similarities.pkl - it has features based on similarities (Levenshtein, Cosine, Jaccard, Jaro)
# 3) df_fuzzy.pkl - it has features based on this --> https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49

In [1]:
import pandas as pd
import stringdist

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Print non-truncated column contents and up to 500 columns when a DataFrame
# is displayed. `None` is the supported "no limit" value for max_colwidth;
# the legacy sentinel -1 is deprecated since pandas 1.0 and rejected by
# newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

In [2]:
# NOTE(review): the three paths below are absolute and machine-specific, so the
# notebook only runs as-is on the author's machine; consider a configurable
# data directory. Unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.

# read the preprocessed trainset
df_train = pd.read_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed\df_train_prep.pkl')
# read the file with the preprocessed descriptions and their keywords
df_descs = pd.read_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed\df_descr_kwd.pkl')
# read the file with the preprocessed attributes
df_attrs = pd.read_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed\df_attrs_prep.pkl')

In [3]:
# Attach the description columns and then the attribute columns to every
# train row, joining on the shared 'product_uid' key. Left joins keep all
# train rows even when a product has no description/attribute record.
df_train = (
    df_train
    .merge(df_descs, on='product_uid', how='left')
    .merge(df_attrs, on='product_uid', how='left')
)

## - Features from TRAINSET + Descriptions + Attributes

In [8]:
# preview the first two rows of the merged frame (train + descriptions + attributes)
df_train.head(2)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,PT_lower,PT_tokens,PT_tokens_sw,PT_text,PT_stem,PT_numerics,PT_Non_numerics,ST_lower,ST_tokens,ST_tokens_sw,ST_text,ST_stem,ST_numerics,ST_Non_numerics,PD_lower,PD_stem,PD_numerics,Keywords_Descr,Atrr_text,Atrr_stem
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",angle bracket,"[angle, bracket]","[angle, bracket]",angle bracket,"[angl, bracket]",[],"[angle, bracket]","not only do angles make joints stronger, they also provide more consistent, straight corners. simpson strong-tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. some can be bent (skewed) to match the project. for outdoor projects or those where moisture is present, use our zmax zinc-coated connectors, which provide extra resistance against corrosion (look for a ""z"" at the end of the model number).versatile connector for various 90 connections and home repair projectsstronger than angled nailing or screw fastening alonehelp ensure joints are consistently straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.made from 12-gauge steelgalvanized for extra corrosion resistanceinstall with 10d common nails or #9 x 1-1/2 in. 
strong-drive sd screws","[angl, make, joint, stronger, also, provid, consist, straight, corner, simpson, strong, tie, offer, wide, varieti, angl, variou, size, thick, handl, light, duti, job, project, structur, connect, need, bent, skew, match, project, outdoor, project, moistur, present, use, zmax, zinc, coat, connector, provid, extra, resist, corros, look, z, end, model, number, versatil, connector, variou, 90, connect, home, repair, projectsstrong, angl, nail, screw, fasten, alonehelp, ensur, joint, consist, straight, strongdimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steelgalvan, extra, corros, resistanceinstal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw]","[90, 3, 3, 1-1/2, 12-gauge, 10d, #9, 1-1/2]","[strongdimensions, alonehelp, projectsstronger, zmax, skewed, resistanceinstall, sd, bent, steelgalvanized, thicknesses]",versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. made from 12-gauge steel galvanized for extra corrosion resistance install with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws 12 galvanized steel simpson strong-tie 1 1.5 3 0.26 3,"[versatil, connector, variou, 90â, connect, home, repair, project, stronger, angl, nail, screw, fasten, alon, help, ensur, joint, consist, straight, strong, dimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steel, galvan, extra, corros, resist, instal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw, 12, galvan, steel, simpson, strong, tie, 1, 1, 5, 3, 0, 26, 3]"
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",l bracket,"[l, bracket]","[l, bracket]",l bracket,"[l, bracket]",[],"[l, bracket]","not only do angles make joints stronger, they also provide more consistent, straight corners. simpson strong-tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. some can be bent (skewed) to match the project. for outdoor projects or those where moisture is present, use our zmax zinc-coated connectors, which provide extra resistance against corrosion (look for a ""z"" at the end of the model number).versatile connector for various 90 connections and home repair projectsstronger than angled nailing or screw fastening alonehelp ensure joints are consistently straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.made from 12-gauge steelgalvanized for extra corrosion resistanceinstall with 10d common nails or #9 x 1-1/2 in. 
strong-drive sd screws","[angl, make, joint, stronger, also, provid, consist, straight, corner, simpson, strong, tie, offer, wide, varieti, angl, variou, size, thick, handl, light, duti, job, project, structur, connect, need, bent, skew, match, project, outdoor, project, moistur, present, use, zmax, zinc, coat, connector, provid, extra, resist, corros, look, z, end, model, number, versatil, connector, variou, 90, connect, home, repair, projectsstrong, angl, nail, screw, fasten, alonehelp, ensur, joint, consist, straight, strongdimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steelgalvan, extra, corros, resistanceinstal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw]","[90, 3, 3, 1-1/2, 12-gauge, 10d, #9, 1-1/2]","[strongdimensions, alonehelp, projectsstronger, zmax, skewed, resistanceinstall, sd, bent, steelgalvanized, thicknesses]",versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. made from 12-gauge steel galvanized for extra corrosion resistance install with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws 12 galvanized steel simpson strong-tie 1 1.5 3 0.26 3,"[versatil, connector, variou, 90â, connect, home, repair, project, stronger, angl, nail, screw, fasten, alon, help, ensur, joint, consist, straight, strong, dimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steel, galvan, extra, corros, resist, instal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw, 12, galvan, steel, simpson, strong, tie, 1, 1, 5, 3, 0, 26, 3]"


### -- General counts about numerics and non numerics in Product title and Search term

In [5]:
# Size of each token list: numeric / non-numeric terms of the product title
# (PT) and of the search term (ST).
for count_col, tokens_col in (
    ('N_numerics_PT', 'PT_numerics'),
    ('N_numerics_ST', 'ST_numerics'),
    ('N_non_numerics_PT', 'PT_Non_numerics'),
    ('N_non_numerics_ST', 'ST_Non_numerics'),
):
    df_train[count_col] = df_train[tokens_col].apply(len)

In [5]:
# DONT INCLUDE IT

# number of common words between 'product_title' & 'search_term'
# df_train['N_common_words'] = df_train.apply(lambda x: 
#                             len(list(set(x['product_title_tokens']).intersection(x['search_term_tokens']))), axis=1)

### -- Common terms between Search term & Product title

In [6]:
%%time

def common_words_leven(tokens_1, tokens_2, threshold=0.85):
    """Return the tokens of ``tokens_2`` that fuzzily match a token of ``tokens_1``.

    Two tokens match when their normalised Levenshtein similarity
    (``1 - stringdist.levenshtein_norm(token1, token2)``) is strictly greater
    than ``threshold``.

    Parameters
    ----------
    tokens_1, tokens_2 : iterable of str
        Token collections to compare; each one is de-duplicated first.
    threshold : float, default 0.85
        Minimum similarity (exclusive) for two tokens to count as a match.

    Returns
    -------
    list of str
        One entry (the ``tokens_2`` token) per matching pair, so a token of
        ``tokens_2`` is repeated when it matches several distinct tokens of
        ``tokens_1``.
    """
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, which
    # makes the order of the returned list reproducible between runs.
    unique_1 = list(dict.fromkeys(tokens_1))
    unique_2 = list(dict.fromkeys(tokens_2))

    common_terms = []
    for token1 in unique_1:
        for token2 in unique_2:
            if 1 - stringdist.levenshtein_norm(token1, token2) > threshold:
                common_terms.append(token2)
    return common_terms

# fuzzy (Levenshtein-based) matches between the non-numeric terms of the
# product title and those of the search term, kept as a list per row
df_train['Common_words_leven'] = df_train.apply(
    lambda row: common_words_leven(row['PT_Non_numerics'], row['ST_Non_numerics']),
    axis=1,
)

# number of matches found for each row
df_train['N_common_words_leven'] = df_train['Common_words_leven'].apply(len)

Wall time: 1min 46s


In [7]:
# Jaccard similarity based on the above common words (Levenshtein matches)
def get_jaccard_sim(words1, words2, common):
    """Jaccard-style similarity of two word lists given their common words.

    Computes ``|common| / (|words1| + |words2| - |common|)`` on the
    de-duplicated inputs. ``common`` is supplied by the caller rather than
    computed as an exact intersection.

    Parameters
    ----------
    words1, words2 : iterable of hashable
        The two word collections being compared.
    common : iterable of hashable
        Words regarded as shared between the two collections.

    Returns
    -------
    float
        The similarity, or 0.0 when the denominator is zero (e.g. both word
        lists are empty) instead of raising ZeroDivisionError.
    """
    n_common = len(set(common))
    denominator = len(set(words1)) + len(set(words2)) - n_common
    if denominator == 0:
        # nothing to compare: define the similarity as 0
        return 0.0
    return float(n_common) / denominator

# Jaccard similarity of the non-numeric title / search terms, using the fuzzy
# matches stored in 'Common_words_leven' as the shared words
df_train['JC_sim'] = df_train.apply(
    lambda row: get_jaccard_sim(
        row['PT_Non_numerics'],
        row['ST_Non_numerics'],
        row['Common_words_leven'],
    ),
    axis=1,
)

In [8]:
# DONT INCLUDE IT

# number of common stemmed words between 'product_title_stem' & 'search_term_stem'
# df_train['N_common_words_stem'] = df_train.apply(lambda x: 
#                             len(list(set(x['product_title_stem']).intersection(x['search_term_stem']))), axis=1)

### -- Non-numeric terms of the Search term that are substrings of the Product title | Description | Attributes

In [12]:
# non-numeric terms of the search_term that appear in the Product title | Description | Attributes

def n_substrings(tokens, text):
    """Return the tokens that occur in ``text`` (``token in text``).

    Despite the name, the function returns the matching tokens themselves;
    callers take ``len()`` of the result to get the count.

    Parameters
    ----------
    tokens : iterable
        Candidate tokens, checked in order.
    text : str or other
        Text to search. When it does not support the ``in`` test (e.g. a
        float such as NaN) no token matches.

    Returns
    -------
    list
        Tokens found in ``text``, in input order, duplicates preserved.
    """
    substrings = []
    for token in tokens:
        try:
            if token in text:
                substrings.append(token)
        except TypeError:
            # `text` is not searchable (e.g. a float) or the token type is
            # incompatible with it: treat as "not found". Only TypeError is
            # handled so interrupts and real bugs are not swallowed.
            continue
    return substrings

# For each product text field, keep the non-numeric search-term tokens that
# occur in it as substrings, plus how many of them there are.
for list_col, count_col, text_col in (
    ('Substrs_PT_x', 'N_substrs_PT_x', 'PT_lower'),      # product title
    ('Substrs_PD_x', 'N_substrs_PD_x', 'PD_lower'),      # product description
    ('Substrs_Atr_x', 'N_substrs_Atr_x', 'Atrr_text'),   # product attributes
):
    df_train[list_col] = df_train.apply(
        lambda row: n_substrings(row['ST_Non_numerics'], row[text_col]),
        axis=1,
    )
    df_train[count_col] = df_train[list_col].apply(len)

In [44]:
def perc_xxx(tokens1, tokens2, tokens3, tokens4):
    """Ratio of the distinct tokens in three lists to the length of ``tokens4``.

    Parameters
    ----------
    tokens1, tokens2, tokens3 : list
        Token lists whose union (duplicates removed) forms the numerator.
    tokens4 : sized
        Reference token list; its length is the denominator.

    Returns
    -------
    float
        ``len(set(tokens1 + tokens2 + tokens3)) / len(tokens4)``, or 0.0 when
        ``tokens4`` is empty or an argument is not a list (e.g. NaN).

    Notes
    -----
    NOTE(review): the numerator is de-duplicated but ``tokens4`` is not, so
    repeated tokens in ``tokens4`` lower the ratio - confirm this is intended.
    """
    try:
        distinct_tokens = set(tokens1 + tokens2 + tokens3)
        return len(distinct_tokens) / len(tokens4)
    except (TypeError, ZeroDivisionError):
        # ZeroDivisionError: empty tokens4; TypeError: a non-list argument.
        # Anything else (including interrupts) is deliberately not swallowed.
        return 0.0

# share of the non-numeric search-term tokens found as substrings in the
# product title, description or attributes (PT_lower / PD_lower / Atrr_text)
df_train['Perc_substrs_x'] = df_train.apply(
    lambda row: perc_xxx(
        row['Substrs_PT_x'],
        row['Substrs_PD_x'],
        row['Substrs_Atr_x'],
        row['ST_Non_numerics'],
    ),
    axis=1,
)

### -- Numeric terms of the Search term that are substrings of the Product title | Description | Attributes

In [14]:
# numeric terms of the search_term that appear in the Product title | Description | Attributes
def n_substrings(tokens, text):
    """Return the tokens that occur in ``text`` (``token in text``).

    Despite the name, the function returns the matching tokens themselves;
    callers take ``len()`` of the result to get the count.

    Parameters
    ----------
    tokens : iterable
        Candidate tokens, checked in order.
    text : str or other
        Text to search. When it does not support the ``in`` test (e.g. a
        float such as NaN) no token matches.

    Returns
    -------
    list
        Tokens found in ``text``, in input order, duplicates preserved.
    """
    substrings = []
    for token in tokens:
        try:
            if token in text:
                substrings.append(token)
        except TypeError:
            # `text` is not searchable (e.g. a float) or the token type is
            # incompatible with it: treat as "not found". Only TypeError is
            # handled so interrupts and real bugs are not swallowed.
            continue
    return substrings

# For each product text field, keep the numeric search-term tokens that occur
# in it as substrings, plus how many of them there are.
for list_col, count_col, text_col in (
    ('Substrs_PT_y', 'N_substrs_PT_y', 'PT_lower'),      # product title
    ('Substrs_PD_y', 'N_substrs_PD_y', 'PD_lower'),      # product description
    ('Substrs_Atr_y', 'N_substrs_Atr_y', 'Atrr_text'),   # product attributes
):
    df_train[list_col] = df_train.apply(
        lambda row: n_substrings(row['ST_numerics'], row[text_col]),
        axis=1,
    )
    df_train[count_col] = df_train[list_col].apply(len)

In [45]:
def perc_xxx(tokens1, tokens2, tokens3, tokens4):
    """Ratio of the distinct tokens in three lists to the length of ``tokens4``.

    Parameters
    ----------
    tokens1, tokens2, tokens3 : list
        Token lists whose union (duplicates removed) forms the numerator.
    tokens4 : sized
        Reference token list; its length is the denominator.

    Returns
    -------
    float
        ``len(set(tokens1 + tokens2 + tokens3)) / len(tokens4)``, or 0.0 when
        ``tokens4`` is empty or an argument is not a list (e.g. NaN).

    Notes
    -----
    NOTE(review): the numerator is de-duplicated but ``tokens4`` is not, so
    repeated tokens in ``tokens4`` lower the ratio - confirm this is intended.
    """
    try:
        distinct_tokens = set(tokens1 + tokens2 + tokens3)
        return len(distinct_tokens) / len(tokens4)
    except (TypeError, ZeroDivisionError):
        # ZeroDivisionError: empty tokens4; TypeError: a non-list argument.
        # Anything else (including interrupts) is deliberately not swallowed.
        return 0.0

# share of the numeric search-term tokens found as substrings in the product
# title, description or attributes (PT_lower / PD_lower / Atrr_text)
df_train['Perc_substrs_y'] = df_train.apply(
    lambda row: perc_xxx(
        row['Substrs_PT_y'],
        row['Substrs_PD_y'],
        row['Substrs_Atr_y'],
        row['ST_numerics'],
    ),
    axis=1,
)

### -- Levenshtein similarity between Search term & Product title

In [16]:
# Levenshtein similarity (1 - normalised distance) between the product title
# text 'PT_text' and the search term text 'ST_text'
df_train['Leven_sim_ST_PT'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(row['PT_text'], row['ST_text']),
    axis=1,
)

### -- Keywords of the Description that appear in the Search term (by Levenshtein similarity)

In [23]:
%%time

def common_words_leven(tokens_1, tokens_2, threshold=0.85):
    """Return the tokens of ``tokens_1`` that fuzzily match a token of ``tokens_2``.

    Two tokens match when their normalised Levenshtein similarity
    (``1 - stringdist.levenshtein_norm(token1, token2)``) is strictly greater
    than ``threshold``.

    Parameters
    ----------
    tokens_1, tokens_2 : iterable of str
        Token collections to compare; each one is de-duplicated first.
    threshold : float, default 0.85
        Minimum similarity (exclusive) for two tokens to count as a match.

    Returns
    -------
    list of str
        One entry (the ``tokens_1`` token) per matching pair, so a token of
        ``tokens_1`` is repeated when it matches several distinct tokens of
        ``tokens_2``.
    """
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, which
    # makes the order of the returned list reproducible between runs.
    unique_1 = list(dict.fromkeys(tokens_1))
    unique_2 = list(dict.fromkeys(tokens_2))

    common_terms = []
    for token1 in unique_1:
        for token2 in unique_2:
            if 1 - stringdist.levenshtein_norm(token1, token2) > threshold:
                common_terms.append(token1)
    return common_terms

# description keywords that fuzzily match (Levenshtein similarity) one of the
# non-numeric search-term tokens, kept as a list per row
df_train['Keywords_leven'] = df_train.apply(
    lambda row: common_words_leven(row['Keywords_Descr'], row['ST_Non_numerics']),
    axis=1,
)
# number of such keyword matches per row
df_train['N_keywords_leven'] = df_train['Keywords_leven'].apply(len)

Wall time: 1min 32s


In [30]:
# list every column currently on df_train
# NOTE(review): the saved output of this cell includes columns such as
# 'Substrs_PT' and 'Perc_substrs' that no visible cell creates - this looks
# like leftover kernel state; re-run from a fresh kernel to confirm.
df_train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'PT_lower', 'PT_tokens', 'PT_tokens_sw', 'PT_text', 'PT_stem',
       'PT_numerics', 'PT_Non_numerics', 'ST_lower', 'ST_tokens',
       'ST_tokens_sw', 'ST_text', 'ST_stem', 'ST_numerics', 'ST_Non_numerics',
       'PD_lower', 'PD_numerics', 'Keywords_Descr', 'Atrr_text',
       'N_numerics_PT', 'N_numerics_ST', 'N_non_numerics_PT',
       'N_non_numerics_ST', 'Common_words_leven', 'N_common_words_leven',
       'JC_sim', 'Substrs_PT', 'N_substrs_PT', 'Substrs_PD', 'N_substrs_PD',
       'Substrs_Atr', 'N_substrs_Atr', 'Substrs_PT_x', 'N_substrs_PT_x',
       'Substrs_PD_x', 'N_substrs_PD_x', 'Substrs_Atr_x', 'N_substrs_Atr_x',
       'Perc_substrs', 'Substrs_PT_y', 'N_substrs_PT_y', 'Substrs_PD_y',
       'N_substrs_PD_y', 'Substrs_Atr_y', 'N_substrs_Atr_y', 'Leven_sim_ST_PT',
       'Keywords_leven', 'N_keywords_leven', 'Perc_substrs_x',
       'Perc_substrs_y'],
      dtype='object')

In [46]:
# Columns of the first feature set: the product key, the count / ratio /
# similarity features built above, and the 'relevance' target.
numeric_feature_cols = [
    'product_uid',
    'N_numerics_PT', 'N_numerics_ST',
    'N_non_numerics_PT', 'N_non_numerics_ST',
    'N_common_words_leven', 'JC_sim',
    'N_substrs_PT_x', 'N_substrs_PD_x', 'N_substrs_Atr_x', 'Perc_substrs_x',
    'N_substrs_PT_y', 'N_substrs_PD_y', 'N_substrs_Atr_y', 'Perc_substrs_y',
    'Leven_sim_ST_PT', 'N_keywords_leven',
    'relevance',
]
df_train2 = df_train[numeric_feature_cols]

In [47]:
# preview the first rows of the numeric feature set
df_train2.head()

Unnamed: 0,product_uid,N_numerics_PT,N_numerics_ST,N_non_numerics_PT,N_non_numerics_ST,N_common_words_leven,JC_sim,N_substrs_PT_x,N_substrs_PD_x,N_substrs_Atr_x,Perc_substrs_x,N_substrs_PT_y,N_substrs_PD_y,N_substrs_Atr_y,Perc_substrs_y,Leven_sim_ST_PT,N_keywords_leven,relevance
0,100001,1,0,3,2,1,0.25,1,1,1,0.5,0,0,0,0.0,0.181818,0,3.0
1,100001,1,0,3,2,0,0.0,1,1,1,0.5,0,0,0,0.0,0.121212,0,2.5
2,100002,2,0,9,2,0,0.0,2,2,2,1.0,0,0,0,0.0,0.054795,0,3.0
3,100005,1,0,12,3,1,0.071429,1,1,2,0.666667,0,0,0,0.0,0.21875,0,2.33
4,100005,1,0,12,3,3,0.25,3,3,3,1.0,0,0,0,0.0,0.203125,0,2.67


In [51]:
# save the 1st dataframe with features (df_train2)
# NOTE(review): absolute, machine-specific output path - consider a configurable output directory.
df_train2.to_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_train_ptstkwat.pkl')

### -- Distances: Levenshtein, Cosine, Jaccard, Jaro

#### --- Search term vs Product Title

In [42]:
# Levenshtein similarity (1 - normalised distance) between 'PT_text' & 'ST_text'
df_train['Leven_sim_PT'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(row['PT_text'], row['ST_text']),
    axis=1,
)

In [43]:
# Jaccard similarity
# Note: it is applied to stemmed tokens, so common words that originally had
# different endings are still caught.
def get_jaccard_sim(words1, words2):
    """Return the Jaccard similarity between two collections of words.

    Parameters
    ----------
    words1, words2 : iterable of hashable
        Word collections; duplicates are ignored.

    Returns
    -------
    float
        ``|A & B| / |A | B|``. Returns 0.0 when both collections are empty or
        when an argument cannot be turned into a set (e.g. a NaN float or
        unhashable items).
    """
    try:
        a = set(words1)
        b = set(words2)
    except TypeError:
        # not iterable / not hashable: treat as "nothing in common"
        return 0.0

    union_size = len(a | b)
    if union_size == 0:
        # both empty: avoid dividing by zero
        return 0.0
    return float(len(a & b)) / union_size

# Jaccard similarity between the stemmed product title and stemmed search term
df_train['JC_sim_PT'] = df_train.apply(
    lambda row: get_jaccard_sim(row['PT_stem'], row['ST_stem']),
    axis=1,
)

In [44]:
def get_cosine_sim(*strs):
    """Cosine similarity between the first two of the given strings.

    The strings are turned into bag-of-words count vectors by ``get_vectors``
    and compared with ``cosine_similarity``.

    Parameters
    ----------
    *strs : str
        Texts to vectorise; only the similarity of ``strs[0]`` and
        ``strs[1]`` is returned.

    Returns
    -------
    float
        The cosine similarity, or 0 when it cannot be computed: ValueError
        from vectorising (scikit-learn reports an empty vocabulary this way)
        or IndexError when fewer than two strings were given.
    """
    try:
        return cosine_similarity(get_vectors(*strs))[0][1]
    except (ValueError, IndexError):
        # Only the expected "nothing to compare" failures map to 0; any other
        # error is a real problem and must surface instead of silently
        # zeroing the whole feature column.
        return 0


def get_vectors(*strs):
    """Return the word-count vectors of the given strings as a dense 2-D array.

    One row per input string, one column per vocabulary term learned from
    those same strings.

    NOTE(review): with default settings CountVectorizer is documented to keep
    only tokens of two or more characters, so one-letter terms do not
    contribute - confirm this is acceptable for the stemmed inputs.
    """
    text = list(strs)
    # Default options on purpose: the documents belong in fit/transform, not
    # in the constructor, whose first parameter is the `input` option.
    vectorizer = CountVectorizer()
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

# cosine similarity between the stemmed product title and the stemmed search
# term (stem lists are joined back into space-separated text first)
df_train['Cosine_sim_PT'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['PT_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

#### --- Search term vs Description

In [45]:
# Levenshtein similarity (1 - normalised distance) between the description
# keywords 'Keywords_Descr', joined into one text, and 'ST_text'
df_train['Leven_sim_PD'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(' '.join(row['Keywords_Descr']), row['ST_text']),
    axis=1,
)

In [46]:
# Jaccard similarity between the stemmed description and the stemmed search term
df_train['JC_sim_PD'] = df_train.apply(
    lambda row: get_jaccard_sim(row['PD_stem'], row['ST_stem']),
    axis=1,
)

In [47]:
# cosine similarity between the stemmed description and the stemmed search term
df_train['Cosine_sim_PD'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['PD_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

#### --- Search term vs Attributes

In [50]:
def _as_stem_list(stems):
    """Return ``stems`` unchanged when it is a list, otherwise an empty list."""
    if isinstance(stems, list):
        return stems
    return []


# Replace every non-list 'Atrr_stem' value with an empty list - presumably the
# NaN left by the left merge for products without attributes (TODO confirm).
df_train['Atrr_stem'] = df_train['Atrr_stem'].apply(_as_stem_list)

In [64]:
def get_leven(x):
    """Levenshtein similarity between a row's attribute stems and search term.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'Atrr_stem' (list of str) and 'ST_text' (str).

    Returns
    -------
    float
        ``1 - stringdist.levenshtein_norm(' '.join(x['Atrr_stem']), x['ST_text'])``,
        or 0 when the value cannot be computed from the row's data.

    Raises
    ------
    KeyError
        If a required column is missing; this is not silently mapped to 0.
    """
    try:
        # join first so malformed stems fail here, before the distance call
        attr_text = ' '.join(x['Atrr_stem'])
        return 1 - stringdist.levenshtein_norm(attr_text, x['ST_text'])
    except (TypeError, ZeroDivisionError):
        # TypeError: stems / search text are not strings (e.g. NaN).
        # ZeroDivisionError: presumably raised by the normalisation when both
        # texts are empty - TODO confirm against stringdist.
        return 0

# Levenshtein similarity between the attribute stems 'Atrr_stem' (joined into
# one text) & 'ST_text', computed row by row with get_leven
df_train['Leven_sim_Atrr'] = df_train.apply(get_leven, axis=1)
# df_train['Leven_sim_Atrr'] = df_train.apply(lambda x: 1 - stringdist.levenshtein_norm(' '.join(x['Atrr_stem']), x['ST_text']),
#                                          axis=1)

In [48]:
# Jaccard similarity between the attribute stems and the stemmed search term
df_train['JC_sim_Atrr'] = df_train.apply(
    lambda row: get_jaccard_sim(row['Atrr_stem'], row['ST_stem']),
    axis=1,
)

In [51]:
# cosine similarity between the attribute stems and the stemmed search term
df_train['Cosine_sim_Atrr'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['Atrr_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

In [66]:
# list every column currently on df_train
# NOTE(review): the saved output shows 'Atrr_text_all' where earlier cells read
# 'Atrr_text', and none of the first feature set - presumably df_train was
# reloaded from a different preprocessed file in between; confirm with a
# fresh-kernel run.
df_train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'PT_lower', 'PT_tokens', 'PT_tokens_sw', 'PT_text', 'PT_stem',
       'PT_numerics', 'PT_Non_numerics', 'ST_lower', 'ST_tokens',
       'ST_tokens_sw', 'ST_text', 'ST_stem', 'ST_numerics', 'ST_Non_numerics',
       'PD_lower', 'PD_stem', 'PD_numerics', 'Keywords_Descr', 'Atrr_text_all',
       'Atrr_stem', 'Leven_sim_PT', 'JC_sim_PT', 'Cosine_sim_PT',
       'Leven_sim_PD', 'JC_sim_PD', 'Cosine_sim_PD', 'JC_sim_Atrr',
       'Cosine_sim_Atrr', 'Leven_sim_Atrr'],
      dtype='object')

In [67]:
# Columns of the second feature set: row id, product key and the similarity
# features of the search term against title, description and attributes.
similarity_cols = [
    'id', 'product_uid',
    'JC_sim_PT', 'Cosine_sim_PT',
    'Leven_sim_PD', 'JC_sim_PD', 'Cosine_sim_PD',
    'JC_sim_Atrr', 'Cosine_sim_Atrr', 'Leven_sim_Atrr',
]
df_train_sims = df_train[similarity_cols]

In [68]:
# preview the first rows of the similarity feature set
df_train_sims.head()

Unnamed: 0,id,product_uid,JC_sim_PT,Cosine_sim_PT,Leven_sim_PD,JC_sim_PD,Cosine_sim_PD,JC_sim_Atrr,Cosine_sim_Atrr,Leven_sim_Atrr
0,2,100001,0.142857,0.288675,0.091743,0.014085,0.19696,0.035714,0.138675,0.057143
1,3,100001,0.0,0.0,0.06422,0.0,0.0,0.0,0.0,0.04
2,9,100002,0.0,0.0,0.030075,0.010204,0.226134,0.029412,0.312348,0.017937
3,16,100005,0.076923,0.182574,0.123894,0.015152,0.063372,0.0,0.0,0.066667
4,17,100005,0.181818,0.447214,0.079646,0.03125,0.15523,0.0,0.0,0.05


In [69]:
# save the 2nd dataframe with features (df_train_sims)
# NOTE(review): absolute, machine-specific output path - consider a configurable output directory.
df_train_sims.to_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_similarities.pkl')

### -- Fuzzy matching

In [7]:
#!pip install fuzzywuzzy
from fuzzywuzzy import fuzz



In [19]:
# preview the first five rows of df_train
# NOTE(review): the saved output already contains FZ_* columns that no cell
# above creates - they may come from cells outside this view or from an
# out-of-order run; confirm with a fresh-kernel run.
df_train.head(5)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,PT_lower,PT_tokens,PT_tokens_sw,PT_text,PT_stem,PT_numerics,PT_Non_numerics,ST_lower,ST_tokens,ST_tokens_sw,ST_text,ST_stem,ST_numerics,ST_Non_numerics,PD_lower,PD_stem,PD_numerics,Keywords_Descr,Atrr_text_all,Atrr_stem,FZ_PT_1,FZ_PT_2,FZ_PT_3,FZ_PT_4,FZ_Attr_1,FZ_Attr_2,FZ_Attr_3,FZ_Attr_4
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",angle bracket,"[angle, bracket]","[angle, bracket]",angle bracket,"[angl, bracket]",[],"[angle, bracket]","not only do angles make joints stronger, they also provide more consistent, straight corners. simpson strong-tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. some can be bent (skewed) to match the project. for outdoor projects or those where moisture is present, use our zmax zinc-coated connectors, which provide extra resistance against corrosion (look for a ""z"" at the end of the model number).versatile connector for various 90 connections and home repair projectsstronger than angled nailing or screw fastening alonehelp ensure joints are consistently straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.made from 12-gauge steelgalvanized for extra corrosion resistanceinstall with 10d common nails or #9 x 1-1/2 in. 
strong-drive sd screws","[angl, make, joint, stronger, also, provid, consist, straight, corner, simpson, strong, tie, offer, wide, varieti, angl, variou, size, thick, handl, light, duti, job, project, structur, connect, need, bent, skew, match, project, outdoor, project, moistur, present, use, zmax, zinc, coat, connector, provid, extra, resist, corros, look, z, end, model, number, versatil, connector, variou, 90, connect, home, repair, projectsstrong, angl, nail, screw, fasten, alonehelp, ensur, joint, consist, straight, strongdimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steelgalvan, extra, corros, resistanceinstal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw]","[90, 3, 3, 1-1/2, 12-gauge, 10d, #9, 1-1/2]","[strongdimensions, alonehelp, projectsstronger, zmax, skewed, resistanceinstall, sd, bent, steelgalvanized, thicknesses]",versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. made from 12-gauge steel galvanized for extra corrosion resistance install with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws 12 galvanized steel simpson strong-tie 1 1.5 3 0.26 3,"[versatil, connector, variou, 90â, connect, home, repair, project, stronger, angl, nail, screw, fasten, alon, help, ensur, joint, consist, straight, strong, dimens, 3, x, 3, x, 1, 1, 2, simpson, strong, tie]",22,56,39,56,4,8,4,5
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,simpson strong-tie 12-gauge angle,"[simpson, strong, tie, 12, gauge, angle]","[simpson, strong, tie, 12, gauge, angle]",simpson strong tie 12 gauge angle,"[simpson, strong, tie, 12, gaug, angl]",[12-gauge],"[simpson, strong-tie, angle]",l bracket,"[l, bracket]","[l, bracket]",l bracket,"[l, bracket]",[],"[l, bracket]","not only do angles make joints stronger, they also provide more consistent, straight corners. simpson strong-tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. some can be bent (skewed) to match the project. for outdoor projects or those where moisture is present, use our zmax zinc-coated connectors, which provide extra resistance against corrosion (look for a ""z"" at the end of the model number).versatile connector for various 90 connections and home repair projectsstronger than angled nailing or screw fastening alonehelp ensure joints are consistently straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.made from 12-gauge steelgalvanized for extra corrosion resistanceinstall with 10d common nails or #9 x 1-1/2 in. 
strong-drive sd screws","[angl, make, joint, stronger, also, provid, consist, straight, corner, simpson, strong, tie, offer, wide, varieti, angl, variou, size, thick, handl, light, duti, job, project, structur, connect, need, bent, skew, match, project, outdoor, project, moistur, present, use, zmax, zinc, coat, connector, provid, extra, resist, corros, look, z, end, model, number, versatil, connector, variou, 90, connect, home, repair, projectsstrong, angl, nail, screw, fasten, alonehelp, ensur, joint, consist, straight, strongdimens, 3, x, 3, x, 1, 1, 2, made, 12, gaug, steelgalvan, extra, corros, resistanceinstal, 10d, common, nail, 9, x, 1, 1, 2, strong, drive, sd, screw]","[90, 3, 3, 1-1/2, 12-gauge, 10d, #9, 1-1/2]","[strongdimensions, alonehelp, projectsstronger, zmax, skewed, resistanceinstall, sd, bent, steelgalvanized, thicknesses]",versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. made from 12-gauge steel galvanized for extra corrosion resistance install with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws 12 galvanized steel simpson strong-tie 1 1.5 3 0.26 3,"[versatil, connector, variou, 90â, connect, home, repair, project, stronger, angl, nail, screw, fasten, alon, help, ensur, joint, consist, straight, strong, dimens, 3, x, 3, x, 1, 1, 2, simpson, strong, tie]",10,36,10,10,2,11,3,4
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating,deck over,3.0,behr premium textured deckover 1-gal. #sc-141 tugboat wood and concrete coating,"[behr, premium, textured, deckover, 1, gal, sc, 141, tugboat, wood, and, concrete, coating]","[behr, premium, textured, deckover, 1, gal, sc, 141, tugboat, wood, concrete, coating]",behr premium textured deckover 1 gal sc 141 tugboat wood concrete coating,"[behr, premium, textur, deckov, 1, gal, sc, 141, tugboat, wood, concret, coat]","[1-gal., #sc-141]","[behr, premium, textured, deckover, tugboat, wood, and, concrete, coating]",deck over,"[deck, over]",[deck],deck,[deck],[],"[deck, over]","behr premium textured deckover is an innovative solid color coating. it will bring your old, weathered wood or concrete back to life. the advanced 100% acrylic resin formula creates a durable coating for your tired and worn out deck, rejuvenating to a whole new look. for the best results, be sure to properly prepare the surface using other applicable behr products displayed above.california residents: see&nbsp;proposition 65 informationrevives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks100% acrylic solid color coatingresists cracking and peeling and conceals splinters and cracks up to 1/4 in.provides a durable, mildew resistant finishcovers up to 75 sq. ft. 
in 2 coats per galloncreates a textured, slip-resistant finishfor best results, prepare with the appropriate behr product for your wood or concrete surfaceactual paint colors may vary from on-screen and printer representationscolors available to be tinted in most storesonline price includes paint care fee in the following states: ca, co, ct, me, mn, or, ri, vt","[behr, premium, textur, deckov, innov, solid, color, coat, bring, old, weather, wood, concret, back, life, advanc, 100, acryl, resin, formula, creat, durabl, coat, tire, worn, deck, rejuven, whole, new, look, best, result, sure, properli, prepar, surfac, use, applic, behr, product, display, california, resid, see, nbsp, proposit, 65, informationrev, wood, composit, deck, rail, porch, boat, dock, also, great, concret, pool, deck, patio, sidewalks100, acryl, solid, color, coatingresist, crack, peel, conceal, splinter, crack, 1, 4, provid, durabl, mildew, resist, finishcov, 75, sq, ft, 2, coat, per, galloncr, textur, slip, resist, finishfor, best, result, prepar, appropri, behr, product, wood, concret, surfaceactu, paint, color, ...]","[100%, 65, sidewalks100%, 1/4, 75, 2]","[representationscolors, storesonline, galloncreates, informationrevives, coatingresists, deckover, rejuvenating, splinters, finishcovers, tired]","brush,roller,spray 6.63 in 7.76 in 6.63 in revives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks 100% acrylic solid color coating resists cracking and peeling and conceals splinters and cracks up to 1/4 in. provides a durable, mildew resistant finish covers up to 75 sq. ft. 
in 2 coats per gallon creates a textured, slip-resistant finish for best results, prepare with the appropriate behr product for your wood or concrete surface actual paint colors may vary from on-screen and printer representations colors available to be tinted in most stores online price includes paint care fee in the following states: ca, co, ct, me, mn, or, ri, vt soap and water browns / tans tugboat yes 1 ga-gallon 75 yes exterior behr premium textured deckover yes solid exterior paint/stain restoration coating cottage 119:100:086 no 6 no solid yes no","[reviv, wood, composit, deck, rail, porch, boat, dock, also, great, concret, pool, deck, patio, sidewalk, 100, acryl, solid, color, coat, resist, crack, peel, conceal, splinter, crack, 1, 4, provid, durabl, mildew, resist, finish, behr, premium, textur, deckov]",18,89,19,19,2,67,1,1
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),rain shower head,2.33,delta vero 1-handle shower only faucet trim kit in chrome (valve not included),"[delta, vero, 1, handle, shower, only, faucet, trim, kit, in, chrome, valve, not, included]","[delta, vero, 1, handle, shower, faucet, trim, kit, chrome, valve, included]",delta vero 1 handle shower faucet trim kit chrome valve included,"[delta, vero, 1, handl, shower, faucet, trim, kit, chrome, valv, includ]",[1-handle],"[delta, vero, shower, only, faucet, trim, kit, in, chrome, (valve, not, included)]",rain shower head,"[rain, shower, head]","[rain, shower, head]",rain shower head,"[rain, shower, head]",[],"[rain, shower, head]","update your bathroom with the delta vero single-handle shower faucet trim kit in chrome. it has a sleek, modern and minimalistic aesthetic. the multichoice universal valve keeps the water temperature within +/-3 degrees fahrenheit to help prevent scalding.california residents: see&nbsp;proposition 65 informationincludes the trim kit only, the rough-in kit (r10000-unbx) is sold separatelyincludes the handlemaintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the systemdue to watersense regulations in the state of new york, please confirm your shipping zip code is not restricted from use of items that do not meet watersense qualifications","[updat, bathroom, delta, vero, singl, handl, shower, faucet, trim, kit, chrome, sleek, modern, minimalist, aesthet, multichoic, univers, valv, keep, water, temperatur, within, 3, degre, fahrenheit, help, prevent, scald, california, resid, see, nbsp, proposit, 65, informationinclud, trim, kit, rough, kit, r10000, unbx, sold, separatelyinclud, handlemaintain, balanc, pressur, hot, cold, water, even, valv, turn, elsewher, systemdu, watersens, regul, state, new, york, pleas, confirm, ship, zip, code, restrict, use, item, meet, watersens, qualif]","[+/-3, 65, 
(r10000-unbx)]","[handlemaintains, systemdue, qualifications, vero, multichoice, minimalistic, separatelyincludes, elsewhere, scalding, unbx]","combo tub and shower no includes the trim kit only, the rough-in kit (r10000-unbx) is sold separately includes the handle maintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the system due to watersense regulations in the state of new york, please confirm your shipping zip code is not restricted from use of items that do not meet watersense qualifications ada compliant,csa certified,iapmo certified chrome chrome 1/2 in. no additional features handles,pressure balance/scald guard bath faucet 2.5 lever delta single handle 1 1 15.28 24 7.09 4.06 fixed mount rain","[includ, trim, kit, rough, kit, r10000, unbx, sold, separ, includ, handl, maintain, balanc, pressur, hot, cold, water, even, valv, turn, elsewher, system, due, watersens, regul, state, new, york, pleas, confirm, ship, zip, code, restrict, use, item, meet, watersens, qualif, delta]",26,62,26,55,4,62,4,81
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included),shower only faucet,2.67,delta vero 1-handle shower only faucet trim kit in chrome (valve not included),"[delta, vero, 1, handle, shower, only, faucet, trim, kit, in, chrome, valve, not, included]","[delta, vero, 1, handle, shower, faucet, trim, kit, chrome, valve, included]",delta vero 1 handle shower faucet trim kit chrome valve included,"[delta, vero, 1, handl, shower, faucet, trim, kit, chrome, valv, includ]",[1-handle],"[delta, vero, shower, only, faucet, trim, kit, in, chrome, (valve, not, included)]",shower only faucet,"[shower, only, faucet]","[shower, faucet]",shower faucet,"[shower, faucet]",[],"[shower, only, faucet]","update your bathroom with the delta vero single-handle shower faucet trim kit in chrome. it has a sleek, modern and minimalistic aesthetic. the multichoice universal valve keeps the water temperature within +/-3 degrees fahrenheit to help prevent scalding.california residents: see&nbsp;proposition 65 informationincludes the trim kit only, the rough-in kit (r10000-unbx) is sold separatelyincludes the handlemaintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the systemdue to watersense regulations in the state of new york, please confirm your shipping zip code is not restricted from use of items that do not meet watersense qualifications","[updat, bathroom, delta, vero, singl, handl, shower, faucet, trim, kit, chrome, sleek, modern, minimalist, aesthet, multichoic, univers, valv, keep, water, temperatur, within, 3, degre, fahrenheit, help, prevent, scald, california, resid, see, nbsp, proposit, 65, informationinclud, trim, kit, rough, kit, r10000, unbx, sold, separatelyinclud, handlemaintain, balanc, pressur, hot, cold, water, even, valv, turn, elsewher, systemdu, watersens, regul, state, new, york, pleas, confirm, ship, zip, code, restrict, use, item, meet, watersens, qualif]","[+/-3, 65, 
(r10000-unbx)]","[handlemaintains, systemdue, qualifications, vero, multichoice, minimalistic, separatelyincludes, elsewhere, scalding, unbx]","combo tub and shower no includes the trim kit only, the rough-in kit (r10000-unbx) is sold separately includes the handle maintains a balanced pressure of hot and cold water even when a valve is turned on or off elsewhere in the system due to watersense regulations in the state of new york, please confirm your shipping zip code is not restricted from use of items that do not meet watersense qualifications ada compliant,csa certified,iapmo certified chrome chrome 1/2 in. no additional features handles,pressure balance/scald guard bath faucet 2.5 lever delta single handle 1 1 15.28 24 7.09 4.06 fixed mount rain","[includ, trim, kit, rough, kit, r10000, unbx, sold, separ, includ, handl, maintain, balanc, pressur, hot, cold, water, even, valv, turn, elsewher, system, due, watersens, regul, state, new, york, pleas, confirm, ship, zip, code, restrict, use, item, meet, watersens, qualif, delta]",38,100,38,100,6,67,6,100


In [None]:
# Examples of the four fuzz scorers that are used for the features below.
# https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49
# A notebook cell only renders the value of its last expression, so the six
# calls are gathered into a single list of (scorer, left, right, score) tuples.
# Written as six bare statements, the first five scores would be computed and
# silently thrown away.
[
    (label, left, right, scorer(left, right))
    for label, scorer, left, right in [
        ('ratio', fuzz.ratio, "this is a test", "this is a test!"),
        ('partial_ratio', fuzz.partial_ratio, "this is a test", "this is a test!"),
        ('ratio', fuzz.ratio, "fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"),
        ('token_sort_ratio', fuzz.token_sort_ratio, "fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"),
        ('token_sort_ratio', fuzz.token_sort_ratio, "fuzzy was a bear", "fuzzy fuzzy was a bear"),
        ('token_set_ratio', fuzz.token_set_ratio, "fuzzy was a bear", "fuzzy fuzzy was a bear"),
    ]
]

In [9]:
# Fuzzy-match features: product title ('PT_lower') vs. search term ('ST_lower').
# Each scorer is applied row by row and stored in its own column:
#   FZ_PT_1 = ratio, FZ_PT_2 = partial_ratio,
#   FZ_PT_3 = token_sort_ratio, FZ_PT_4 = token_set_ratio
for feature_no, scorer in enumerate(
        (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio),
        start=1):
    # apply() runs immediately, so the lambda always sees the current scorer
    df_train['FZ_PT_{}'.format(feature_no)] = df_train.apply(
        lambda row: scorer(row['PT_lower'], row['ST_lower']), axis=1
    )

In [12]:
# Convert the 'Atrr_text_all' column to plain strings in order to avoid errors
# in the cells that consume it.
# Missing values are replaced with an empty string first: astype('str') on its
# own turns NaN into the literal text 'nan', which would then be scored against
# the search term as if it were real attribute text.
# NOTE(review): NaN presumably comes from products that have no attributes
# after the left merge with df_attrs - confirm.
df_train['Atrr_text_all'] = df_train['Atrr_text_all'].fillna('').astype('str')

In [13]:
# Fuzzy-match features: attribute text ('Atrr_text_all') vs. search term ('ST_lower').
# Each scorer is applied row by row and stored in its own column:
#   FZ_Attr_1 = ratio, FZ_Attr_2 = partial_ratio,
#   FZ_Attr_3 = token_sort_ratio, FZ_Attr_4 = token_set_ratio
for feature_no, scorer in enumerate(
        (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio),
        start=1):
    # apply() runs immediately, so the lambda always sees the current scorer
    df_train['FZ_Attr_{}'.format(feature_no)] = df_train.apply(
        lambda row: scorer(row['Atrr_text_all'], row['ST_lower']), axis=1
    )

In [16]:
# display the column labels of df_train; the FZ_PT_* / FZ_Attr_* features
# created above are listed at the end
df_train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'PT_lower', 'PT_tokens', 'PT_tokens_sw', 'PT_text', 'PT_stem',
       'PT_numerics', 'PT_Non_numerics', 'ST_lower', 'ST_tokens',
       'ST_tokens_sw', 'ST_text', 'ST_stem', 'ST_numerics', 'ST_Non_numerics',
       'PD_lower', 'PD_stem', 'PD_numerics', 'Keywords_Descr', 'Atrr_text_all',
       'Atrr_stem', 'FZ_PT_1', 'FZ_PT_2', 'FZ_PT_3', 'FZ_PT_4', 'FZ_Attr_1',
       'FZ_Attr_2', 'FZ_Attr_3', 'FZ_Attr_4'],
      dtype='object')

In [20]:
# Keep only the product key and the eight fuzzy-matching features.
# NOTE(review): 'product_uid' repeats across rows (one row per search term), so
# it is not a unique key on its own; downstream code presumably aligns these
# features by row position - confirm before merging on 'product_uid'.
fuzzy_feature_columns = [
    'product_uid',
    'FZ_PT_1', 'FZ_PT_2', 'FZ_PT_3', 'FZ_PT_4',
    'FZ_Attr_1', 'FZ_Attr_2', 'FZ_Attr_3', 'FZ_Attr_4',
]
df_fuzzy = df_train[fuzzy_feature_columns]

In [21]:
# preview the first rows of the fuzzy feature frame
df_fuzzy.head()

Unnamed: 0,product_uid,FZ_PT_1,FZ_PT_2,FZ_PT_3,FZ_PT_4,FZ_Attr_1,FZ_Attr_2,FZ_Attr_3,FZ_Attr_4
0,100001,22,56,39,56,4,8,4,5
1,100001,10,36,10,10,2,11,3,4
2,100002,18,89,19,19,2,67,1,1
3,100005,26,62,26,55,4,62,4,81
4,100005,38,100,38,100,6,67,6,100


In [22]:
# persist the fuzzy-matching features: the 3rd feature file written by this notebook
pd.to_pickle(
    df_fuzzy,
    r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_fuzzy.pkl',
)