In [0]:
# In this notebook we create the features that will be used in the modeling phase
# So, we first read the files that we created in the preprocessing phase
# Finally we save 3 pickle files with features:
# 1) df_train_ptstkwat.pkl - it has several numeric features
# 2) df_similarities.pkl - it has features based on similarities (Levenshtein, Cosine, Jaccard, Jaro)
# 3) df_fuzzy.pkl - it has features based on this --> https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49

In [0]:
from collections import Counter
from pathlib import Path

import pandas as pd
# requires: pip install stringdist
import stringdist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Print non-truncated column contents and up to 500 columns when displaying
# DataFrames. `None` means "no limit"; the legacy sentinel -1 is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

In [0]:
# Directory that holds the pickles written by the preprocessing notebook.
# NOTE(review): absolute, machine-specific path - point it at your own copy.
PREPROCESSED_DIR = Path(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed')

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# read the preprocessed trainset
df_train = pd.read_pickle(PREPROCESSED_DIR / 'df_train_prep.pkl')
# read the file with the preprocessed descriptions and their keywords
df_descs = pd.read_pickle(PREPROCESSED_DIR / 'df_descr_kwd.pkl')
# read the file with the preprocessed attributes
df_attrs = pd.read_pickle(PREPROCESSED_DIR / 'df_attrs_prep.pkl')

In [0]:
# Attach the description and attribute frames to the train rows.
# Left joins on the shared 'product_uid' key keep every train row.
df_train = (
    df_train
    .merge(df_descs, how='left', on='product_uid')
    .merge(df_attrs, how='left', on='product_uid')
)

## - Features from TRAINSET + Descriptions + Attributes

In [0]:
# Preview the first two rows of the merged frame.
df_train.head(2)

### -- General counts about numerics and non numerics in Product title and Search term

In [0]:
# Length of each token column (presumably lists of tokens - the code only
# requires that every cell supports len()).
# Product title (PT) / search term (ST): numeric terms
df_train['N_numerics_PT'] = df_train['PT_numerics'].apply(len)
df_train['N_numerics_ST'] = df_train['ST_numerics'].apply(len)

# Product title (PT) / search term (ST): non numeric terms
df_train['N_non_numerics_PT'] = df_train['PT_Non_numerics'].apply(len)
df_train['N_non_numerics_ST'] = df_train['ST_Non_numerics'].apply(len)

In [0]:
# DONT INCLUDE IT

# number of common words between 'product_title' & 'search_term'
# df_train['N_common_words'] = df_train.apply(lambda x: 
#                             len(list(set(x['product_title_tokens']).intersection(x['search_term_tokens']))), axis=1)

### -- Common terms between Search term & Product title

In [0]:
%%time

def common_words_leven(tokens_1, tokens_2, threshold=0.85):
    """Return the tokens of ``tokens_2`` that closely match a token of ``tokens_1``.

    Both inputs are de-duplicated first (keeping first-seen order, so the
    result order is reproducible). Every pair is then compared with
    ``1 - stringdist.levenshtein_norm(a, b)`` and the ``tokens_2`` member is
    collected when that similarity is strictly greater than ``threshold``.

    A ``tokens_2`` token appears once per matching ``tokens_1`` token, so the
    result may contain repeats.

    Parameters
    ----------
    tokens_1, tokens_2 : iterable of str
        Tokens to compare.
    threshold : float, default 0.85
        Minimum (exclusive) similarity for a pair to count as a match.

    Returns
    -------
    list of str
        Matching tokens taken from ``tokens_2``.
    """
    unique_1 = list(dict.fromkeys(tokens_1))
    unique_2 = list(dict.fromkeys(tokens_2))

    return [
        token2
        for token1 in unique_1
        for token2 in unique_2
        if 1 - stringdist.levenshtein_norm(token1, token2) > threshold
    ]

# Non-numeric search-term words that closely match (Levenshtein) a
# non-numeric product-title word, followed by how many matches were found.
df_train['Common_words_leven'] = df_train.apply(
    lambda row: common_words_leven(row['PT_Non_numerics'], row['ST_Non_numerics']),
    axis=1,
)

df_train['N_common_words_leven'] = df_train['Common_words_leven'].apply(len)

In [0]:
# Jaccard similarity based on the above common words with levensthein
def get_jaccard_sim(words1, words2, common=None):
    """Return the Jaccard similarity between two lists of words.

    Parameters
    ----------
    words1, words2 : iterable of hashable
        The two word collections; duplicates are ignored.
    common : iterable of hashable, optional
        Pre-computed "shared" words (e.g. fuzzy matches). When omitted, the
        exact set intersection of ``words1`` and ``words2`` is used.

    Returns
    -------
    float
        ``|common| / (|words1| + |words2| - |common|)``, or ``0.0`` when the
        denominator is zero (both inputs empty) instead of raising
        ZeroDivisionError.
    """
    a = set(words1)
    b = set(words2)
    c = a & b if common is None else set(common)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 0.0
    return float(len(c)) / union_size

# Jaccard similarity of title vs. search-term words, using the fuzzy matches
# stored in 'Common_words_leven' as the shared set.
df_train['JC_sim'] = df_train.apply(
    lambda row: get_jaccard_sim(
        row['PT_Non_numerics'],
        row['ST_Non_numerics'],
        row['Common_words_leven'],
    ),
    axis=1,
)

In [0]:
# DONT INCLUDE IT

# number of common stemmed words between 'product_title_stem' & 'search_term_stem'
# df_train['N_common_words_stem'] = df_train.apply(lambda x: 
#                             len(list(set(x['product_title_stem']).intersection(x['search_term_stem']))), axis=1)

### -- Non-numeric terms of Search term that are substrings of Product title | Description | Attributes

In [0]:
# number of non numeric terms of the search_term that appears in the Product title | Descrtiption | Attributes

def n_substrings(tokens, text):
    """Return the tokens that are contained in ``text``.

    Despite the name, the function returns the matching tokens themselves
    (callers take ``len`` of the result to get the count). Membership is
    tested with ``token in text``, so for a string ``text`` this is a plain
    substring match, not a whole-word match.

    Parameters
    ----------
    tokens : iterable
        Candidate tokens (non-numeric search-term tokens at the call sites).
    text : str or other container
        Text to search. Values that do not support ``in`` (e.g. a float NaN
        for a missing text) yield no matches.

    Returns
    -------
    list
        Tokens found in ``text``, in input order; repeated tokens are kept.
    """
    substrings = []
    for token in tokens:
        try:
            found = token in text
        except TypeError:
            # text is not a container (e.g. float NaN) or the token type
            # cannot be searched for in it: treat as "not found"
            continue
        if found:
            substrings.append(token)
    return substrings

#### PRODUCT TITLE
# non numeric search-term tokens that are substrings of PT_lower, and their count
df_train['Substrs_PT_x'] = df_train.apply(
    lambda row: n_substrings(row['ST_Non_numerics'], row['PT_lower']),
    axis=1,
)
df_train['N_substrs_PT_x'] = df_train['Substrs_PT_x'].apply(len)


#### PRODUCT DESCRIPTION
# non numeric search-term tokens that are substrings of PD_lower, and their count
df_train['Substrs_PD_x'] = df_train.apply(
    lambda row: n_substrings(row['ST_Non_numerics'], row['PD_lower']),
    axis=1,
)
df_train['N_substrs_PD_x'] = df_train['Substrs_PD_x'].apply(len)

#### PRODUCT ATTRIBUTES
# non numeric search-term tokens that are substrings of Atrr_text, and their count
# NOTE(review): later cells use a column named 'Atrr_text_all' - confirm that
# 'Atrr_text' exists in the merged frame.
df_train['Substrs_Atr_x'] = df_train.apply(
    lambda row: n_substrings(row['ST_Non_numerics'], row['Atrr_text']),
    axis=1,
)
df_train['N_substrs_Atr_x'] = df_train['Substrs_Atr_x'].apply(len)

In [0]:
def perc_xxx(tokens1, tokens2, tokens3, tokens4):
    """Return the share of ``tokens4`` covered by the three match lists.

    Computes ``len(unique(tokens1 + tokens2 + tokens3)) / len(tokens4)``.

    Parameters
    ----------
    tokens1, tokens2, tokens3 : iterable of hashable
        Matched tokens from three sources; they are pooled and de-duplicated.
    tokens4 : sized
        Reference token list whose length is the denominator.

    Returns
    -------
    float
        The ratio, or ``0.0`` when ``tokens4`` is empty or when any argument
        is not a usable collection (e.g. ``None`` / NaN).
    """
    try:
        matched = set(tokens1) | set(tokens2) | set(tokens3)
        total = len(tokens4)
    except TypeError:
        # non-iterable / unsized input: keep the original "best effort" 0
        return 0.0
    if total == 0:
        # no reference tokens: avoid ZeroDivisionError
        return 0.0
    return len(matched) / total

# share of the non numeric search-term tokens that are substrings of the
# product title, the description or the attributes
df_train['Perc_substrs_x'] = df_train.apply(
    lambda row: perc_xxx(
        row['Substrs_PT_x'],
        row['Substrs_PD_x'],
        row['Substrs_Atr_x'],
        row['ST_Non_numerics'],
    ),
    axis=1,
)

### -- Numeric terms of Search term that are substrings of Product title | Description | Attributes

In [0]:
# number of numeric terms of the search_term that appears in the Product title | Descrtiption | Attributes
def n_substrings(tokens, text):
    """Return the tokens that are contained in ``text``.

    This repeats the definition used for the non-numeric tokens earlier in
    the notebook. It returns the matching tokens themselves (callers take
    ``len`` of the result to get the count). Membership is tested with
    ``token in text``, so for a string ``text`` this is a plain substring
    match.

    Parameters
    ----------
    tokens : iterable
        Candidate tokens (numeric search-term tokens at the call sites).
    text : str or other container
        Text to search. Values that do not support ``in`` (e.g. a float NaN
        for a missing text) yield no matches.

    Returns
    -------
    list
        Tokens found in ``text``, in input order; repeated tokens are kept.
    """
    substrings = []
    for token in tokens:
        try:
            found = token in text
        except TypeError:
            # text is not a container (e.g. float NaN) or the token type
            # cannot be searched for in it: treat as "not found"
            continue
        if found:
            substrings.append(token)
    return substrings

#### PRODUCT TITLE
# numeric search-term tokens that are substrings of PT_lower, and their count
df_train['Substrs_PT_y'] = df_train.apply(
    lambda row: n_substrings(row['ST_numerics'], row['PT_lower']),
    axis=1,
)
df_train['N_substrs_PT_y'] = df_train['Substrs_PT_y'].apply(len)


#### PRODUCT DESCRIPTION
# numeric search-term tokens that are substrings of PD_lower, and their count
df_train['Substrs_PD_y'] = df_train.apply(
    lambda row: n_substrings(row['ST_numerics'], row['PD_lower']),
    axis=1,
)
df_train['N_substrs_PD_y'] = df_train['Substrs_PD_y'].apply(len)

#### PRODUCT ATTRIBUTES
# numeric search-term tokens that are substrings of Atrr_text, and their count
# NOTE(review): later cells use a column named 'Atrr_text_all' - confirm that
# 'Atrr_text' exists in the merged frame.
df_train['Substrs_Atr_y'] = df_train.apply(
    lambda row: n_substrings(row['ST_numerics'], row['Atrr_text']),
    axis=1,
)
df_train['N_substrs_Atr_y'] = df_train['Substrs_Atr_y'].apply(len)

In [0]:
def perc_xxx(tokens1, tokens2, tokens3, tokens4):
    """Return the share of ``tokens4`` covered by the three match lists.

    Computes ``len(unique(tokens1 + tokens2 + tokens3)) / len(tokens4)``.
    This repeats the definition used for the non-numeric tokens earlier in
    the notebook.

    Parameters
    ----------
    tokens1, tokens2, tokens3 : iterable of hashable
        Matched tokens from three sources; they are pooled and de-duplicated.
    tokens4 : sized
        Reference token list whose length is the denominator.

    Returns
    -------
    float
        The ratio, or ``0.0`` when ``tokens4`` is empty or when any argument
        is not a usable collection (e.g. ``None`` / NaN).
    """
    try:
        matched = set(tokens1) | set(tokens2) | set(tokens3)
        total = len(tokens4)
    except TypeError:
        # non-iterable / unsized input: keep the original "best effort" 0
        return 0.0
    if total == 0:
        # no reference tokens: avoid ZeroDivisionError
        return 0.0
    return len(matched) / total

# share of the numeric search-term tokens that are substrings of the
# product title, the description or the attributes
df_train['Perc_substrs_y'] = df_train.apply(
    lambda row: perc_xxx(
        row['Substrs_PT_y'],
        row['Substrs_PD_y'],
        row['Substrs_Atr_y'],
        row['ST_numerics'],
    ),
    axis=1,
)

### -- Levenshtein distance between Search terms & Product title

In [0]:
# Levenshtein similarity (1 - normalised distance) between 'PT_text' and 'ST_text'
df_train['Leven_sim_ST_PT'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(row['PT_text'], row['ST_text']),
    axis=1,
)

### -- Keywords of Description that appear in the Search term (with Levenshtein distance)

In [0]:
%%time

def common_words_leven(tokens_1, tokens_2, threshold=0.85):
    """Return the tokens of ``tokens_1`` that closely match a token of ``tokens_2``.

    NOTE: this redefines the function of the same name from an earlier cell;
    unlike that one, it collects the ``tokens_1`` member of each matching pair.

    Both inputs are de-duplicated first (keeping first-seen order, so the
    result order is reproducible). Every pair is then compared with
    ``1 - stringdist.levenshtein_norm(a, b)`` and the ``tokens_1`` member is
    collected when that similarity is strictly greater than ``threshold``.

    A ``tokens_1`` token appears once per matching ``tokens_2`` token, so the
    result may contain repeats.

    Parameters
    ----------
    tokens_1, tokens_2 : iterable of str
        Tokens to compare.
    threshold : float, default 0.85
        Minimum (exclusive) similarity for a pair to count as a match.

    Returns
    -------
    list of str
        Matching tokens taken from ``tokens_1``.
    """
    unique_1 = list(dict.fromkeys(tokens_1))
    unique_2 = list(dict.fromkeys(tokens_2))

    return [
        token1
        for token1 in unique_1
        for token2 in unique_2
        if 1 - stringdist.levenshtein_norm(token1, token2) > threshold
    ]

# description keywords that closely match (Levenshtein) a non numeric
# search-term token
df_train['Keywords_leven'] = df_train.apply(
    lambda row: common_words_leven(row['Keywords_Descr'], row['ST_Non_numerics']),
    axis=1,
)
# how many such keywords each row has
df_train['N_keywords_leven'] = df_train['Keywords_leven'].apply(len)

In [0]:
# List the columns available after the first round of feature engineering.
df_train.columns

In [0]:
# keep the product key, the engineered numeric features and the target
df_train2 = df_train.loc[:, [
    'product_uid',
    'N_numerics_PT',
    'N_numerics_ST',
    'N_non_numerics_PT',
    'N_non_numerics_ST',
    'N_common_words_leven',
    'JC_sim',
    'N_substrs_PT_x',
    'N_substrs_PD_x',
    'N_substrs_Atr_x',
    'Perc_substrs_x',
    'N_substrs_PT_y',
    'N_substrs_PD_y',
    'N_substrs_Atr_y',
    'Perc_substrs_y',
    'Leven_sim_ST_PT',
    'N_keywords_leven',
    'relevance',
]]

In [0]:
# Preview the selected feature columns.
df_train2.head()

In [0]:
# save the 1st dataframe with features
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
df_train2.to_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_train_ptstkwat.pkl')

### -- Distances: Levenshtein, Cosine, Jaccard, Jaro

#### --- Search term vs Product Title

In [0]:
# Levenshtein similarity (1 - normalised distance) between 'PT_text' and 'ST_text'
# (same computation as the 'Leven_sim_ST_PT' column created earlier)
df_train['Leven_sim_PT'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(row['PT_text'], row['ST_text']),
    axis=1,
)

In [0]:
# Jaccard similarity
# Notes: here we catch common words that may have different endings
# (the call sites pass the stemmed token columns).
def get_jaccard_sim(words1, words2, common=None):
    """Return the Jaccard similarity between two lists of words.

    NOTE: this redefines the function of the same name from an earlier cell.
    The optional ``common`` argument keeps it call-compatible with that
    three-argument version.

    Parameters
    ----------
    words1, words2 : iterable of hashable
        The two word collections; duplicates are ignored.
    common : iterable of hashable, optional
        Pre-computed shared words. When omitted, the exact set intersection
        of ``words1`` and ``words2`` is used.

    Returns
    -------
    float
        ``|common| / (|words1| + |words2| - |common|)``. ``0.0`` is returned
        when an input is not iterable/hashable (e.g. a NaN for a missing
        value) or when both inputs are empty.
    """
    try:
        a = set(words1)
        b = set(words2)
        c = a & b if common is None else set(common)
    except TypeError:
        # e.g. a float NaN where a token list was expected
        return 0.0
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # both inputs empty: avoid ZeroDivisionError
        return 0.0
    return float(len(c)) / union_size

# Jaccard similarity between the stemmed product title and the stemmed search term
df_train['JC_sim_PT'] = df_train.apply(
    lambda row: get_jaccard_sim(row['PT_stem'], row['ST_stem']),
    axis=1,
)

In [0]:
def get_cosine_sim(*strs):
    """Return the cosine similarity between the first two given strings.

    The strings are turned into bag-of-words count vectors by
    ``get_vectors`` and compared with ``cosine_similarity``.

    Parameters
    ----------
    *strs : str
        Texts to compare; the similarity of ``strs[0]`` and ``strs[1]`` is
        returned.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when the vectorizer cannot build a
        vocabulary (``ValueError``, e.g. all texts are empty). Other errors
        are no longer swallowed, so a broken setup cannot silently turn
        every feature value into 0.
    """
    try:
        vectors = get_vectors(*strs)
        return cosine_similarity(vectors)[0][1]
    except ValueError:
        # empty vocabulary: nothing to compare
        return 0.0


def get_vectors(*strs):
    """Return the word-count vectors of the given strings.

    A fresh ``CountVectorizer`` with default settings is fitted on exactly
    these strings, so the vocabulary is the union of their tokens.

    The vectorizer is built without arguments on purpose: its first
    parameter is ``input`` (not the corpus), so passing the texts to the
    constructor is an error on current scikit-learn releases.

    Returns
    -------
    array
        Dense count matrix with one row per input string.
    """
    texts = list(strs)
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts).toarray()

# cosine similarity between the stemmed product title and the stemmed search term
df_train['Cosine_sim_PT'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['PT_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

#### --- Search term vs Description

In [0]:
# Levenshtein similarity between the space-joined 'Keywords_Descr' and 'ST_text'
df_train['Leven_sim_PD'] = df_train.apply(
    lambda row: 1 - stringdist.levenshtein_norm(' '.join(row['Keywords_Descr']), row['ST_text']),
    axis=1,
)

In [0]:
# Jaccard similarity between the stemmed description and the stemmed search term
df_train['JC_sim_PD'] = df_train.apply(
    lambda row: get_jaccard_sim(row['PD_stem'], row['ST_stem']),
    axis=1,
)

In [0]:
# cosine similarity between the stemmed description and the stemmed search term
df_train['Cosine_sim_PD'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['PD_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

#### --- Search term vs Attributes

In [0]:
# Replace every 'Atrr_stem' value that is not a list with an empty list
# (presumably NaN for products without attributes after the left merge),
# so the ' '.join(...) calls below always receive a list.
df_train['Atrr_stem'] = df_train['Atrr_stem'].apply(
    lambda value: [] if not isinstance(value, list) else value
)

In [0]:
def get_leven(x):
    """Return the Levenshtein similarity between a row's attributes and search term.

    Parameters
    ----------
    x : mapping (a DataFrame row at the call site)
        Must provide ``'Atrr_stem'`` (iterable of str, joined with spaces)
        and ``'ST_text'`` (str).

    Returns
    -------
    float
        ``1 - stringdist.levenshtein_norm(joined_attributes, search_text)``,
        or ``0`` when the computation fails (best effort, e.g. a value of an
        unexpected type). Only ``Exception`` subclasses are caught, so
        KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    try:
        return 1 - stringdist.levenshtein_norm(' '.join(x['Atrr_stem']), x['ST_text'])
    except Exception:
        return 0

# Levenshtein similarity between the space-joined 'Atrr_stem' and 'ST_text'
df_train['Leven_sim_Atrr'] = df_train.apply(get_leven, axis=1)
# df_train['Leven_sim_Atrr'] = df_train.apply(lambda x: 1 - stringdist.levenshtein_norm(' '.join(x['Atrr_stem']), x['ST_text']),
#                                          axis=1)

In [0]:
# Jaccard similarity between the stemmed attributes and the stemmed search term
df_train['JC_sim_Atrr'] = df_train.apply(
    lambda row: get_jaccard_sim(row['Atrr_stem'], row['ST_stem']),
    axis=1,
)

In [0]:
# cosine similarity between the stemmed attributes and the stemmed search term
df_train['Cosine_sim_Atrr'] = df_train.apply(
    lambda row: get_cosine_sim(' '.join(row['Atrr_stem']), ' '.join(row['ST_stem'])),
    axis=1,
)

In [0]:
# List the columns available after adding the similarity features.
df_train.columns

In [0]:
# keep the row id, the product key and the similarity features
# ('Leven_sim_PT' is not part of this selection)
df_train_sims = df_train.loc[:, [
    'id',
    'product_uid',
    'JC_sim_PT',
    'Cosine_sim_PT',
    'Leven_sim_PD',
    'JC_sim_PD',
    'Cosine_sim_PD',
    'JC_sim_Atrr',
    'Cosine_sim_Atrr',
    'Leven_sim_Atrr',
]]

In [0]:
# Preview the similarity features.
df_train_sims.head()

Unnamed: 0,id,product_uid,JC_sim_PT,Cosine_sim_PT,Leven_sim_PD,JC_sim_PD,Cosine_sim_PD,JC_sim_Atrr,Cosine_sim_Atrr,Leven_sim_Atrr
0,2,100001,0.142857,0.288675,0.091743,0.014085,0.19696,0.035714,0.138675,0.057143
1,3,100001,0.0,0.0,0.06422,0.0,0.0,0.0,0.0,0.04
2,9,100002,0.0,0.0,0.030075,0.010204,0.226134,0.029412,0.312348,0.017937
3,16,100005,0.076923,0.182574,0.123894,0.015152,0.063372,0.0,0.0,0.066667
4,17,100005,0.181818,0.447214,0.079646,0.03125,0.15523,0.0,0.0,0.05


In [0]:
# save the 2nd dataframe with features
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
df_train_sims.to_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_similarities.pkl')

### -- Fuzzy matching

In [0]:
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz

In [0]:
# Preview the first five rows before adding the fuzzy-matching features.
df_train.head(5)

NameError: ignored

In [0]:
# examples
#https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49
# Sample calls to the four fuzzywuzzy scorers used for the features below.
# The results are not stored; a notebook cell only displays the value of its
# last expression.
fuzz.ratio("this is a test", "this is a test!")
fuzz.partial_ratio("this is a test", "this is a test!")
fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

100

In [0]:
# fuzzy scores between the lower-cased product title and the lower-cased search term
# 1: ratio, 2: partial_ratio, 3: token_sort_ratio, 4: token_set_ratio
df_train['FZ_PT_1'] = df_train.apply(
    lambda row: fuzz.ratio(row['PT_lower'], row['ST_lower']),
    axis=1,
)
df_train['FZ_PT_2'] = df_train.apply(
    lambda row: fuzz.partial_ratio(row['PT_lower'], row['ST_lower']),
    axis=1,
)
df_train['FZ_PT_3'] = df_train.apply(
    lambda row: fuzz.token_sort_ratio(row['PT_lower'], row['ST_lower']),
    axis=1,
)
df_train['FZ_PT_4'] = df_train.apply(
    lambda row: fuzz.token_set_ratio(row['PT_lower'], row['ST_lower']),
    axis=1,
)

NameError: ignored

In [0]:
# Convert 'Atrr_text_all' to strings so the fuzzy scorers always receive text.
# Missing values are replaced by an empty string first; a plain astype('str')
# would turn NaN into the literal text 'nan', which would then be matched
# against the search terms.
df_train['Atrr_text_all'] = df_train['Atrr_text_all'].fillna('').astype('str')

In [0]:
# fuzzy scores between the attribute text and the lower-cased search term
# 1: ratio, 2: partial_ratio, 3: token_sort_ratio, 4: token_set_ratio
df_train['FZ_Attr_1'] = df_train.apply(
    lambda row: fuzz.ratio(row['Atrr_text_all'], row['ST_lower']),
    axis=1,
)
df_train['FZ_Attr_2'] = df_train.apply(
    lambda row: fuzz.partial_ratio(row['Atrr_text_all'], row['ST_lower']),
    axis=1,
)
df_train['FZ_Attr_3'] = df_train.apply(
    lambda row: fuzz.token_sort_ratio(row['Atrr_text_all'], row['ST_lower']),
    axis=1,
)
df_train['FZ_Attr_4'] = df_train.apply(
    lambda row: fuzz.token_set_ratio(row['Atrr_text_all'], row['ST_lower']),
    axis=1,
)

In [0]:
# List the columns available after adding the fuzzy-matching features.
df_train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'PT_lower', 'PT_tokens', 'PT_tokens_sw', 'PT_text', 'PT_stem',
       'PT_numerics', 'PT_Non_numerics', 'ST_lower', 'ST_tokens',
       'ST_tokens_sw', 'ST_text', 'ST_stem', 'ST_numerics', 'ST_Non_numerics',
       'PD_lower', 'PD_stem', 'PD_numerics', 'Keywords_Descr', 'Atrr_text_all',
       'Atrr_stem', 'FZ_PT_1', 'FZ_PT_2', 'FZ_PT_3', 'FZ_PT_4', 'FZ_Attr_1',
       'FZ_Attr_2', 'FZ_Attr_3', 'FZ_Attr_4'],
      dtype='object')

In [0]:
# keep the product key and the eight fuzzy-matching scores
df_fuzzy = df_train.loc[:, [
    'product_uid',
    'FZ_PT_1',
    'FZ_PT_2',
    'FZ_PT_3',
    'FZ_PT_4',
    'FZ_Attr_1',
    'FZ_Attr_2',
    'FZ_Attr_3',
    'FZ_Attr_4',
]]

In [0]:
# Preview the fuzzy-matching features.
df_fuzzy.head()

Unnamed: 0,product_uid,FZ_PT_1,FZ_PT_2,FZ_PT_3,FZ_PT_4,FZ_Attr_1,FZ_Attr_2,FZ_Attr_3,FZ_Attr_4
0,100001,22,56,39,56,4,8,4,5
1,100001,10,36,10,10,2,11,3,4
2,100002,18,89,19,19,2,67,1,1
3,100005,26,62,26,55,4,62,4,81
4,100005,38,100,38,100,6,67,6,100


In [0]:
# save the 3rd dataframe with features
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
df_fuzzy.to_pickle(r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\trainsets\df_fuzzy.pkl')