# A String Matching Story

Author: Ezequiel Ortiz Recalde

### 1. Imports

In [2]:
import itertools
import numpy as np
import pandas as pd

from nltk.metrics.distance import edit_distance
from fuzzywuzzy import fuzz
from polyfuzz import PolyFuzz
from polyfuzz.models import TFIDF

### 2. The data

In [3]:
def data_generator(variant,flavour,product,quantity):
    """
    Build a synthetic list of external product names.

    Every combination of the four input lists is written in two word orders:
        "beverages <product> <variant> <flavour> <quantity>"
        "beverage <variant> <flavour> <product> <quantity>"
    args:
        variant, flavour, product, quantity= lists of strings to combine
    returns:
        dataframe with a single "product" column holding the distinct names;
        each row keeps the index label of the combination it was built from
    """
    # One row per element of the cartesian product of the four lists
    grid = pd.DataFrame(
        list(itertools.product(variant, flavour, product, quantity)),
        columns=["variant", "flavour", "product", "quantity"],
    )

    # The two spellings of every combination
    grid["product_option_1"] = (
        "beverages " + grid["product"] + " " + grid["variant"] + " " + grid["flavour"] + " " + grid["quantity"]
    )
    grid["product_option_2"] = (
        "beverage " + grid["variant"] + " " + grid["flavour"] + " " + grid["product"] + " " + grid["quantity"]
    )

    # Stack both spellings and keep only the first occurrence of each name
    names = pd.concat([grid["product_option_1"], grid["product_option_2"]]).drop_duplicates()
    return names.to_frame(name="product")

In [4]:
# Spellings found in the external source, typos and unit variants included.
# Repeated quantities are harmless: data_generator drops duplicated names.
external_spellings = {
    "variant": ["diet","dit","diiet",""],
    "flavour": ["vanilla","chocolate","acid"],
    "product": ["coke","cokke","spriite","sprite","prite","fanta","afnta","pepsi"],
    "quantity": ["500ml","1.5L","2.25L","354ml","500cc","354cc",
                 "500 ml","1.5 L","2.25 L","354 ml","500 cc","354 cc",
                 "500 ml.","1.5 LT.","2.25 L","354 ml","500 cc","354 cc",
                 "500 ml.","1.5 l.","2.25 l","354 ml.","500 cc.","354 cc",
                 "500 ml","1.5 lts","2.25 lts","354 ml","500 cc","354 cc",
                 "500 ml.","1.5 lts.","2.25 lts","354 ml","500 cc","354 cc",
                 "500 ml.","1.5 lt.","2.25 lt","354 ml.","500 cc.","354 cc"],
}
external_data = data_generator(**external_spellings)

In [5]:
# Beverages of internal source (the reference catalogue)
variant=["DIET"]
product=["COKE","SPRITE","FANTA","PEPSI"]
flavour=["VANILLA","CHOCOLATE","COCO","STRAWBERRY"]
quantity=["X 500ML","X 1.5L","X 2.25L","X 354ML"]

# Every variant/flavour/product/quantity combination, in that word order
permutations=itertools.product(variant,flavour,product,quantity)
combinations=list(permutations)

# A catalogue name is the four parts of a combination joined by single spaces
internal_data=pd.DataFrame({"product_name":[" ".join(parts) for parts in combinations]})

### 3. String preprocessing

In [6]:
def string_processing(df:pd.DataFrame,columns_to_clean:list) -> None:
    """
    Function to normalise, in place, the text columns of a dataframe.

    NOTE: despite its name, `columns_to_clean` lists the columns that are left
    untouched (typically the raw originals); every OTHER column of `df` is
    cleaned. The name is kept so that existing calls keep working.
    args:
        df= pandas dataframe whose remaining columns hold strings; it is
            modified in place and nothing is returned
        columns_to_clean= names of the columns to leave exactly as they are
    """
    for col in df.columns.difference(columns_to_clean):

        # Convert strings to lowercase
        text = df[col].str.lower()

        # Remove the "-" symbol
        text = text.str.replace(r"(\-)","",regex=True)

        # Remove words that don't add information
        text = text.str.replace(r"(beverages|beverage)","",regex=True)

        # Replace any run of consecutive blank spaces with 1 blank space
        # (a single pass handles runs of every length)
        text = text.str.replace(r" {2,}", " ",regex=True)

        # Remove starting blank space
        text = text.str.replace(r"(^ )", "",regex=True)

        # Remove blank space at the end
        text = text.str.replace(r"( $)", "",regex=True)

        # Standardize the spelling of litres and ml at the end of the string:
        # l / lt / lt. / l. / lts / lts.  ->  l   (glued to the number)
        text = text.str.replace(r"(\s?l$|\s?lt$|\s?lt\.$|\sl\.$|\s?lts$|\s?lts\.$)", "l",regex=True)
        # ml. / cc / cc.  ->  ml
        text = text.str.replace(r"((?<![\w])|(?<=\d))(ml\.?$|cc\.?$)", "ml",regex=True)
        # drop the blank between the number and the unit
        text = text.str.replace(r"(\s?ml$)", "ml",regex=True)
        text = text.str.replace(r"((?<=\d))(\s?l$)", "l",regex=True)

        df[col] = text

In [7]:
# Add a working copy of each name column; the copies are the ones that get
# cleaned in the next cell, while the raw columns stay as they are
external_data["product_cleaned_from"]=external_data["product"].copy()
internal_data["product_cleaned_to"]=internal_data["product_name"].copy()

In [8]:
# Clean the working copies. Careful: `columns_to_clean` names the columns that
# string_processing leaves untouched (it cleans every other column), so the raw
# "product" / "product_name" columns are preserved and the copies are cleaned
string_processing(df=external_data,columns_to_clean=["product"])
string_processing(df=internal_data,columns_to_clean=["product_name"])

### 4. String matching with Tf-Idf Vectorization

In [9]:
# We import scikit-learn's TfidfVectorizer and its cosine similarity function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Function to partition strings into n-grams (modified for the example)
def _create_ngrams(string: str):
        """ Create n_grams from a string
        Steps:
            * Extract character-level ngrams with `self.n_gram_range` (both ends inclusive)
            * Remove n-grams that have a whitespace in them
        """
        result = []
        for n in range(3, 4):
            ngrams = zip(*[string[i:] for i in range(n)])
            ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
            result.extend(ngrams)

        return result

In [11]:
# We define 2 lists to be matched, where the "to_list" can be seen as the
# reference (for example, a product catalogue)
from_list = ["coke 1.5L","sprite zero 1.5L"]
to_list = ["diet coke 1.5L"]

# We fit the vectorizer on the union of both lists so that they share one vocabulary
#   min_df=1: keep every n-gram that appears in at least one string
#   analyzer=_create_ngrams: the function used to split each string into n-grams
vectorizer = TfidfVectorizer(min_df=1, analyzer=_create_ngrams).fit(to_list + from_list)

# After fitting the vectorizer, we turn each list into a sparse matrix with
#   as many columns as unique n-grams
#   as many rows as strings in the list
tf_idf_to = vectorizer.transform(to_list)
tf_idf_from = vectorizer.transform(from_list)

In [12]:
# Check the vocabulary of the vectorizer: every unique 3-gram found in the two
# lists, mapped to the index of its column in the TF-IDF matrices
vectorizer.vocabulary_

{'die': 3,
 'iet': 5,
 'cok': 2,
 'oke': 7,
 '1.5': 1,
 '.5L': 0,
 'spr': 10,
 'pri': 8,
 'rit': 9,
 'ite': 6,
 'zer': 11,
 'ero': 4}

In [13]:
# Dense, labelled versions of both TF-IDF matrices so that they can be inspected.
# The alphabetically sorted n-grams are used as the column labels.
ngram_columns = sorted(vectorizer.vocabulary_)
matrix_from, matrix_to = (
    pd.DataFrame(sparse_matrix.todense(), columns=ngram_columns)
    for sparse_matrix in (tf_idf_from, tf_idf_to)
)

In [14]:
# TF-IDF weights of the strings in from_list: one row per string, one column per 3-gram
matrix_from

Unnamed: 0,.5L,1.5,cok,die,ero,iet,ite,oke,pri,rit,spr,zer
0,0.433708,0.433708,0.558478,0.0,0.0,0.0,0.0,0.558478,0.0,0.0,0.0,0.0
1,0.228215,0.228215,0.0,0.0,0.386401,0.0,0.386401,0.0,0.386401,0.386401,0.386401,0.386401


In [15]:
# TF-IDF weights of the single reference string in to_list
matrix_to

Unnamed: 0,.5L,1.5,cok,die,ero,iet,ite,oke,pri,rit,spr,zer
0,0.300832,0.300832,0.387376,0.509353,0.0,0.509353,0.0,0.387376,0.0,0.0,0.0,0.0


In [16]:
# We calculate the cosine similarity between the pairs of vectors:
# one row per string in from_list, one column per string in to_list
cosine_similarity(matrix_from, matrix_to)

array([[0.69362794],
       [0.13730861]])

In [17]:
# Alternatively, the plain dot product gives the same numbers without importing
# cosine_similarity, because every TF-IDF row above already has unit length.
reference_vector = np.array(matrix_to.iloc[0])
tuple(np.dot(np.array(matrix_from.iloc[row]), reference_vector) for row in (0, 1))

(0.6936279421797706, 0.1373086119752811)

### 5. Additional useful measures: Token set ratio and Levenshtein distance

In [18]:
# Two product names to compare
s1="diet coke 600ml"
s2="diet cooke diet 600ml"

# Preprocess each string with fuzzywuzzy's full_process and keep its unique tokens
tokens1, tokens2 = (set(fuzz.utils.full_process(text).split()) for text in (s1, s2))

# Tokens shared by both strings, and tokens found in only one of them
intersection = tokens1 & tokens2
diff1to2 = tokens1 - tokens2
diff2to1 = tokens2 - tokens1

# Each group of tokens, alphabetically ordered and joined into a single string
sorted_sect, sorted_1to2, sorted_2to1 = (
    " ".join(sorted(group)) for group in (intersection, diff1to2, diff2to1)
)

# Shared tokens followed by the remainder of each string; strip() removes the
# blank that is left over whenever one of the two parts is empty
combined_1to2 = (sorted_sect + " " + sorted_1to2).strip()
combined_2to1 = (sorted_sect + " " + sorted_2to1).strip()
sorted_sect = sorted_sect.strip()

In [19]:
# Unique tokens of the first string after preprocessing
tokens1

{'600ml', 'coke', 'diet'}

In [20]:
# fuzz.ratio, rescaled from 0-100 to 0-1, for the three comparisons:
# shared tokens vs. each combined string, and the combined strings vs. each other
string_pairs = [(sorted_sect, combined_1to2),
                (sorted_sect, combined_2to1),
                (combined_1to2, combined_2to1)]
pairwise = [fuzz.ratio(left, right)/100 for left, right in string_pairs]
pairwise

[0.8, 0.77, 0.97]

In [21]:
# The token set ratio is the highest of the three pairwise ratios
max(pairwise)

0.97

In [22]:
# That maximum comes from comparing the two combined strings with each other
combined_1to2, combined_2to1

('600ml diet coke', '600ml diet cooke')

In [23]:
# Bonus: Levenshtein (edit) distance between the same two strings,
# i.e. the number of single-character edits that separate them
edit_distance(combined_1to2,combined_2to1)

1

### 6. Building a baseline model

In [24]:
# Call the TFIDF vectorizer from PolyFuzz while specifying:
#   n_gram_range: the sizes of the n-grams in which the list items will be partitioned
#   clean_string: whether or not PolyFuzz cleans the strings before vectorizing them
# No minimum similarity is passed, so the library default is the one in use.
tfidf_vectorizer = TFIDF(n_gram_range=(3,3),clean_string=True)

# Define an instance of the PolyFuzz class using the tfidf_vectorizer model
model = PolyFuzz(tfidf_vectorizer)

# Names of the cleaned columns that take part in the matching (also the merge keys below)
FROM_COLUMN = "product_cleaned_from"
TO_COLUMN = "product_cleaned_to"

# Specify a list from which its items will be matched to a reference list
from_data = list(external_data[FROM_COLUMN])
to_data = list(internal_data[TO_COLUMN])

# Send the lists of elements for the matching
model.match(from_data, to_data)

# Obtain the matches, discarding rows with missing values.
# A new frame is created instead of altering the one handed back by the model.
matches = model.get_matches().dropna()

# Drop duplicates
data_matched = matches.drop_duplicates(["From","To"]).reset_index(drop=True).copy()

# Add some additional similarity measures, computed pair by pair
#   Token set ratio (rescaled to 0-1)
data_matched["Token_set_ratio"] = [
    fuzz.token_set_ratio(name_from, name_to)/100
    for name_from, name_to in zip(data_matched["From"], data_matched["To"])
]

#   Levenshtein distance
data_matched["Edit_distance"] = [
    edit_distance(name_from, name_to)
    for name_from, name_to in zip(data_matched["From"], data_matched["To"])
]

# Extract the unit of measure of the products: a number (decimals optional)
# followed by "ml" or "l" at the end of the name, e.g. "500ml", "2.25l"
QUANTITY_PATTERN = r"(\d+(?:\.\d+)?\s?m?l)$"
data_matched["Quantity_from"] = data_matched["From"].str.extract(QUANTITY_PATTERN, expand=False)
data_matched["Quantity_to"] = data_matched["To"].str.extract(QUANTITY_PATTERN, expand=False)

# Record the origin columns used for the matching process
data_matched["Property_from"] = FROM_COLUMN
data_matched["Property_to"] = TO_COLUMN

# Match the results with the input data to associate them with all their data
# First we merge the external_data
data_matched = pd.merge(data_matched,external_data,left_on="From",right_on=FROM_COLUMN)

# Then we use the previous join to merge it with the internal data
final_results = pd.merge(data_matched,internal_data,left_on="To",right_on=TO_COLUMN)

final_results=final_results[["product","product_name","From","To","Similarity",
                             "Token_set_ratio","Edit_distance","Quantity_from","Quantity_to"]]

In [25]:
# Fixed seed so that the same 20 rows are shown every time the notebook is run
final_results.sample(20, random_state=42)

Unnamed: 0,product,product_name,From,To,Similarity,Token_set_ratio,Edit_distance,Quantity_from,Quantity_to
570,beverage acid coke 2.25 lt,DIET VANILLA COKE X 2.25L,acid coke 2.25l,diet vanilla coke x 2.25l,0.601,0.8,12,2.25l,2.25l
4318,beverage dit chocolate pepsi 1.5 l.,DIET CHOCOLATE PEPSI X 1.5L,dit chocolate pepsi 1.5l,diet chocolate pepsi x 1.5l,0.901,0.94,3,1.5l,1.5l
2595,beverages pepsi dit acid 500cc,DIET VANILLA PEPSI X 500ML,pepsi dit acid 500ml,diet vanilla pepsi x 500ml,0.669,0.71,16,500ml,500ml
354,beverage dit acid cokke 1.5 L,DIET VANILLA COKE X 1.5L,dit acid cokke 1.5l,diet vanilla coke x 1.5l,0.222,0.6,9,1.5l,1.5l
3516,beverages prite diiet chocolate 1.5 lts,DIET CHOCOLATE SPRITE X 1.5L,prite diiet chocolate 1.5l,diet chocolate sprite x 1.5l,0.821,0.93,16,1.5l,1.5l
1783,beverage diiet vanilla spriite 354 cc,DIET VANILLA SPRITE X 354ML,diiet vanilla spriite 354ml,diet vanilla sprite x 354ml,0.739,0.93,4,354ml,354ml
477,beverages coke vanilla 2.25 L,DIET VANILLA COKE X 2.25L,coke vanilla 2.25l,diet vanilla coke x 2.25l,0.926,1.0,11,2.25l,2.25l
2795,beverages pepsi dit vanilla 2.25 L,DIET VANILLA PEPSI X 2.25L,pepsi dit vanilla 2.25l,diet vanilla pepsi x 2.25l,0.9,0.94,15,2.25l,2.25l
990,beverage vanilla spriite 500 ml,DIET VANILLA SPRITE X 500ML,vanilla spriite 500ml,diet vanilla sprite x 500ml,0.759,0.83,8,500ml,500ml
2014,beverage vanilla afnta 500ml,DIET VANILLA FANTA X 500ML,vanilla afnta 500ml,diet vanilla fanta x 500ml,0.658,0.81,9,500ml,500ml
