# Matching Algorithm - Matching Through One Column

##### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re #regular expressions 

##### Read In Data

                                                      M&W Reduced

In [2]:
pd.set_option('display.max_rows', None)
redFood = pd.read_csv(r"C:\Users\medekar\Desktop\Product_Weight_Project\Data\Processed\Reduced Super Group\Cleaned\Reduced_SecondClean.csv", index_col=0)
redFood.rename(columns={'Super Group':'SuperGroup'})

Unnamed: 0,Food Code,Food Name,Group,SuperGroup,Sale format(s)
0,17-208,"Beer, bitter, best, premium",QA,Alcoholic beverages,"can, can multipack, bottle, bottle multipack"
1,17-224,"Cider, sweet",QC,Alcoholic beverages,"can, can multipack, bottle, bottle multipack"
2,17-234,Port,QF,Alcoholic beverages,bottle
3,17-236,"Sherry, medium",QF,Alcoholic beverages,bottle
4,17-247,"Spirits, 40% volume",QK,Alcoholic beverages,"bottle, miniature"
5,17-239,"Vermouth, dry",QG,Alcoholic beverages,bottle
6,17-752,"Wine, red",QE,Alcoholic beverages,"bottle, small bottle, box"
7,17-756,"Wine, white, medium",QE,Alcoholic beverages,"bottle, small bottle, box"
8,14-272,"Apple juice concentrate, unsweetened, commerical",PE,Beverages,"carton, tetrapak, bottle"
9,17-632,"Coffee, cappuccino, latte",P,Beverages,


                                                       Retail Data

In [3]:
CleanRetai = pd.read_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Data\Processed\FoodPortionSized\FPS_VJ.csv')

In [4]:
CleanRetail = CleanRetai[CleanRetai['PurEqualCon'].isin(['Y', 'Y, P'])]
CleanRetail

Unnamed: 0,Group,Brand,Food Name,Portion Consumed,Weight,PurEqualCon,Purchased,Format,Source
0,BABY FOODS,Boots,"First Harvest,Dessert starter",,125g,Y,,,
1,BABY FOODS,Boots,"First Harvest, Infant desserts",,125g,Y,,,
2,BABY FOODS,Boots,"First Harvest,Infant savouries",,125g,Y,,,
3,BABY FOODS,Boots,"First Harvest,Junior desserts",,190g,Y,,,
4,BABY FOODS,Boots,"First Harvest,Junior savouries",,190g,Y,,,
5,BABY FOODS,Boots,"First Harvest,Savoury starters",,125g,Y,,,
6,BABY FOODS,Cow and Gate,Ready to drink baby juice (125m1s),,130g,Y,,,
7,BABY FOODS,Cow and Gate,Ready to drink baby juice,Stage 1 jars,150g,Y,,,
8,BABY FOODS,Cow and Gate,Ready to drink baby juice,Stage 2 jars,200g,Y,,,
10,BABY FOODS,Baby Danone,Fromage frais,,60g,Y,,,


###                                                        Match Data.

###### Once matched, Import Weight and Unit Columns into Reduced M&W table

# 1

                                                   Using Fuzzy Matching

In [7]:
%%time
from fuzzywuzzy import fuzz

def match_food_group(CleanRetail, redFood):
    """
    Matches food groups between two DataFrames using fuzzy matching and updates 
    redFood database with Weight and Unit from CleanRetail dataframe.

    Args:
        CleanRetail (DataFrame): The DataFrame containing the clean retail data.
        redFood (DataFrame): The DataFrame containing the red food data.

    Returns:
        DataFrame: The red food DataFrame with updated weight, unit, and super group values.
    """
    # Set a threshold for fuzzy matching.
    threshold = 65

    # Create a set to store the SKU descriptions that have already been matched.
    matched_skus = set()

    # Add Weight and Unit columns to the redFood DataFrame.
    redFood['Weight'] = ''
    redFood['PurEqualCon'] = ''
    
#    CleanRetail['SKUDesc'] = CleanRetail['SKUDesc'].str.lower()

    # Loop through the rows in the redFood DataFrame.
    for i, row in redFood.iterrows():
        # Get the food name from the current row.
        product = row['Food Name']

        # Check if the food name has already been matched.
        if product in matched_skus:
            continue

        # Check similarity between the food name and SKU descriptions using fuzzy matching.
        matches = CleanRetail['Food Name'].apply(lambda x: fuzz.token_sort_ratio(x, product) >= threshold)

        # Replace NaN values in the boolean Series with False.
        matches = matches.fillna(False)

        # If any match is found, update the 'Weight', 'Unit', and 'Super Group' values in the redFood DataFrame.
        if matches.any():
            match_indices = matches[matches].index
            match_index = match_indices[0]
            food = CleanRetail.loc[match_index, 'Weight']
            unit = CleanRetail.loc[match_index, 'PurEqualCon']
            matched_skus.add(product)

            
            redFood.loc[i, 'Weight'] = food
            redFood.loc[i, 'PurEqualCon'] = unit
            
            # If the weight is 0 and there is no unit, set the unit to '-'.
           
#             if redFood.loc[i, 'Weight'] == 0 and pd.isnull(redFood.loc[i, 'Unit']):
#                 redFood.loc[i, 'Unit'] = '-'
#         else:
#             redFood.loc[i, 'Weight'] = ' '
#             redFood.loc[i, 'Unit'] = ' '


    return redFood

redFood1 = match_food_group(CleanRetail.copy(), redFood.copy())

CPU times: total: 4.59 s
Wall time: 4.59 s


# 2

                                    Using Term Frequency-Inverse Document Frequency
                                                                        
Its a feature extraction technique used in natural language processing and information retrieval. The main purpose of TfidfVectorizer is to transform a collection of text documents into a numerical feature matrix that can be used as input for machine learning algorithms.

Starts with:

Tokenization: The input text is first split into individual words or tokens.

Term Frequency (TF): For each token, the term frequency is calculated, which represents the number of times a token appears in a specific document.

Inverse Document Frequency (IDF): The inverse document frequency measures how important a word is in the entire collection of documents. If a word appears frequently in many documents, its IDF score will be low. But if a word is rare and only appears in a few documents, its IDF score will be high.

TF-IDF Weighting: The TF-IDF score is calculated for each token in each document by multiplying the term frequency (TF) with the inverse document frequency (IDF)(how unique it is among all the documents).

The result is that words that appear often in a document but are rare overall get a high TF-IDF score.

In [22]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def match_food_group(CleanRetail, redFood):
    """
    Matches food groups between two DataFrames using TF-IDF and cosine similarity and updates 
    redFood database with Weight and Unit from CleanRetail dataframe.

    Args:
        CleanRetail (DataFrame): The DataFrame containing the clean retail data.
        redFood (DataFrame): The DataFrame containing the red food data.

    Returns:
        DataFrame: The red food DataFrame with updated weight and unit values.
    """
    # Set a threshold for cosine similarity.
    threshold = 0.7

    # Add Weight and Unit columns to the redFood DataFrame.
    redFood['Weight'] = ''
    redFood['PurEqualCon'] = ''
    redFood['Portion Consumed'] = ''

    # Preprocess text data in both DataFrames for the "Food Name" field.
    CleanRetail['Food Name'] = CleanRetail['Food Name'].str.lower()
    redFood['Food Name'] = redFood['Food Name'].str.lower()

    # Create a TF-IDF vectorizer for the "Food Name" field only in the redFood DataFrame.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(CleanRetail['Food Name'])

    # Transform the "Food Name" field into TF-IDF vectors for both DataFrames.
    tfidf_clean_retail = vectorizer.transform(CleanRetail['Food Name'])
    tfidf_red_food = vectorizer.transform(redFood['Food Name'])

    # Loop through the rows in the redFood DataFrame.
    for i, row in redFood.iterrows():
        # Get the food name from the current row.
        product = row['Food Name']

        # Calculate cosine similarity between the food name and all CleanRetail food names.
        similarities = cosine_similarity(tfidf_red_food[i], tfidf_clean_retail)

        # Find the index of the most similar food name based on cosine similarity.
        match_index = similarities.argmax()

        # If the similarity exceeds the threshold, update the 'Weight' and 'PurEqualCon' values in the redFood DataFrame.
        if similarities[0, match_index] >= threshold:
            food = CleanRetail.loc[CleanRetail.index[match_index], 'Weight']
            unit = CleanRetail.loc[CleanRetail.index[match_index], 'PurEqualCon']
            consumed = CleanRetail.loc[CleanRetail.index[match_index], 'Portion Consumed']

            redFood.at[i, 'Weight'] = food
            redFood.at[i, 'PurEqualCon'] = unit
            redFood.at[i, 'Portion Consumed'] = consumed

    return redFood

redFood2 = match_food_group(CleanRetail.copy(), redFood.copy())

CPU times: total: 719 ms
Wall time: 724 ms


# 3

                                               Using Fuzzy Matching



Here, I use fuzz.token_set_ratio from the fuzzywuzzy library to calculate a similarity score between each food name in redFood and all food names in CleanRetail. The token_set_ratio function considers the set of unique tokens (words) in the strings and calculates the similarity based on common token matches. The similarity score ranges from 0 to 100, where higher values indicate more similarity.


By adjusting the threshold value, you can control the matching sensitivity. Lower threshold values will allow for more leniency in matching, potentially catching more variations, while higher threshold values will require closer matches.

In [14]:
%%time
from fuzzywuzzy import fuzz

def match_food_group(CleanRetai, redFood):
    """
    Matches food groups between two DataFrames using approximate string matching and updates 
    redFood database with Weight and Unit from CleanRetail dataframe.

    Args:
        CleanRetail (DataFrame): The DataFrame containing the clean retail data.
        redFood (DataFrame): The DataFrame containing the red food data.

    Returns:
        DataFrame: The red food DataFrame with updated weight and unit values.
    """
    # Set a threshold for string matching similarity.
    threshold = 70  # You can experiment with different threshold values (0 to 100).

    # Add Weight and Unit columns to the redFood DataFrame.
    redFood['Weight'] = ''
    redFood['PurEqualCon'] = ''
    redFood['Portion Consumed'] = ''

    # Preprocess text data in both DataFrames for the "Food Name" field.
    CleanRetai['Food Name'] = CleanRetai['Food Name'].str.lower()
    redFood['Food Name'] = redFood['Food Name'].str.lower()

    # Loop through the rows in the redFood DataFrame.
    for i, row in redFood.iterrows():
        # Get the food name from the current row.
        product = row['Food Name']

        # Find the most similar food name in CleanRetail using fuzzy string matching.
        best_match = None
        best_score = 0

        for _, retail_row in CleanRetai.iterrows():
            retail_product = retail_row['Food Name']
            similarity_score = fuzz.token_set_ratio(product, retail_product)

            if similarity_score > best_score:
                best_score = similarity_score
                best_match = retail_row

        # If the similarity exceeds the threshold, update the 'Weight' and 'PurEqualCon' values in the redFood DataFrame.
        if best_score >= threshold:
            food = best_match['Weight']
            unit = best_match['PurEqualCon']
            consumed = best_match['Portion Consumed']

            redFood.at[i, 'Weight'] = food
            redFood.at[i, 'PurEqualCon'] = unit
            redFood.at[i, 'Portion Consumed'] = consumed

    return redFood

redFood3 = match_food_group(CleanRetai.copy(), redFood.copy())

CPU times: total: 1min 25s
Wall time: 1min 28s


                              Creating Source column so I can Identify where weights are from

In [9]:
redFood['Source'] = ' '

###### Identifying all columns with weights and assigning data source 

In [10]:
pd.set_option('display.max_rows', None)
WeiRdf = redFood.loc[redFood['Weight'] != '']
WeiRdf['Source'] = 'FPS'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WeiRdf['Source'] = 'FPS'


###### Updating Red M&W List with Source indications

In [11]:
redFood.update(WeiRdf)

In [12]:
redFood

Unnamed: 0,Food Code,Food Name,Group,Super Group,Sale format(s),Weight,PurEqualCon,Portion Consumed,Source
0,17-208,"beer, bitter, best, premium",QA,Alcoholic beverages,"can, can multipack, bottle, bottle multipack",,,,
1,17-224,"cider, sweet",QC,Alcoholic beverages,"can, can multipack, bottle, bottle multipack",,,,
2,17-234,port,QF,Alcoholic beverages,bottle,,,,
3,17-236,"sherry, medium",QF,Alcoholic beverages,bottle,,,,
4,17-247,"spirits, 40% volume",QK,Alcoholic beverages,"bottle, miniature",29g,Y,1 miniature,FPS
5,17-239,"vermouth, dry",QG,Alcoholic beverages,bottle,,,,
6,17-752,"wine, red",QE,Alcoholic beverages,"bottle, small bottle, box",200g,Y,I small bottle,FPS
7,17-756,"wine, white, medium",QE,Alcoholic beverages,"bottle, small bottle, box",200g,Y,I small bottle,FPS
8,14-272,"apple juice concentrate, unsweetened, commerical",PE,Beverages,"carton, tetrapak, bottle",,,,
9,17-632,"coffee, cappuccino, latte",P,Beverages,,170g,Y,1 average vending machine cup,FPS


In [13]:
a = redFood.loc[redFood['Weight'] != '']
a['Food Name'].value_counts().sum()

94

Exporting Updated dataframe

In [10]:
#redFood.to_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Data\Processed\ReducedwithWeights\RedM&Weight.csv')

                                        Importing Retail Data With Weights
                                        
                                        
To testing algorithm again I'm using an Update list of retail matched data to match M&W dataframe and append all unmatched products to M&W dataframe. 

In [13]:
RetailD = pd.read_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Confidential\RetailSuperG.csv', index_col=0)
RetailD

In [14]:
RetailD.shape

## Algo to Import products from Sainsbury's df to M&W df

In [19]:
%%time
from fuzzywuzzy import fuzz

def match_food_retail(redFood, RetailD):
    """
    Matches food groups between two DataFrames using fuzzy matching and updates 
    redFood database with Weight and Unit from CleanRetail DataFrame.

    Args:
        CleanRetail (DataFrame): The DataFrame containing the clean retail data.
        redFood (DataFrame): The DataFrame containing the red food data.

    Returns:
        DataFrame: The red food DataFrame with updated items in RetailID that didn't match.
    """
    # Set a higher threshold for fuzzy matching.
    min_token_sort_ratio = 86

    # Create a list to store the SKU descriptions that have already been matched.
    matched_skus = []

    # Check similarity between the food names and SKU descriptions using fuzzy matching.
    for i, row in redFood.iterrows():
        product = redFood.loc[i, 'Food Name']
        matches = RetailD["SKUDesc"].apply(lambda x: fuzz.token_sort_ratio(x, product) >= min_token_sort_ratio)
        matched_skus.extend(matches[matches].index)

    # Filter the unmatched products based on the matched SKUs.
    unmatched_products_df = redFood[~redFood.index.isin(matched_skus)].copy()

    # Add unmatched retail data rows to unmatched_products_df with attached information.
    unmatched_retail_rows = RetailD[~RetailD.index.isin(matched_skus)].copy()
    unmatched_retail_rows["Unmatched Food Name"] = unmatched_retail_rows["SKUDesc"]
    unmatched_products_df = unmatched_products_df.append(unmatched_retail_rows)

    return unmatched_products_df


    unmatched_products_df = match_food_retail(redFood.copy(), RetailD.copy())

##### Checking to see products added to M&W. Did some cleaning in Excel at this point. 

In [21]:
unmatched_products_df

##### Read in refined M&W-Sainsbury Retail df

In [29]:
RetailUM = pd.read_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Confidential\RetdJoin.csv')

In [35]:
RetailUM.shape

##### Correcting Sentence Case

In [39]:
RetailUM['Food Name'] = RetailUM['Food Name'].str.capitalize()
RetailUM

##### Updating Data Source for all M&W Data

In [51]:
MnW = RetailUM[RetailUM['Source Data'].isna()]
MM = MnW[~MnW['Food Code'].isna()]
MM['Source Data'] = 'M&W Data'

In [52]:
RetailUM.update(MM)

In [53]:
RetailUM

                                                   Exprot Data

In [54]:
RetailUM.to_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Confidential\RetdJoin.csv')

                                                  NEXT NOTEBOOK