In [1]:
from rapidfuzz.process import extractOne
from rapidfuzz import fuzz
import pandas as pd

In [2]:
df_wow = pd.read_csv('2. Modified Data/Modified Woolworths.csv').fillna('')
df_coles = pd.read_csv('2. Modified Data/Modified Coles.csv').fillna('')
df_iga = pd.read_csv('2. Modified Data/Modified IGA.csv')

## Func/Testing

In [3]:
def find_fuzzymatch(item, choices):
     return extractOne(item, choices, scorer=fuzz.token_sort_ratio, score_cutoff=70)

In [4]:
# Testing and seeing what the function outputs

# for item in df_iga['Brand_Product_Size']:
#     results_tuple = find_fuzzymatch(item, df_coles['Brand_Product_Size'])
#     try:
#         print(results_tuple[0], results_tuple[1])
#     except:
#         print('None')

In [5]:
def get_fuzzymatches(target_df, comparison_df):
    
    target_df['FuzzyProduct Match'] = target_df['Brand_Product_Size'].apply(lambda x: find_fuzzymatch(x, comparison_df['Brand_Product_Size']))
   
    return target_df

### Performace Comparison - Original FuzzyWuzzy (Python implementation) vs RapidFuzz (Fully implemented in C++)

#### Timing RapidFuzz - IGA vs Coles

In [6]:
# %%timeit -r1 -n1
# get_fuzzymatches(df_iga, df_coles)

#### Timing FuzzyWuzzy - IGA vs Coles

In [7]:
# from fuzzywuzzy import process
# from fuzzywuzzy import fuzz as fz

In [8]:
# def slow_find_fuzzymatch(item, choices):
#     return process.extractOne(item, choices, scorer=fz.token_sort_ratio, score_cutoff=70)

In [9]:
# def slow_get_fuzzymatches(target_df, comparison_df):
    
#     target_df['FuzzyProduct Match'] = target_df['Brand_Product_Size'].apply(lambda x: slow_find_fuzzymatch(x, comparison_df['Brand_Product_Size']))
   
#     return target_df

In [10]:
# %%timeit -r1 -n1
# slow_get_fuzzymatches(df_iga, df_coles)

### Results
* For the exact same operation: 
    * RapidFuzz took 47 secs 
    * FuzzyWuzzy took 455 secs
* The above results show that RapidFuzz is about 10x faster than the original FuzzyWuzzy package coded in pure Python.
* It has all the functions FuzzyWuzzy has and the syntax is almost exactly the same.

## Fuzzy Matching Coles and Woolworths

In [11]:
fuzzy_coles_wow = get_fuzzymatches(df_coles, df_wow).copy()

In [12]:
fuzzy_coles_wow['Product Match'] = fuzzy_coles_wow['FuzzyProduct Match'].apply(lambda x: x[0] if x is not None else '')
fuzzy_coles_wow['Match Score'] = fuzzy_coles_wow['FuzzyProduct Match'].apply(lambda x: int(x[1]) if x is not None else '')

In [13]:
fuzzy_coles_wow.head()

Unnamed: 0,SKU,Brand_Product_Size,Brand,Product Name,COL Price,COL ppu,COL Size,COL Specials,COL Category,Product URL,FuzzyProduct Match,Product Match,Match Score
0,3980255P,Head & Shoulders Conditioner Smooth & Silky 660mL,Head & Shoulders,Conditioner Smooth & Silky,17.0,$2.58 per 100mL,660mL,,Health & Beauty,https://shop.coles.com.au/a/national/product/s...,(Head & Shoulders Smooth & Silky Conditioner 6...,Head & Shoulders Smooth & Silky Conditioner 660mL,100.0
1,3838775P,Coles Jujube Prepack 400g,Coles,Jujube Prepack,9.9,$24.75 per 1Kg,400g,,Fruit & Veg,https://shop.coles.com.au/a/national/product/f...,,,
2,6046740P,Fantastic Crispy Bacon Noodle Cup 70g,Fantastic,Crispy Bacon Noodle Cup,1.6,$2.29 per 100G,70g,,Pantry,https://shop.coles.com.au/a/national/product/f...,"(Fantastic Crispy Bacon Noodle Cup 70g, 100.0,...",Fantastic Crispy Bacon Noodle Cup 70g,100.0
3,403765P,McCain Frozen Ham & Pineapple Family Pizza 500g,McCain,Frozen Ham & Pineapple Family Pizza,5.5,$1.10 per 100G,500g,Specials,Frozen,https://shop.coles.com.au/a/national/product/m...,"(Mccain Pizza Ham & Pineapple 500g, 81.5789473...",Mccain Pizza Ham & Pineapple 500g,81.0
4,3274024P,Perfect Italiano Grated Cheese Perfect Bakes 250g,Perfect Italiano,Grated Cheese Perfect Bakes,4.25,$17.00 per 1Kg,250g,Specials,Dairy Eggs & Fridge,https://shop.coles.com.au/a/national/product/p...,(Perfect Italiano Perfect Bakes 3 Cheeses 450g...,Perfect Italiano Perfect Bakes 3 Cheeses 450g,87.0


In [14]:
coles_wow_fuzzymatched = fuzzy_coles_wow[[
        'SKU', 'Brand', 'Product Name', 'Brand_Product_Size', 'Product Match', 'Match Score', 'COL Price', 
        'COL ppu', 'COL Size', 'COL Specials', 'COL Category', 'Product URL'
    ]]
coles_wow_fuzzymatched = coles_wow_fuzzymatched.rename(columns={'Product Match': 'WOW Product Match'})

In [15]:
coles_wow_fuzzymatched.head()

Unnamed: 0,SKU,Brand,Product Name,Brand_Product_Size,WOW Product Match,Match Score,COL Price,COL ppu,COL Size,COL Specials,COL Category,Product URL
0,3980255P,Head & Shoulders,Conditioner Smooth & Silky,Head & Shoulders Conditioner Smooth & Silky 660mL,Head & Shoulders Smooth & Silky Conditioner 660mL,100.0,17.0,$2.58 per 100mL,660mL,,Health & Beauty,https://shop.coles.com.au/a/national/product/s...
1,3838775P,Coles,Jujube Prepack,Coles Jujube Prepack 400g,,,9.9,$24.75 per 1Kg,400g,,Fruit & Veg,https://shop.coles.com.au/a/national/product/f...
2,6046740P,Fantastic,Crispy Bacon Noodle Cup,Fantastic Crispy Bacon Noodle Cup 70g,Fantastic Crispy Bacon Noodle Cup 70g,100.0,1.6,$2.29 per 100G,70g,,Pantry,https://shop.coles.com.au/a/national/product/f...
3,403765P,McCain,Frozen Ham & Pineapple Family Pizza,McCain Frozen Ham & Pineapple Family Pizza 500g,Mccain Pizza Ham & Pineapple 500g,81.0,5.5,$1.10 per 100G,500g,Specials,Frozen,https://shop.coles.com.au/a/national/product/m...
4,3274024P,Perfect Italiano,Grated Cheese Perfect Bakes,Perfect Italiano Grated Cheese Perfect Bakes 250g,Perfect Italiano Perfect Bakes 3 Cheeses 450g,87.0,4.25,$17.00 per 1Kg,250g,Specials,Dairy Eggs & Fridge,https://shop.coles.com.au/a/national/product/p...


In [16]:
coles_wow_fuzzymatched.to_csv('3. Fuzzy Matching/Token Sort/Coles_Woolworths.csv', index=False, encoding='utf-8-sig')

## Woolworth-Coles (reverse of above)

In [17]:
fuzzy_wow_coles = get_fuzzymatches(df_wow, df_coles).copy()

In [18]:
fuzzy_wow_coles['Product Match'] = fuzzy_wow_coles['FuzzyProduct Match'].apply(lambda x: x[0] if x is not None else '')
fuzzy_wow_coles['Match Score'] = fuzzy_wow_coles['FuzzyProduct Match'].apply(lambda x: int(x[1]) if x is not None else '')

In [19]:
fuzzy_wow_coles.head()

Unnamed: 0,SKU,Brand_Product_Size,Brand,Product Name,WOW Price,WOW Size,WOW ppu,WOW Specials,WOW Category,Online Only,New Product,Product URL,FuzzyProduct Match,Product Match,Match Score
0,814139,Em Wholefoods Hemp Oil Cold Pressed 250mL,em wholefoods,Em Wholefoods Hemp Oil Cold Pressed,19.95,250mL,$7.98 / 100ML,,Pantry,,,https://www.woolworths.com.au/shop/productdeta...,"(Soulseed Cold Pressed Hemp Oil 250mL, 83.1168...",Soulseed Cold Pressed Hemp Oil 250mL,83.0
1,84972,Happy Little Camper Newborn Natural Nappies 36...,happy little camper,Happy Little Camper Newborn Natural Nappies,14.0,36 pack,$0.39 / 1EA,,Baby,,,https://www.woolworths.com.au/shop/productdeta...,,,
2,95412,Spc Spaghetti Rich Tomato 140g x12 pack,spc,Spc Spaghetti Rich Tomato,10.5,140g x12 pack,$0.63 / 100G,,Pantry,,,https://www.woolworths.com.au/shop/productdeta...,"(SPC Spaghetti in Rich Tomato 4 pack 220g, 83....",SPC Spaghetti in Rich Tomato 4 pack 220g,83.0
3,99040,Mint Glazed Antipasto Dish Blue each,mint,Mint Glazed Antipasto Dish Blue,2.0,each,,,NOT LISTED,,,https://www.woolworths.com.au/shop/productdeta...,,,
4,808305,Seedlip Grove 42 Alcohol Free 700mL,seedlip,Seedlip Grove 42 Alcohol Free,50.0,700mL,$71.43 / 1L,,Drinks,,,https://www.woolworths.com.au/shop/productdeta...,,,


In [20]:
wow_coles_fuzzymatched = fuzzy_wow_coles[[
        'SKU', 'Brand', 'Product Name', 'Brand_Product_Size', 'Product Match', 'Match Score', 'WOW Price', 
        'WOW ppu', 'WOW Size', 'WOW Specials', 'WOW Category', 'Product URL'
    ]]
wow_coles_fuzzymatched = wow_coles_fuzzymatched.rename(columns={'Product Match': 'COL Product Match'})

In [21]:
wow_coles_fuzzymatched.head()

Unnamed: 0,SKU,Brand,Product Name,Brand_Product_Size,COL Product Match,Match Score,WOW Price,WOW ppu,WOW Size,WOW Specials,WOW Category,Product URL
0,814139,em wholefoods,Em Wholefoods Hemp Oil Cold Pressed,Em Wholefoods Hemp Oil Cold Pressed 250mL,Soulseed Cold Pressed Hemp Oil 250mL,83.0,19.95,$7.98 / 100ML,250mL,,Pantry,https://www.woolworths.com.au/shop/productdeta...
1,84972,happy little camper,Happy Little Camper Newborn Natural Nappies,Happy Little Camper Newborn Natural Nappies 36...,,,14.0,$0.39 / 1EA,36 pack,,Baby,https://www.woolworths.com.au/shop/productdeta...
2,95412,spc,Spc Spaghetti Rich Tomato,Spc Spaghetti Rich Tomato 140g x12 pack,SPC Spaghetti in Rich Tomato 4 pack 220g,83.0,10.5,$0.63 / 100G,140g x12 pack,,Pantry,https://www.woolworths.com.au/shop/productdeta...
3,99040,mint,Mint Glazed Antipasto Dish Blue,Mint Glazed Antipasto Dish Blue each,,,2.0,,each,,NOT LISTED,https://www.woolworths.com.au/shop/productdeta...
4,808305,seedlip,Seedlip Grove 42 Alcohol Free,Seedlip Grove 42 Alcohol Free 700mL,,,50.0,$71.43 / 1L,700mL,,Drinks,https://www.woolworths.com.au/shop/productdeta...


In [22]:
wow_coles_fuzzymatched.to_csv('3. Fuzzy Matching/Token Sort/Woolworths_Coles.csv', index=False, encoding='utf-8-sig')

## Fuzzy Matching IGA and Coles

In [23]:
fuzzy_iga_coles = get_fuzzymatches(df_iga, df_coles).copy()

In [24]:
fuzzy_iga_coles['Product Match'] = fuzzy_iga_coles['FuzzyProduct Match'].apply(lambda x: x[0] if x is not None else '')
fuzzy_iga_coles['Match Score'] = fuzzy_iga_coles['FuzzyProduct Match'].apply(lambda x: int(x[1]) if x is not None else '')

In [25]:
fuzzy_iga_coles.head()

Unnamed: 0,SKU,Brand_Product_Size,IGA Price,IGA ppu,IGA Category,Product URL,FuzzyProduct Match,Product Match,Match Score
0,9300675009775,Diet Coke Soft Drink 600ml,4.1,2.46 per litre,Drinks,https://igashop.com.au/product/diet-coke-soft-...,"(Coca Cola Soft Drink Diet Coke Can 250mL, 72....",Coca Cola Soft Drink Diet Coke Can 250mL,72.0
1,4155,Granny Smith Apple,0.99,5.50 per kg,Fruit & Veg,https://igashop.com.au/product/granny-smith-ap...,,,
2,4156,Pink Lady Apple,1.2,6.00 per kg,Fruit & Veg,https://igashop.com.au/product/pink-lady-apple/,,,
3,9310023141460,Helga’s Wraps Traditional White 8 Pack 560g,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-wraps-tr...,"(Helga's Traditional White Wraps 8 pack 560g, ...",Helga's Traditional White Wraps 8 pack 560g,100.0
4,9310023141446,Helga’s Mixed Grain Wraps 8 Pack 560g,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-mixed-gr...,"(Helga's Mixed Grain Wraps 8 pack 560g, 100.0,...",Helga's Mixed Grain Wraps 8 pack 560g,100.0


In [26]:
iga_coles_fuzzymatched = fuzzy_iga_coles[['SKU', 'Brand_Product_Size', 'Product Match', 'Match Score', 'IGA Price', 'IGA ppu', 'IGA Category', 'Product URL']]
iga_coles_fuzzymatched = iga_coles_fuzzymatched.rename(columns={'Product Match': 'COL Product Match'})

In [27]:
iga_coles_fuzzymatched.head()

Unnamed: 0,SKU,Brand_Product_Size,COL Product Match,Match Score,IGA Price,IGA ppu,IGA Category,Product URL
0,9300675009775,Diet Coke Soft Drink 600ml,Coca Cola Soft Drink Diet Coke Can 250mL,72.0,4.1,2.46 per litre,Drinks,https://igashop.com.au/product/diet-coke-soft-...
1,4155,Granny Smith Apple,,,0.99,5.50 per kg,Fruit & Veg,https://igashop.com.au/product/granny-smith-ap...
2,4156,Pink Lady Apple,,,1.2,6.00 per kg,Fruit & Veg,https://igashop.com.au/product/pink-lady-apple/
3,9310023141460,Helga’s Wraps Traditional White 8 Pack 560g,Helga's Traditional White Wraps 8 pack 560g,100.0,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-wraps-tr...
4,9310023141446,Helga’s Mixed Grain Wraps 8 Pack 560g,Helga's Mixed Grain Wraps 8 pack 560g,100.0,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-mixed-gr...


In [28]:
iga_coles_fuzzymatched.to_csv('3. Fuzzy Matching/Token Sort/IGA_Coles.csv', index=False, encoding='utf-8-sig')

## Fuzzy Matching IGA and Woolworths

In [29]:
fuzzy_iga_wow = get_fuzzymatches(df_iga, df_wow).copy()

In [30]:
fuzzy_iga_wow['Product Match'] = fuzzy_iga_wow['FuzzyProduct Match'].apply(lambda x: x[0] if x is not None else '')
fuzzy_iga_wow['Match Score'] = fuzzy_iga_wow['FuzzyProduct Match'].apply(lambda x: int(x[1]) if x is not None else '')

In [31]:
fuzzy_iga_wow.head()

Unnamed: 0,SKU,Brand_Product_Size,IGA Price,IGA ppu,IGA Category,Product URL,FuzzyProduct Match,Product Match,Match Score
0,9300675009775,Diet Coke Soft Drink 600ml,4.1,2.46 per litre,Drinks,https://igashop.com.au/product/diet-coke-soft-...,"(Coca-cola Diet Soft Drink Bottle 600mL, 75.0,...",Coca-cola Diet Soft Drink Bottle 600mL,75
1,4155,Granny Smith Apple,0.99,5.50 per kg,Fruit & Veg,https://igashop.com.au/product/granny-smith-ap...,"(Fresh Granny Smith Apples each, 75.0, 13612)",Fresh Granny Smith Apples each,75
2,4156,Pink Lady Apple,1.2,6.00 per kg,Fruit & Veg,https://igashop.com.au/product/pink-lady-apple/,"(Fresh Pink Lady Apples each, 71.4285714285714...",Fresh Pink Lady Apples each,71
3,9310023141460,Helga’s Wraps Traditional White 8 Pack 560g,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-wraps-tr...,"(Helga's Wraps Traditional White 560g 8 pack, ...",Helga's Wraps Traditional White 560g 8 pack,100
4,9310023141446,Helga’s Mixed Grain Wraps 8 Pack 560g,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-mixed-gr...,"(Helga's Wraps Mixed Grain 560g 8 pack, 100.0,...",Helga's Wraps Mixed Grain 560g 8 pack,100


In [32]:
iga_wow_fuzzymatched = fuzzy_iga_wow[['SKU', 'Brand_Product_Size', 'Product Match', 'Match Score', 'IGA Price', 'IGA ppu', 'IGA Category', 'Product URL']]
iga_wow_fuzzymatched = iga_wow_fuzzymatched.rename(columns={'Product Match': 'WOW Product Match'})

In [33]:
iga_wow_fuzzymatched.head()

Unnamed: 0,SKU,Brand_Product_Size,WOW Product Match,Match Score,IGA Price,IGA ppu,IGA Category,Product URL
0,9300675009775,Diet Coke Soft Drink 600ml,Coca-cola Diet Soft Drink Bottle 600mL,75,4.1,2.46 per litre,Drinks,https://igashop.com.au/product/diet-coke-soft-...
1,4155,Granny Smith Apple,Fresh Granny Smith Apples each,75,0.99,5.50 per kg,Fruit & Veg,https://igashop.com.au/product/granny-smith-ap...
2,4156,Pink Lady Apple,Fresh Pink Lady Apples each,71,1.2,6.00 per kg,Fruit & Veg,https://igashop.com.au/product/pink-lady-apple/
3,9310023141460,Helga’s Wraps Traditional White 8 Pack 560g,Helga's Wraps Traditional White 560g 8 pack,100,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-wraps-tr...
4,9310023141446,Helga’s Mixed Grain Wraps 8 Pack 560g,Helga's Wraps Mixed Grain 560g 8 pack,100,5.55,0.99 per 100g,Bakery,https://igashop.com.au/product/helgas-mixed-gr...


In [34]:
iga_wow_fuzzymatched.to_csv('3. Fuzzy Matching/Token Sort/IGA_Woolworths.csv', index=False, encoding='utf-8-sig')

* Token Sort with a score of 100 is accurate in matching the same product (It's just matching the 'tokens' i.e. the words, so order is irrelevant)
* Token Set has the ability to match things like the following:
    * Huggies Infant Size 2 Unisex 96 Nappies Jumbo Pack 1 pack (Coles) vs Huggies Jumbo Infant Nappies Nappies 96 pack (Woolworths)