# McCane & Widdowson Product Reduction

Reading in Librarys 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re #regular expressions 
from fuzzywuzzy import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os 
import glob

Reduced Food List

                                                     Decisions Made

- Focuses on raw ingredients like fruits, vegetables, meats, dairy products, grains, etc. These make up a large part of grocery store sales.

- Includes commonly purchased prepared/packaged foods like bread, cereal, cheese, canned goods, jars, frozen foods, snacks, sweets, spreads, oils, spices, etc. These are staples people buy at supermarkets.

- Contains different variations of the same base ingredient (e.g. yogurt plain/fruit/Greek, milk whole/semi-skimmed/skimmed). Retailers sell multiple varieties.

- Excludes less common items or items typically made at home rather than store-bought.

- Limits the variations of each item to 2-3 options. Retail shelves cannot fit every possible variety.

- Covers foods from all the major categories - produce, dairy, meats, seafood, pantry items, frozen foods, snacks, etc.

                         List of items most likely sold in food retial store based on M&W food items

In [2]:
food_list = [
  ['Ackee, canned', 'Can'],
  ['Agar, dried', 'Bag'],
  ['Almonds, whole', 'Bag'],
  ['Amaranth leaves, raw', 'Bagged produce'],
  ['Anchovies, canned', 'Can'],
  ['Apple juice, ambient', 'Carton'],
  ['Apples, cooking, raw', 'Loose produce'],
  ['Apples, eating, raw', 'Loose produce'],
  ['Apricots, raw', 'Loose produce'],
  ['Arrowroot', 'Box'],
  ['Artichoke, raw', 'Loose produce'], 
  ['Asparagus, raw', 'Bunched produce'],
  ['Aubergine, raw', 'Loose produce'],
  ['Avocado, raw', 'Loose produce'],
  ['Bacon, back rashers, raw', 'Fresh meat counter'],
  ['Bagels, plain', 'Fresh bakery'],
  ['Baked beans, canned', 'Can'],
  ['Baking powder', 'Box'],
  ['Bananas, raw', 'Loose produce'],
  ['Barley, pearl, raw', 'Bag'],
  ['Basil, dried', 'Bag/jar'],
  ['Beans, butter, canned', 'Can'],
  ['Beef, braising steak, raw', 'Fresh meat counter'],
  ['Beef, fillet steak, raw', 'Fresh meat counter'],
  ['Beetroot, raw', 'Loose produce'],
  ['Biscuits, digestives', 'Pack'],
  ['Biscuits, shortbread', 'Pack'],
  ['Blackberries, raw', 'Punnet'],
  ['Blackcurrants, raw', 'Punnet'],
  ['Bread, naan', 'Fresh bakery'],
  ['Bread, pitta', 'Bag'],
  ['Bread, soda', 'Loaf'],
  ['Bread, tortilla, wheat', 'Pack'],
  ['Bread, white', 'Loaf'],
  ['Bread, wholemeal', 'Loaf'],  
  ['Broccoli, raw', 'Loose produce'],
  ['Brussels sprouts, raw', 'Loose produce'],
  ['Bubble and squeak, homemade', 'Prepared food counter'],
  ['Bulgur wheat, raw', 'Bag'],
  ['Butter','Block'],
  ['Butter, spreadable','Tub'],
  ['Cabbage, raw', 'Loose produce'],
  ['Cake, madeira', 'Fresh bakery'],
  ['Cake, fruit', 'Fresh bakery'], 
  ['Cake, chocolate fudge', 'Fresh bakery'],
  ['Canned fish, tuna', 'Can'],
  ['Canned fish, sardines', 'Can'],
  ['Canned meat, corned beef', 'Can'],
  ['Canned vegetables, sweetcorn', 'Can'],
  ['Canned vegetables, carrots', 'Can'],
  ['Canned vegetables, peas', 'Can'],
  ['Carrots, raw', 'Loose produce'],
  ['Cauliflower, raw', 'Loose produce'],
  ['Celery, raw', 'Loose produce'],
  ['Cereal, cornflakes', 'Box'],
  ['Cereal, muesli', 'Bag'],
  ['Cereal, porridge oats', 'Bag'],
  ['Cheddar cheese', 'Wedge'],
  ['Cheese, brie', 'Wedge'],
  ['Cheese, feta', 'Pack'],
  ['Cheese, mozzarella', 'Pack'],
  ['Cheese, parmesan', 'Wedge'],
  ['Cherries, raw', 'Punnet'],
  ['Chicken, roast', 'Fresh meat counter'],
  ['Chicken, stir-fry strips', 'Fresh meat counter'],
  ['Chickpeas, canned', 'Can'],
  ['Chocolate, milk', 'Bar'],
  ['Chocolate, dark', 'Bar'],
  ['Chocolate, white', 'Bar'],
  ['Coconut, desiccated', 'Bag'],
  ['Coconut milk', 'Can'],
  ['Coffee, instant', 'Jar'],
  ['Cola drink', 'Bottle/can'],
  ['Cookies, chocolate chip', 'Pack'],
  ['Cookies, shortbread', 'Pack'],
  ['Courgettes, raw', 'Loose produce'], 
  ['Couscous, plain', 'Box'],
  ['Crackers, wholemeal', 'Box'],
  ['Crisps, potato', 'Bag'],
  ['Cucumber, raw', 'Loose produce'],
  ['Custard, canned', 'Can'],
  ['Dates, ready to eat', 'Pack aged dried fruit'],
  ['Drinking chocolate powder', 'Tin'],
  ['Dried fruit, raisins', 'Bag'],
  ['Dried fruit, prunes', 'Bag'],
  ['Dried fruit, apricots', 'Bag'],
  ['Duck, roast', 'Fresh meat counter'],
  ['Edam cheese', 'Wedge'],
  ['Eggs', 'Carton'],
  ['Fajita kits', 'Box'],
  ['Falafel mix', 'Box'],
  ['Figs, ready to eat', 'Packaged dried fruit'],
  ['Fish fingers, cod', 'Frozen pack'],
  ['Flour, plain', 'Bag'],
  ['Flour, self-raising', 'Bag'],
  ['Fresh fish, salmon', 'Fresh seafood counter'],
  ['Fresh fish, cod', 'Fresh seafood counter'],
  ['Fresh fish, haddock', 'Fresh seafood counter'],
  ['Fresh meat, beef', 'Fresh meat counter'],
  ['Fresh meat, lamb', 'Fresh meat counter'],  
  ['Fresh meat, pork', 'Fresh meat counter'],
  ['Fresh meat, chicken', 'Fresh meat counter'],
  ['Fruit juices', 'Carton'],
  ['Fruit squash', 'Bottle'],
  ['Fruits, satsumas', 'Bagged produce'],
  ['Fruits, plums', 'Punnet'],
  ['Garam masala', 'Jar'],
  ['Gherkins, pickled', 'Jar'],
  ['Ginger, ground', 'Bag'],
  ['Grapes, seedless', 'Punnet'],
  ['Green beans, raw', 'Bagged produce'],
  ['Herbs, mixed dried', 'Jar'],
  ['Herbs, fresh basil', 'Pot'],
  ['Herbs, fresh parsley', 'Bunched produce'],
  ['Honey', 'Squeeze bottle'],
  ['Horseradish sauce', 'Bottle'],
  ['Hot chocolate powder', 'Tin'],
  ['Ice cream, vanilla', 'Tub'],
  ['Ice cream, strawberry', 'Tub'],
  ['Ice cream, chocolate', 'Tub'],
  ['Instant noodle snacks', 'Cup'],
  ['Jams', 'Jar'], 
  ['Ketchup', 'Bottle'],
  ['Kiwi fruit', 'Loose produce'], 
  ['Lager, canned', 'Can'],
  ['Lamb, leg, raw', 'Fresh meat counter'],
  ['Lamb, shoulder, raw', 'Fresh meat counter'],
  ['Leek, raw', 'Loose produce'],
  ['Lemons', 'Loose produce'],
  ['Lentils, dried', 'Bag'], 
  ['Lettuce, average', 'Loose produce'],
  ['Limes', 'Bagged produce'],
  ['Mackerel, canned', 'Can'],
  ['Mangetout, raw', 'Bagged produce'],
  ['Margarine','Tub'],
  ['Marmalade', 'Jar'],
  ['Marzipan', 'Block'],
  ['Mayonnaise', 'Bottle'],
  ['Milk, whole', 'Bottle'],
  ['Milk, semi-skimmed', 'Bottle'], 
  ['Milk, skimmed', 'Bottle'],
  ['Mince pies', 'Pack'],
  ['Mints, sweets', 'Pack'],
  ['Muesli, no added sugar', 'Bag'],
  ['Mushrooms, raw', 'Closed cup/pack'],
  ['Mussels, raw', 'Fresh seafood counter'],
  ['Mustard', 'Squeeze bottle'],
  ['Noodles, dried', 'Pack'],
  ['Nuts, peanuts', 'Bag'],
  ['Nuts, cashew', 'Bag'],
  ['Nuts, almond', 'Bag'],
  ['Oats, rolled', 'Bag'],
  ['Oils, vegetable', 'Bottle'],
  ['Oils, olive', 'Bottle'], 
  ['Olives, canned', 'Jar'],
  ['Onions, raw', 'Mesh bag'],
  ['Oranges', 'Bag'],
  ['Pasta, dried', 'Box'],
  ['Pasta sauce, jar', 'Jar'],
  ['Pastry, frozen', 'Pack'],
  ['Peaches, canned', 'Can'],
  ['Peanut butter', 'Jar'],
  ['Pears, canned', 'Can'],
  ['Peas, frozen', 'Frozen pack'],
  ['Peas, canned', 'Can'],
  ['Pepper, black', 'Grinder'],
  ['Peppers, raw', 'Loose produce'],
  ['Pickles', 'Jar'],
  ['Pies, meat', 'Fresh bakery'],
  ['Pies, fruit', 'Fresh bakery'],
  ['Pineapple, canned', 'Can'],
  ['Pineapple, fresh','Whole produce'],
  ['Pistachios', 'Bag'], 
  ['Pizza, refrigerated', 'Packaged fresh'],
  ['Plums, raw', 'Punnet'],
  ['Porridge oats', 'Bag'],
  ['Pork sausages', 'Fresh meat counter'],
  ['Pork pies', 'Fresh bakery'], 
  ['Potatoes, old, raw', 'Loose produce'],
  ['Potatoes, new, raw', 'Bagged produce'],
  ['Prawns, frozen', 'Frozen pack'],
  ['Prunes, ready to eat', 'Packaged dried fruit'],
  ['Pulses, dried', 'Bag'],
  ['Quinoa', 'Box'],
  ['Rice, basmati', 'Bag'],
  ['Rice, easy cook', 'Bag'],
  ['Rice, long grain', 'Bag'],
  ['Rocket leaves', 'Bagged produce'],
  ['Rye bread', 'Loaf'],
  ['Salad dressing', 'Bottle'],
  ['Salmon, smoked', 'Fresh seafood counter'],
  ['Sardines, canned', 'Can'],
  ['Sausages, pork', 'Fresh meat counter'],
  ['Sausages, vegetarian', 'Pack'], 
  ['Scotch broth, dried', 'Box'],
  ['Seeds, sunflower', 'Bag'],
  ['Seeds, pumpkin', 'Bag'],
  ['Self-raising flour', 'Bag'], 
  ['Semi-skimmed milk', 'Bottle'],
  ['Sesame seeds', 'Bag'],
  ['Smoked fish, mackerel', 'Pack'],
  ['Smoked meat, ham', 'Sliced deli counter'],
  ['Smoked salmon', 'Pack'],
  ['Soup, canned', 'Can'],
  ['Soya milk', 'Carton'], 
  ['Spaghetti, dried', 'Box'],
  ['Spices, garlic powder', 'Jar'],
  ['Spices, cinnamon', 'Jar'],
  ['Spinach, raw', 'Bagged produce'],
  ['Spring greens, raw', 'Loose produce'],
  ['Spring onions', 'Bunched produce'],
  ['Sprouts, alfalfa', 'Bagged produce'], 
  ['Squash, butternut', 'Whole produce'],
  ['Stews, canned', 'Can'],
  ['Stilton cheese', 'Wedge'],
  ['Strawberries', 'Punnet'],
  ['Sugar, white', 'Bag'],
  ['Sugar, brown', 'Bag'],
  ['Sultanas', 'Bag'], 
  ['Sunflower oil', 'Bottle'],
  ['Sunflower seeds', 'Bag'],
  ['Swede, raw', 'Loose produce'],
  ['Sweetcorn, canned', 'Can'],
  ['Sweetcorn, frozen', 'Frozen pack'],
  ['Sweets, liquorice allsorts', 'Bag'],
  ['Sweets, jelly babies', 'Bag'],
  ['Sweets, mints', 'Pack'],
  ['Swede, raw', 'Loose produce'],
  ['Tea bags', 'Box'],
  ['Tinned tomatoes', 'Can'],
  ['Tinned tuna', 'Can'],
  ['Tofu', 'Pack'],
  ['Tomatoes, raw', 'Loose produce'],
  ['Tomatoes, canned', 'Can'], 
  ['Tuna, canned', 'Can'],
  ['Turkey, whole', 'Fresh meat counter'],
  ['Turmeric', 'Jar'],
  ['Vegetables, frozen mix', 'Frozen pack'],
  ['Vegetable oil', 'Bottle'],
  ['Vegetarian sausages', 'Pack'],
  ['Vinegar, balsamic', 'Bottle'], 
  ['Walnuts', 'Bag'],
  ['Water, still', 'Bottle'],
  ['Wheat biscuits', 'Box'],
  ['Wheatgerm', 'Bag'],
  ['Whiting, raw', 'Fresh seafood counter'],
  ['Wine, red', 'Bottle'],
  ['Wine, white', 'Bottle'],
  ['Yeast, dried', 'Jar'],
  ['Yogurt, fruit', 'Pot'],
  ['Yogurt, plain', 'Pot'],
  ['Yogurt, Greek', 'Pot']
]

In [3]:
len(food_list)

252

In [4]:
food_list2 = [
  ['Apricots, dried', 'Packaged dried fruit'],
  ['Apricots, canned', 'Canned'],
  ['Baking fat, hard margarine', 'Tub'], 
  ['Beef, corned, canned', 'Canned'],
  ['Beef, minced', 'Fresh meat counter'],
  ['Beef, roast', 'Fresh meat counter'],
  ['Beer, lager', 'Bottled/canned'],
  ['Biscuits, chocolate coated', 'Packaged'],
  ['Biscuits, cream filled', 'Packaged'],
  ['Bread, baguettes', 'Fresh bakery'],
  ['Bread, wraps', 'Packaged'],
  ['Buns, burger', 'Fresh bakery'],
  ['Buns, hot dog', 'Fresh bakery'],
  ['Butter, salted', 'Tub'],
  ['Cake bars, snack', 'Packaged snacks'],
  ['Cake, madeira, iced', 'Fresh bakery'],
  ['Cake, sponge', 'Fresh bakery'],
  ['Cereal bars', 'Packaged snacks'],
  ['Cereal, shredded wheat', 'Box'],
  ['Cereal, crunchy nut cornflakes', 'Box'],
  ['Cheese, cheddar, reduced fat', 'Packaged'],
  ['Cheese, cream cheese', 'Tub'],
  ['Cheese, goat cheese', 'Packaged'],
  ['Cheese, halloumi', 'Packaged'],
  ['Cheese, ricotta', 'Tub'],
  ['Chicken, boneless breasts', 'Fresh meat counter'],
  ['Chicken, drumsticks', 'Fresh meat counter'],
  ['Chicken, thigh', 'Fresh meat counter'],
  ['Chicken, turkey, deli meat', 'Fresh meat counter'],
  ['Chickpeas, dried', 'Bag'],
  ['Chocolate bars', 'Packaged snacks'], 
  ['Chutney', 'Jar'],
  ['Coleslaw, prepackaged', 'Tub'],
  ['Cookies, oat', 'Packaged'],
  ['Cous cous, flavored, plain', 'Box'],
  ['Crackers, savory', 'Box'],
  ['Crisps, tortilla', 'Bag'],
  ['Croissants', 'Fresh bakery'],
  ['Curry paste', 'Jar'],
  ['Dips, hummus', 'Tub'],
  ['Dips, salsa', 'Jar'],
  ['Duck, boneless breasts', 'Fresh meat counter'],
  ['Eggs, free range', 'Carton'],
  ['Filo pastry', 'Pack'],
  ['Fish, batter', 'Frozen pack'],
  ['Fish, breaded', 'Frozen pack'],
  ['Fish, pollock', 'Fresh seafood counter'],
  ['Fish, prawns', 'Fresh seafood counter'],
  ['Fish, smoked haddock', 'Fresh seafood counter'],
  ['Fish, tuna, canned in oil', 'Can'],
  ['Fish, whitefish fillets', 'Fresh seafood counter'],
  ['Flour, whole wheat', 'Bag'],
  ['Fruit, canned', 'Can'], 
  ['Fruit, dried', 'Packaged dried fruit'],
  ['Fruit, prepared', 'Packaged produce'],
  ['Gelatine, powder', 'Box'],
  ['Gherkins', 'Jar'],
  ['Gravy mix', 'Jar'],
  ['Ham, sliced', 'Fresh meat counter'],
  ['Herbs, dried mixed', 'Jar'],
  ['Honey, clear', 'Squeeze bottle'], 
  ['Hummus', 'Tub'],
  ['Ice cream cones', 'Box'],
  ['Jelly, powder', 'Box'],
  ['Juices, fruit', 'Carton'],
  ['Ketchup, reduced sugar', 'Squeeze bottle'],
  ['Kitchen towels', 'Pack'],
  ['Lamb, diced for stew', 'Fresh meat counter'],
  ['Lamb, minced', 'Fresh meat counter'],
  ['Lemonade', 'Bottle'], 
  ['Lentils, green, dried', 'Bag'],
  ['Lentils, red, dried', 'Bag'], 
  ['Lettuce, bagged salad', 'Bag'],
  ['Lollipops', 'Packaged candy'],
  ['Meat, deli sliced', 'Fresh meat counter'],
  ['Milk, almond', 'Carton'],
  ['Milk, coconut', 'Carton'],
  ['Milk, lactose free', 'Carton'],
  ['Mincemeat', 'Jar'],
  ['Muesli, no added sugar', 'Bag'],
  ['Muesli, sweetened', 'Bag'],
  ['Mushrooms, chestnut, dried', 'Bag'],
  ['Mushrooms, oyster, dried', 'Bag'],
  ['Mushrooms, shiitake, dried', 'Bag'],
  ['Mustard, wholegrain', 'Jar'],
  ['Noodles, fresh', 'Pack'],
  ['Nuts, mixed', 'Bag'],
  ['Oil, canola', 'Bottle'],
  ['Oil, coconut', 'Bottle'],
  ['Oil, corn', 'Bottle'],
  ['Oil, sunflower', 'Bottle'],
  ['Olives, black', 'Jar'],
  ['Pasta, fresh filled', 'Pack'], 
  ['Pasta, gluten-free', 'Box'],
  ['Pasta, lasagna sheets', 'Box'],
  ['Pasta, wholewheat', 'Box'],
  ['Pastry, shortcrust', 'Pack'],
  ['Pastry, sweet shortcrust', 'Pack'],
  ['Peanut butter, crunchy', 'Jar'],
  ['Peanut butter, smooth', 'Jar'],
  ['Peas, frozen', 'Frozen pack'],
  ['Peppers, assorted', 'Fresh produce'],
  ['Pickles, pickled onions', 'Jar'],
  ['Pies, pork pies', 'Fresh bakery'],
  ['Pineapple, chunks, canned', 'Can'],
  ['Pineapple, crushed, canned', 'Can'],
  ['Pizza, fresh', 'Packaged fresh'],
  ['Pizza, frozen', 'Frozen pack'],
  ['Popcorn', 'Bag'],
  ['Pork, baby back ribs', 'Fresh meat counter'],
  ['Pork, chops', 'Fresh meat counter'],
  ['Pork, diced', 'Fresh meat counter'],
  ['Pork, leg steaks', 'Fresh meat counter'],
  ['Pork, loin chops', 'Fresh meat counter'],
  ['Pork, shoulder', 'Fresh meat counter'],
  ['Potatoes, baked', 'Fresh produce'],
  ['Potatoes, mashed', 'Packaged produce'],
  ['Prawns, cooked', 'Fresh seafood counter'],
  ['Prawns, raw', 'Fresh seafood counter'],
  ['Quiche', 'Fresh bakery'],
  ['Rice, arborio', 'Bag'],
  ['Rice, brown', 'Bag'],
  ['Rice, microwave pouches', 'Pouch'],
  ['Rice, white', 'Bag'], 
  ['Salmon, smoked, sliced', 'Fresh seafood counter'],
  ['Salsa', 'Jar'],
  ['Sardines, in oil or sauce', 'Can'],
  ['Sauces, pasta', 'Jar'],
  ['Sausages, beef', 'Fresh meat counter'],
  ['Sausages, turkey', 'Fresh meat counter'],
  ['Seasonings, mixed', 'Jar'],
  ['Seeds, chia', 'Bag'],
  ['Seeds, sesame', 'Bag'],
  ['Soup, cup of', 'Cup'],
  ['Soup, instant sachets', 'Pouch'],
  ['Soy sauce', 'Bottle'],
  ['Spices, allspice', 'Jar'],
  ['Spices, paprika', 'Jar'],
  ['Spreads, chocolate', 'Jar'],
  ['Spreads, honey', 'Squeeze bottle'], 
  ['Spreads, jam','Jar'],
  ['Spreads, yeast extract', 'Jar'], 
  ['Squash, butternut', 'Fresh produce'],
  ['Sugar, caster', 'Bag'],
  ['Sugar, icing', 'Bag'],
  ['Sugar, muscovado', 'Bag'],
  ['Sultanas, golden', 'Bag'],
  ['Sweetener, granulated', 'Pouch'],
  ['Sweetener, tablets', 'Box'],
  ['Sweets, boiled', 'Bag'],
  ['Sweets, candy', 'Bag'],
  ['Sweets, chewing gum', 'Pack'],
  ['Sweets, fudge', 'Packaged candy'],
  ['Sweets, gummies', 'Bag'],
  ['Sweets, lollipops', 'Packaged candy'],
  ['Tea, black', 'Box'],
  ['Tea, camomile', 'Box'],
  ['Tea, decaffeinated', 'Box'],
  ['Tea, fruit', 'Box'],
  ['Tea, green', 'Box'],
  ['Toilet paper', 'Pack'],
  ['Tomatoes, cherry', 'Packaged produce'],
  ['Tomatoes, plum', 'Packaged produce'],
  ['Tomatoes, sun dried', 'Packaged dried produce'],
  ['Tortillas, corn', 'Pack'],
  ['Tortillas, flour', 'Pack'],
  ['Tuna, canned in spring water', 'Can'],
  ['Vinegar, white wine', 'Bottle'],
  ['Vinegar, red wine', 'Bottle'],
  ['Water, carbonated', 'Bottle'],
  ['Water, flavored', 'Bottle'],
  ['Yeast, fresh', 'Pack'],
  ['Yogurt, frozen', 'Tub'],
  ['Yogurt, low fat varieties', 'Pot'],
  ['Yogurt, organic', 'Pot'],
  ['Yogurt, soy', 'Pot']  
]

In [5]:
len(food_list2)

176

In [6]:
176 + 252

428

                                            Combine into single dataframe

In [7]:
newlist = food_list + food_list2

In [8]:
reduce= pd.DataFrame(newlist)
reduce.rename(columns={0:'Food Name', 1:'Sales Format'}, inplace=True)
reduce

Unnamed: 0,Food Name,Sales Format
0,"Ackee, canned",Can
1,"Agar, dried",Bag
2,"Almonds, whole",Bag
3,"Amaranth leaves, raw",Bagged produce
4,"Anchovies, canned",Can
...,...,...
423,"Yeast, fresh",Pack
424,"Yogurt, frozen",Tub
425,"Yogurt, low fat varieties",Pot
426,"Yogurt, organic",Pot


                                           Matching my new list with M&W Database

Previously Reduced List

In [9]:
mFood = pd.read_csv(r"C:\Users\medekar\Desktop\Product_Weight_Project\Data\Raw Data\Labelling2021_watercress2.csv", index_col = 0)

In [10]:
mFood.drop(['Description','Previous', 'Main data references', 'Footnote', 'Energy (kJ)','Energy (kcal)', 
           'Fat (g)', 'Saturates (g)', 'Carbohydrate (g)','Sugars (g)', 'Starch (g)', 'Fibre (g)', 
           'Protein (g)', 'Salt (g)'], axis=1, inplace=True)

In [11]:
mFood#.head()

Unnamed: 0,Food Code,Food Name,Food sub-group codes,FoodGroup
0,13-145,"Ackee, canned, drained",DG,Vegetables
1,13-146,"Agar, dried",DG,Vegetables
2,13-147,"Agar, dried, soaked and drained",DG,Vegetables
3,13-148,"Alfalfa sprouts, raw",DG,Vegetables
4,13-801,"Allspice, ground",H,Herbs and spices
...,...,...,...,...
2882,12-515,"Yogurt, whole milk, twin pot, not fruit",BN,Milk and milk products
2883,12-531,"Yogurt, whole milk, twin pot, thick and creamy...",BN,Milk and milk products
2884,11-960,"Yorkshire pudding, made with semi-skimmed milk...",AT,Cereals and cereal products
2885,11-1135,"Yorkshire pudding, made with skimmed milk, hom...",AT,Cereals and cereal products


In [12]:
pd.set_option('display.max_rows', None)

                                                       Matching Algo

In [13]:
%%time
# Preprocess strings
def preprocess_string(s):
    s = s.lower()  # Convert to lowercase
    s = ' '.join(s.split())  # Remove extra spaces
    s = s.strip()  # Remove leading/trailing spaces
    return s

# Create a set to keep track of matched indices
matched_indices = set()

# Preprocess food names in both DataFrames
reduce['Food Name'] = reduce['Food Name'].apply(preprocess_string)
mFood['Food Name'] = mFood['Food Name'].apply(preprocess_string)

# Create new columns to store matches, similarity scores, and sales format
mFood['Matches'] = ""
mFood['Similarity Score'] = 0
mFood['Sales Format'] = ""

# Set a similarity threshold
similarity_threshold = 70  # Adjust as needed

# Loop through the food names and perform fuzzy matching
for index, row in reduce.iterrows():
    if index in matched_indices:
        continue  # Skip already matched rows
    food_name = row['Food Name']
    sales_format = row['Sales Format']

    best_similarity = 0
    best_match_index = None

    for mIndex, mRow in mFood.iterrows():
        similarity = fuzz.token_sort_ratio(food_name, mRow['Food Name'])
        if similarity > best_similarity and similarity >= similarity_threshold:
            best_similarity = similarity
            best_match_index = mIndex

    if best_match_index is not None:
        mFood.at[best_match_index, 'Matches'] += f"{food_name}, "
        mFood.at[best_match_index, 'Similarity Score'] = best_similarity
        mFood.at[best_match_index, 'Sales Format'] = sales_format

# Remove the trailing comma and whitespace from the "Matches" column
mFood['Matches'] = mFood['Matches'].str.rstrip(', ')

CPU times: total: 1min 38s
Wall time: 1min 38s


In [14]:
mFood.columns

Index(['Food Code', 'Food Name', 'Food sub-group codes', 'FoodGroup',
       'Matches', 'Similarity Score', 'Sales Format'],
      dtype='object')

In [15]:
fFood = mFood[mFood['Matches'] != '']
fFood['Food Name'].count()

215

## 215 items from New list matched previously reduced List

In [16]:
fFood.rename(columns={'FoodGroup':'Food Group'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fFood.rename(columns={'FoodGroup':'Food Group'}, inplace=True)


In [17]:
fFood.head()

Unnamed: 0,Food Code,Food Name,Food sub-group codes,Food Group,Matches,Similarity Score,Sales Format
0,13-145,"ackee, canned, drained",DG,Vegetables,"ackee, canned",75,Can
1,13-146,"agar, dried",DG,Vegetables,"agar, dried",100,Bag
3,13-148,"alfalfa sprouts, raw",DG,Vegetables,"sprouts, alfalfa",88,Bagged produce
8,14-896,"almonds, whole kernels",GA,Nuts and seeds,"almonds, whole",76,Bag
10,13-149,"amaranth leaves, raw",DG,Vegetables,"amaranth leaves, raw",100,Bagged produce


In [18]:
# Create a set of values from fFood['Matches'] column
fFood_matches = set(fFood['Matches'])

# Filter out rows from reduce where the Food Name is in fFood_matches
unmatched_rows = reduce[~reduce['Food Name'].isin(fFood_matches)]

In [19]:
unmatched_rows['Food Name'].count()

253

In [20]:
unmatched_rows#.head()

Unnamed: 0,Food Name,Sales Format
4,"anchovies, canned",Can
5,"apple juice, ambient",Carton
6,"apples, cooking, raw",Loose produce
8,"apricots, raw",Loose produce
13,"avocado, raw",Loose produce
16,"baked beans, canned",Can
18,"bananas, raw",Loose produce
27,"blackberries, raw",Punnet
42,"cake, madeira",Fresh bakery
45,"canned fish, tuna",Can


                              Match against origional Dataproduct list with Matched Newlist/M&W

In [21]:
fFood.drop(['Matches','Similarity Score'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fFood.drop(['Matches','Similarity Score'], axis=1, inplace=True)


M&W List

In [22]:
bFood = pd.read_csv(r"C:\Users\medekar\Desktop\Product_Weight_Project\Data\Processed\ReducedwithWeights\dataproduct2.csv", index_col = 0)

In [23]:
bFood.head()

Unnamed: 0,Food Code,Food Name,Food sub-group codes,Food Group,Sale format(s),Source,Similarity Score,Weight,PurEqualCon,Portion Consumed
0,17-208,beer bitter best premium,QA,Alcoholic beverages,"can, can multipack, bottle, bottle multipack",,0.601799,333g,Y,small can
1,17-224,cider sweet,QC,Alcoholic beverages,"can, can multipack, bottle, bottle multipack",,0.0,,,
2,17-234,port,QF,Alcoholic beverages,bottle,,0.0,,,
3,17-236,sherry medium,QF,Alcoholic beverages,bottle,,0.79207,200g,Y,medium
4,17-247,spirits 40% volume,QK,Alcoholic beverages,"bottle, miniature",,0.707107,29g,Y,1 miniature


In [24]:
bFood.drop(['Source', 'Similarity Score', 'Weight', 'PurEqualCon','Portion Consumed'], axis=1, inplace=True)

In [25]:
%%time
# Create a set to keep track of matched indices
matched_indices = set()

# Preprocess food names in both DataFrames
fFood['Food Name'] = fFood['Food Name'].apply(preprocess_string)
bFood['Food Name'] = bFood['Food Name'].apply(preprocess_string)

# Create new columns to store matches, similarity scores, and sales format
bFood['Matches'] = ""
bFood['Similarity Score1'] = 0
bFood['Sales Format'] = ""

# Set a similarity threshold
similarity_threshold = 60  # Adjust as needed

# Loop through the food names and perform fuzzy matching
for index, row in fFood.iterrows():
    if index in matched_indices:
        continue  # Skip already matched rows

    food_name = row['Food Name']
    sales_format = row['Sales Format']

    best_similarity = 0
    best_match_index = None

    for mIndex, mRow in bFood.iterrows():
        if mIndex in matched_indices:
            continue  # Skip already matched rows

        similarity = fuzz.token_sort_ratio(food_name, mRow['Food Name'])
        if similarity > best_similarity and similarity >= similarity_threshold:
            best_similarity = similarity
            best_match_index = mIndex

    if best_match_index is not None:
        matched_indices.add(index)
        matched_indices.add(best_match_index)

        bFood.at[best_match_index, 'Matches'] += f"{food_name}, "
        bFood.at[best_match_index, 'Similarity Score1'] = best_similarity
        bFood.at[best_match_index, 'Sales Format'] = sales_format

# Remove the trailing comma and whitespace from the "Matches" column
bFood['Matches'] = bFood['Matches'].str.rstrip(', ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 7.91 s
Wall time: 7.9 s


In [26]:
bFood['Food Name'].count()

537

In [27]:
fFood.columns

Index(['Food Code', 'Food Name', 'Food sub-group codes', 'Food Group',
       'Sales Format'],
      dtype='object')

In [28]:
redlist = bFood[bFood['Matches'] == '']

In [29]:
redlist.drop(['Matches','Similarity Score1','Sales Format'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  redlist.drop(['Matches','Similarity Score1','Sales Format'], axis=1, inplace=True)


In [30]:
redlist.rename(columns={'Sale format(s)':'Sales Format'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  redlist.rename(columns={'Sale format(s)':'Sales Format'}, inplace=True)


In [31]:
redlist.head()

Unnamed: 0,Food Code,Food Name,Food sub-group codes,Food Group,Sales Format
0,17-208,beer bitter best premium,QA,Alcoholic beverages,"can, can multipack, bottle, bottle multipack"
1,17-224,cider sweet,QC,Alcoholic beverages,"can, can multipack, bottle, bottle multipack"
2,17-234,port,QF,Alcoholic beverages,bottle
3,17-236,sherry medium,QF,Alcoholic beverages,bottle
4,17-247,spirits 40% volume,QK,Alcoholic beverages,"bottle, miniature"


In [32]:
# Create a set of values from fFood['Matches'] column
Food_matched = set(fFood['Food Name'])

# Filter out rows from reduce where the Food Name is in fFood_matches
redunmatched = redlist[~redlist['Food Name'].isin(Food_matched)]

In [33]:
redunmatched.count()

Food Code               315
Food Name               376
Food sub-group codes    376
Food Group              376
Sales Format            234
dtype: int64

In [34]:
gfood = pd.concat([fFood,redunmatched], ignore_index=True)

In [58]:
# Convert the 'Food Name' column to lowercase for case-insensitive comparison
gfood['Food Name'] = gfood['Food Name'].str.lower()

# Define a function to find similar food names using fuzzy matching
def find_similar_names(name, name_list, threshold=90):
    similar_names = process.extract(name, name_list, scorer=fuzz.token_sort_ratio)
    return [s[0] for s in similar_names if s[1] >= threshold]

# Create a list of unique normalized food names
unique_normalized_names = gfood['Food Name'].apply(normalize_food_name).unique()

# Identify similar names with extra words attached
similar_names = []
for name in unique_normalized_names:
    similar_names.extend(find_similar_names(name, unique_normalized_names))

similar_names = list(set(similar_names))  # Remove duplicates from the list

In [55]:
dp = gfood[~gfood['Food Name'].isin(similar_names)]

In [60]:
dp#['Food Group'].value_counts().sum()

Unnamed: 0,Food Code,Food Name,Food sub-group codes,Food Group,Sales Format,Normalized Food Name
0,13-145,"ackee, canned, drained",DG,Vegetables,Can,ackeecanneddrained
1,13-146,"agar, dried",DG,Vegetables,Bag,agardried
2,13-148,"alfalfa sprouts, raw",DG,Vegetables,Bagged produce,alfalfasproutsraw
3,14-896,"almonds, whole kernels",GA,Nuts and seeds,Bag,almondswholekernels
4,13-149,"amaranth leaves, raw",DG,Vegetables,Bagged produce,amaranthleavesraw
5,17-851,"apple sauce, homemade",WC,"Soups, sauces and miscellaneous foods",Prepared food counter,applesaucehomemade
6,14-016,"apples, eating, dried",FA,Fruit,Loose produce,appleseatingdried
7,14-031,"apricots, dried",FA,Fruit,Canned,apricotsdried
9,13-153,"artichoke, globe, raw",DG,Vegetables,Loose produce,artichokegloberaw
10,13-157,"asparagus, raw",DG,Vegetables,Bunched produce,asparagusraw


                                                            Export 

In [57]:
#dp.to_csv(r'C:\Users\medekar\Desktop\Product_Weight_Project\Data\Processed\MW_DataReduction\Reduced Total\Updated3_RedLab2021.csv')