In [10]:
# !pip install langchain
# !pip install openai -U
# !pip install nervaluate
# !pip install faiss
# !pip install tiktoken
#!pip install pandas
# !pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m980.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.6.0 (from sentence-transformers)
  Downloading torch-2.1.1-cp39-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting torchvision (from sentence-transformers)
  Downloading torchvision-0.16.1-cp39-cp39-macosx_10_13_x86_64.whl.metadata (6.6 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downl

In [2]:
### THE OVERVIEW: 
# 1. Prompt in LLM.
# 2. Convert the receipt text structured data
# 3. Use labels and K-nearest neighbors (embeddings) to classify the vendor into vendor categories.
# 4. Classify the items into categories; propose your own .
# 5. Plot these analytics of vendors and items.



In [1]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import json
import pandas as pd
import numpy as np

In [4]:
prompt = """
Can you please convert this unstructured receipt text data into a structured JSON object which includes ReceiptInfo and ITEMS.
Note that sometimes the quantity for an item may be located next to the item name.
Text in the structured JSON should appear in the same format as the unstructured receipt.
Please just give the structured JSON object and nothing else.

Here is the schema for the JSON object:
{{
  "ReceiptInfo": {{
    "merchant": "(string value)",
    "address": "(string value)",
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)",
    "total": "(float value)",
    "receiptDate": "(string value)",
    "receiptTime": "(string value)",


    "ITEMS": [
      {{
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)"
      }}
    ]
  }}
}}


Examples:

Receipt:
Give us feedback @ <UNKNOWN>
Thank you! ID <UNKNOWN> <UNKNOWN>
Walmart
301-604-0180 Mgr:DEBRA
3549 RUSSETT GRN
LAUREL MD 20724
ST# 01985 OP# 009049 TE# 49 TR# 04345
<UNKNOWN> PRK SB 008265754333 F
1.98 X
BOOSTER
088439295372
19.98 N
VOIDED BANKCARD TRANSACTION
TERMINAL <UNKNOWN> SC010112
TRANSACTION NOT COMPLETE
07/04/23
16:19:53
<UNKNOWN> BANKCARD TRANSACTION
TERMINAL # SC010112
TRANSACTION <UNKNOWN> COMPLETE
07/04/23
16:20:01
SUBTOTAL
21.96
TAX 1
6.000 %
0.12
TOTAL
22.08
VISA TEND
22.08
ACCOUNT #
**** **** <UNKNOWN> 2130 F
APPROVAL # 00592D
REF # 318500762875
TRANS ID - 583185732192035
VALIDATION - B339
PAYMENT SERVICE - E
TERMINAL # SC010112
07/04/23
16:20:21
CHANGE DUE
0.00
# ITEMS SOLD 2
<UNKNOWN> 0069 6254 8220 4557 9869
Walmart
Become a member
Scan for free 30-day trial
Low Prices You Can <UNKNOWN> Every <UNKNOWN>
07/04/23
16:20:22
***CUSTOMER <UNKNOWN>

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "Walmart",
    "address": "3549 RUSSETT GRN",
    "city": "LAUREL",
    "state": "MD",
    "phoneNumber": "301-604-0180",
    "tax": "0.12",
    "total": "22.08",
    "receiptDate": "07/04/23",
    "receiptTime": "4:20pm",


    "ITEMS": [
      {{
        "description": "PRK SB",
        "quantity": "1",
        "unitPrice": "1.98",
        "totalPrice": "1.98",
        "discountAmount": "0.00"
      }},
      {{
        "description": "BOOSTER",
        "quantity": "1",
        "unitPrice": "19.98",
        "totalPrice": "19.98",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
Unstructured Receipt Text Data:
203
BUY ONE GET ONE FREE QUARTER POUNDER
W/CHEESE OR EGG MCMUFFIN
Go to www.mcdvoice.com within 7 days
and tell <UNKNOWN> about your visit.
Validation Code:
Expires 30 days after receipt date.
Valid at participating US
Survey Code:
<UNKNOWN>
McDonald's Restaurant <UNKNOWN>
2915 E MANCA RD
<UNKNOWN> 96822
TEL# <UNKNOWN>
Thank You Valued Customer
KS# 2
08/19/2023 03:33 PM
<UNKNOWN>
Order 03
1 Happy Meal Ch Burger
5.89
1 Cheeseburger
ONLY Onions
ONLY Ketchup
1 Extra Kids Fry
1 Apple Juice
1 NERF
Subtotal
5.89
Tax
0.28
Take-Out Total
6.17
Cashless
6.17
Change
0.00
MER# 467782
CARD ISSUER
ACCOUNT
Visa SALE
<UNKNOWN> <UNKNOWN> 130
TRANSACTION AMOUNT
6.17
CONTACTLESS
AUTHORIZATION CODE - <UNKNOWN>
SEQ# 107204
AID: A0000000031010
McDonald's Restaurant
Sign up for MyMcDonald's rewards
to earn points on future visit

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "McDonald's Restaurant",
    "address": "2915 E MANOA RD",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "",
    "tax": "0.28",
    "total": "6.17",
    "receiptDate": "08/19/2023",
    "receiptTime": "03:33 PM",


    "ITEMS": [
      {{
        "description": "1 Happy Meal Ch Burger",
        "quantity": "1",
        "unitPrice": "5.89",
        "totalPrice": "5.89",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
Unstructured Receipt Text Data:
H
<UNKNOWN>
MART
http://www.hmart.com
458 Keawe st
Honolulu, <UNKNOWN> 96813
TEL (808) 219-0924
Your Cashier was TIM
WANG DANGMYUN
4.99 B
K-T TOPPOKI
PC
6.99 B
OTG SOUP RICECAKE
7.99 B
HT VEGE DUMPLING
14.99 B
TAX
1.65
****
BALANCE
36.61
Discover Credit - C
ACCOUNT <UNKNOWN> ************ 1153
APPROVAL CODE: 01263R
SEQUENCE NUMBER: 22907
No CVM
Amount USD $36.61
CARD:Discover CREDIT XXXX1153 EMV
APPROVAL CODE : 01263R
AID A0000001523010
TVR : 0000008000
IAD <UNKNOWN>
TSI : E800
APPLICATION CRYPTOGRAM CB8F25799B68A2B
APPLICATION PREFERRED NAME Discover Cr
APPLICATION LABEL : Discover
TC CB8F25799B68A2B3
<UNKNOWN> 00
RespDate : 06112023
Resp Time 211509
<UNKNOWN>
TOTAL AMOUNT: $36.61
RESPONSE CODE: APPROVED
06/11/23 09:15pm 113 20
Discover
<UNKNOWN> 61
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
06/11/23 09:15pm 81 20 363 113

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "H MART",
    "address": "458 Keawe st",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "(808) 219-0924",
    "tax": "1.65",
    "total": "36.61",
    "receiptDate": "06/11/23",
    "receiptTime": "09:15pm",


    "ITEMS": [
      {{
        "description": "WANG DANGMYUN",
        "quantity": "1",
        "unitPrice": "4.99",
        "totalPrice": "4.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "K-T TOPPOKI PC",
        "quantity": "1",
        "unitPrice": "6.99",
        "totalPrice": "6.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "OTG SOUP RICECAKE",
        "quantity": "1",
        "unitPrice": "7.99",
        "totalPrice": "7.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "HT VEGE DUMPLING",
        "quantity": "1",
        "unitPrice": "14.99",
        "totalPrice": "14.99",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
SAFEWAY
O
<UNKNOWN>
Store 204 Dir Kaipo Ah Mook Sang
Main (808) 988-2058
2855 East Manoa Road
HONOLULU HI 96822
GROCERY
6.99 B
MARINARA SCE
Regular Price
8.49
Member Savings
1.50-
REFRIG/FROZEN
LUC CHES COLBY JAC.
10.99 B
Regular Price
13.49
Member Savings
2.50-
MARIE CALLENDER'S
4.00 B
Regular Price
6.49
Member Savings
2.49-
MEAT
AIDELLS MEATBALLS
5.99 B
Regular Price
8.99
Member Savings
3.00-
TAX
1.32
**** BALANCE
29.29
Credit Purchase 08/01/23 20:15
CARD # <UNKNOWN>
<UNKNOWN> 861544421300 AUTH: 0006375D
PAYMENT AMOUNT
29.29
AL VISA CREDIT
AID A0000000031010
TVR 0000000000
TSI 0000
Visa
29.29
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
08/01/23 20:15 204 51 151
8851
Gameplays Earned 2
Sweepstakes Entries Earned 2
*Subject to limit of 10 <UNKNOWN> day
Earn gameplays and sweepstakes entries
by shopping, then play
Flavor Adventure for a chance to win!
Scan the QR code to Play. Win. Save.
REWARDS AVAILABLE
2
POINTS EARNED TODAY
Base Points 27
Total 27
Points Towards Next Reward 57 of 100
YOUR CASHIER TODAY WAS SELF
YOUR SAVINGS
Member Savings
Total
9.49
Total Savings Value
9.49
25%
Thank <UNKNOWN> you for shopping <UNKNOWN>
For SAFEWAY FOR U questions
877-276-9637 or Safeway.com/foru call

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "SAFEWAY",
    "address": "2855 East Manoa Road",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "(808) 988-2058",
    "tax": "1.32",
    "total": "29.29",
    "receiptDate": "08/01/23",
    "receiptTime": "8:15pm",


    "ITEMS": [
      {{
        "description": "MARINARA SCE",
        "quantity": "1",
        "unitPrice": "8.49",
        "totalPrice": "6.99",
        "discountAmount": "1.50"
      }},
      {{
        "description": "LUC CHES COLBY JAC.",
        "quantity": "1",
        "unitPrice": "13.49",
        "totalPrice": "10.99",
        "discountAmount": "2.50"
      }},
      {{
        "description": "MARIE CALLENDER'S",
        "quantity": "1",
        "unitPrice": "6.49",
        "totalPrice": "4.00",
        "discountAmount": "2.49"
      }},
      {{
        "description": "AIDELLS MEATBALLS",
        "quantity": "1",
        "unitPrice": "8.99",
        "totalPrice": "5.99",
        "discountAmount": "3.00"
      }}
    ]
  }}
}}

Here is the Unstructured Receipt Text Data (Turn this into structured JSON):

Receipt: {receipt}
"""

In [5]:
import os
def get_receipts():
    """Gets unstructured receipt text from each receipt text file in the receipts directory."""
    folder_path = 'receipts/text'
    receipts = []
    
    # List all files in the folder
    files = os.listdir(folder_path)
    
    # Iterate through each file
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
    
        # Check if it's a text file
        if file_name.endswith('.txt'):
            with open(file_path, 'r') as file:
                # Read the content and append to the array
                text_content = file.read()
                receipts.append(text_content)
    
    return receipts

test_receipts = get_receipts()[0]

In [None]:
## Getting output from LLM.
# Uses OpenAI model.
model = ChatOpenAI()

# Creates a template for the prompt.
prompt_template = PromptTemplate.from_template(prompt)

# Inserts the prompt into model and gets output.
chain = LLMChain(llm=model, prompt=prompt_template)
receipt_outputs = [chain.run(receipt=test_receipt) for test_receipt in test_receipts]

In [None]:
## Named Entity Recognition:
def find_span(text, entity_text):
    """Finds the start and end positions for a given entity_text"""
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text

def convert_to_prodigy_spans(receipt_text, entities):
    """Structures the receipt data by finding the position spans of each field indicated in the Structured JSON Object."""
    text_vals = []
    entities = json.loads(entities)
    prodigy_data = []
    receipt_info = entities["ReceiptInfo"]

    # Gets the label and entity_text for each corresponding label and entity listed.
    for label, entity_text in [
        ("MERCHANT", str(receipt_info.get("merchant", ""))), 
        ("ADDRESS", str(receipt_info.get("address", ""))), 
        ("CITY", str(receipt_info.get("city", ""))), 
        ("STATE", str(receipt_info.get("state", ""))), 
        ("PHONE", str(receipt_info.get("phoneNumber", ""))), 
        ("TAX", str(receipt_info.get("tax", ""))), 
        ("TOTAL", str(receipt_info.get("total", ""))), 
        ("DATE", str(receipt_info.get("receiptDate", ""))) 
    ]:

        # Tries to add positions of enties for corresponding labels.
        try:
            span, text = find_span(receipt_text, entity_text)
            text_vals.append(text)

            if span:
                start, end = span
                prodigy_data.append({"start": start, "end": end, "label": label})
        except:
            pass

    # Process item-level entities
    for item in receipt_info["ITEMS"]:
        for label, entity_text in [ # add .get() to all fields? 
            ("ITEM_DESC", item["description"]),
            ("QTY", str(item["quantity"])),
            ("UNIT_PRICE", str(item["unitPrice"])),
            ("TOTAL_PRICE", str(item["totalPrice"])),
            ("DISCOUNT", str(item.get("discountAmount", "")))  # Discount might not always be present
        ]:
            if entity_text:  # Check if the entity text is not empty
                span = find_span(receipt_text, entity_text)
                if span:
                    start, end = span
                    prodigy_data.append({"start": start, "end": end, "label": label})

    return prodigy_data, text_vals

# Example usage

prodigy_spans_true, text_vals = convert_to_prodigy_spans(test_receipts[0], receipt_outputs[0])
#print(json.dumps(prodigy_spans_true, indent=2))

In [None]:
# Converts all receipt_outputs to JSON.
json_objects = [json.loads(receipt_output) for receipt_output in receipt_outputs]

# Gets the vendor name for each receipt JSON object.
vendors = [receipt_json['ReceiptInfo']['merchant'] for receipt_json in json_objects]
embeddings_model = OpenAIEmbeddings()

# Gets embeddings for the vendors.
vector_db = embeddings_model.embed_documents(vendors)
len(vector_db), len(vector_db[0])

In [43]:
from langchain.embeddings import HuggingFaceEmbeddings

categories = ["Grocery and Supermarkets", "Restaurants and Food Services", "Clothing and Apparel", 
              "Health and Beauty", "Electronics and Appliances", "Home and Garden", "Entertainment and Leisure"]

import os
def get_training_data():
    folder_path = 'training_data/'
    category_train_test_data = {}
    category_training_items = []
    
    # List all files in the folder
    files = os.listdir(folder_path)
    i = 0

    # Gets the items for each category.
    for category in categories:
        # Gets the file identifier for the category.
        category_id = (category.split(" ")[0]).lower()
        file_name = f'{category_id}_items.txt'
        file_path = os.path.join(folder_path, file_name)
    
        # Check if it's a text file.
        if file_name.endswith('.txt'):
            with open(file_path, 'r') as file:
                # Read the content and split into list by newline character.
                text_content = file.read().split('\n')
                # Adds the items to the category in the dict.
                category_train_test_data[categories[i]] = text_content
                i += 1
    return category_train_test_data

category_train_test_data = get_training_data()
    
#     # Iterate through each file
#     for file_name in files:
#         file_path = os.path.join(folder_path, file_name)
    
#         # Check if it's a text file
#         if file_name.endswith('.txt'):
#             with open(file_path, 'r') as file:
#                 # Read the content and append to the array
#                 text_content = file.read()
#                 category_training_items.append(str(text_content))
#         category_train_test_data[categories[i]] = category_training_items
#         category_training_items = []
#         i += 1
    
#     return category_train_test_data

# category_train_test_data = get_training_data()
# category_train_test_data

# category_train_test_data = {
# "Grocery and Supermarkets": ['Milk', 'Eggs', 'Bread', 'Chicken', 'Bananas', 'Tomatoes', 'Potatoes', 'Onions', 'Cheese', 'Pasta', 'Rice', 'Cereal', 'Yogurt', 'Apples', 'Oranges', 'Carrots', 'Lettuce', 'Broccoli', 'Ground beef', 'Toothpaste', 'Shampoo', 'Soap', 'Toilet paper', 'Paper towels', 'Laundry detergent', 'Dish soap', 'Coffee', 'Tea', 'Sugar', 'Flour', 'Cooking oil', 'Ketchup', 'Mustard', 'Mayonnaise', 'Salad dressing', 'Chips', 'Soda', 'Water', 'Juice', 'Ice cream', 'Frozen pizza', 'Frozen vegetables', 'Canned beans', 'Canned tomatoes', 'Peanut butter', 'Jelly', 'Bacon', 'Sausages', 'Ham', 'Turkey', 'Bottled water', 'Energy bars', 'Cookies', 'Crackers', 'Candles', 'Light bulbs', 'Trash bags', 'Aluminum foil', 'Plastic wrap', 'Ziplock bags', 'Tissues', 'Hand sanitizer', 'Dishwasher detergent', 'Alcohol', 'Wine', 'Beer', 'Cat food', 'Dog food', 'Pet litter', 'Napkins', 'Baby diapers', 'Baby wipes', 'Baby formula', 'Feminine hygiene products', 'Bandages', 'Pain relievers', 'Cough medicine', 'Vitamins', 'Cleaning supplies', 'Air freshener', 'Dental floss', 'Lawn bags', 'Batteries', 'Candles', 'Greeting cards', 'Magazines', 'Postage stamps', 'Gift wrap'],
# "Restaurants and Food Services": ["Grilled Chicken Salad", "Cheeseburger", "Vegetarian Pizza", "French Fries", "Soda", "Iced Tea", "Chocolate Cake", "Spaghetti Bolognese", "Caesar Salad", "Margarita Pizza",
# "Pasta Primavera", "Chicken Alfredo", "Garlic Bread", "Fish and Chips", "Mango Smoothie", "Cappuccino", "Tomato Soup", "Chicken Wings", "Shrimp Scampi", "Onion Rings",
# "Club Sandwich", "Coffee", "Brownie Sundae", "Vegetable Stir-Fry", "Lemonade", "Beef Tacos", "Pesto Pasta", "Hot Chocolate", "Mushroom Risotto", "Greek Salad", "Churros",
# "Chicken Parmesan", "Mojito", "Spinach Dip", "Pho", "Beef and Broccoli", "Tiramisu", "Hamburger", "Fried Chicken", "Green Tea", "Caprese Salad", "Quesadilla", "Nachos",
# "Banana Split", "Pad Thai", "Sushi Roll", "Chicken Quesadilla", "Pineapple Fried Rice", "Apple Pie", "Clam Chowder", "Bruschetta", "Calamari", "Pancakes", "Chicken Caesar Wrap",
# "Peach Iced Tea", "Avocado Toast", "Lobster Roll", "Eggplant Parmesan", "Tuna Salad Sandwich", "Raspberry Lemonade", "Crab Cakes", "Veggie Burger", "Tomato Basil Soup", "Omelette",
# "Cobb Salad", "Chicken Teriyaki", "Chocolate Mousse", "Pesto Chicken Sandwich", "Tofu Stir-Fry", "Caramel Macchiato", "Beef Burrito", "Coconut Shrimp", "Pumpkin Pie", "Buffalo Wings",
# "Strawberry Shortcake", "Quinoa Salad", "Shrimp Po' Boy", "Blueberry Pancakes", "Cajun Chicken Pasta", "Ginger Ale", "Pulled Pork Sandwich", "Fruit Salad", "Chicken Satay", "Key Lime Pie",
# "Beef Stroganoff", "Chocolate Chip Cookies", "Cranberry Juice", "Egg Fried Rice", "Baked Ziti", "Cucumber Salad", "Veggie Wrap", "Black Bean Soup", "Sweet Potato Fries", "Pesto Pizza"],
# "Clothing and Apparel": ['T-shirt', 'Jeans', 'Sweater', 'Dress', 'Jacket', 'Shorts', 'Skirt', 'Blouse', 'Shirt', 'Hoodie', 'Leggings', 'Cap', 'Socks', 'Underwear', 'Scarf', 'Gloves', 'Coat', 'Tank top', 'Polo shirt', 'Cardigan', 'Hat', 'Belt', 'Tie', 'Blazer', 'Pants', 'Jumpsuit', 'Vest', 'Kimono', 'Pajamas', 'Raincoat', 'Sunglasses', 'Handbag', 'Backpack', 'Wallet', 'Watch', 'Bracelet', 'Necklace', 'Earrings', 'Ring', 'Headband', 'Tights', 'Swimwear', 'Sandals', 'Boots', 'Sneakers', 'High heels', 'Loafers', 'Slippers', 'Umbrella', 'Tote bag', 'Gym bag', 'Laundry bag', 'Luggage', 'Iron', 'Hanger', 'Laundry detergent', 'Fabric softener', 'Lint roller', 'Sewing kit', 'Clothing tags', 'Thread', 'Needles', 'Buttons', 'Zipper', 'Measuring tape', 'Safety pins', 'Ironing board', 'Clothing rack', 'Lint brush', 'Shoe polish', 'Insoles', 'Shoehorn', 'Shoe rack', 'Hosiery', 'Fabric spray', 'Lint trap', 'Drawer organizers', 'Garment bag', 'Clothing brush', 'Stain remover', 'Lint shaver', 'Shoe inserts', 'Collar stays', 'Clothing steamer', 'Shoe care kit', 'Clothing repair patches', 'Sewing machine'],
# "Health and Beauty": ['Shampoo', 'Conditioner', 'Facial cleanser', 'Moisturizer', 'Sunscreen', 'Body wash', 'Hand soap', 'Deodorant', 'Toothpaste', 'Mouthwash', 'Floss', 'Hairbrush', 'Hair ties', 'Razors', 'Shaving cream', 'Body lotion', 'Face mask', 'Exfoliating scrub', 'Makeup remover', 'Cotton pads', 'Cotton swabs', 'Tissues', 'Nail polish', 'Nail polish remover', 'Nail file', 'Acne treatment', 'Anti-aging cream', 'Serum', 'Eye cream', 'Lip balm', 'Face wash', 'Cleansing wipes', 'Essential oils', 'Aromatherapy candles', 'Bath salts', 'Bubble bath', 'Massage oil', 'Hair serum', 'Hair mask', 'Hand cream', 'Foot cream', 'Foot scrub', 'Body scrub', 'Scented lotion', 'Perfume', 'Cologne', 'Hair gel', 'Hair mousse', 'Styling cream', 'Blow dryer', 'Flat iron', 'Curling iron', 'Makeup brushes', 'Foundation', 'Concealer', 'Powder', 'Blush', 'Eyeshadow', 'Eyeliner', 'Mascara', 'Lipstick', 'Lip gloss', 'Makeup setting spray', 'Makeup remover wipes', 'Facial toner', 'Tweezers', 'Eyebrow pencil', 'Hair color', 'Hair dye', 'Hair accessories', 'Face sunscreen', 'Body sunscreen', 'Insect repellent', 'Antibacterial wipes', 'Hand sanitizer', 'Lip scrub', 'Hair clips', 'Sleep mask', 'Earplugs', 'Blister pads', 'Foot powder', 'Dental night guard', 'Collagen supplement', 'Vitamin C serum', 'Aloe vera gel', 'Tea tree oil', 'Sulfate-free shampoo', 'Organic conditioner', 'Natural skincare set', 'Reusable makeup remover pads'],
# "Electronics and Appliances": ['Smart TV', 'Refrigerator', 'Laptop', 'Washing Machine', 'Air Conditioner', 'Bluetooth Speaker', 'Microwave Oven', 'Coffee Maker', 'Toaster', 'Vacuum Cleaner', 'Gaming Console', 'Smartwatch', 'Digital Camera', 'Headphones', 'Printer', 'Tablet', 'Blender', 'Hair Dryer', 'Electric Shaver', 'Rice Cooker', 'Iron', 'Food Processor', 'Smart Home Hub', 'Security Camera', 'Wireless Router', 'External Hard Drive', 'USB Flash Drive', 'Power Strip', 'HDMI Cable', 'USB-C Cable', 'Wireless Mouse', 'Mechanical Keyboard', 'Monitor', 'Soundbar', 'Fitness Tracker', 'Digital Scale', 'Robot Vacuum', 'Air Purifier', 'Portable Charger', 'Surge Protector', 'In-Ear Earphones', 'Gaming Mouse', 'Graphics Card', 'Motherboard', 'CPU', 'RAM', 'SSD', 'External SSD', 'Wireless Earbuds', 'Digital Voice Recorder', 'Projector', 'Smart Bulbs', 'Smart Thermostat', 'Coffee Grinder', 'Instant Pot', 'Deep Fryer', 'Juicer', 'Cordless Phone', 'Curling Iron', 'Flat Iron', 'Wireless Charger', 'Gaming Headset', 'Blue Light Glasses', 'Webcam', 'Smart Doorbell', 'Drone', 'GPS Tracker', 'Home Theater System', 'Digital Photo Frame', 'Camera Lens', 'Tripod', 'Solar Charger', 'Electric Scooter', 'Smart Refrigerator', 'Multi-Cooker', 'Air Fryer', 'E-reader', 'Smart Glasses', 'Waterproof Bluetooth Speaker', 'Smart Mirror', 'Car Dash Cam', 'Smart Door Lock', 'VR Headset', 'Digital Drawing Tablet', 'Electric Grill', 'Cordless Vacuum', 'Smart Kitchen Scale', 'Sleep Tracker', 'Wireless Charging Pad', 'Smart Pet Feeder'],
# "Home and Garden": ['Flower pot', 'Mulch', 'Garden gloves', 'Pruning shears', 'Hose', 'Watering can', 'Seeds', 'Fertilizer', 'Outdoor furniture set', 'Lawn mower', 'Garden shovel', 'Patio umbrella', 'Plant fertilizer', 'Compost bin', 'Garden rake', 'Plant labels', 'Garden trowel', 'Garden hose reel', 'Garden kneeler', 'Bird feeder', 'Weed killer', 'Garden edging', 'Garden sprayer', 'Pest control spray', 'Garden cart', 'Garden stakes', 'Wind chimes', 'Outdoor lights', 'Garden twine', 'Watering wand', 'Garden trellis', 'Patio heater', 'Garden hose nozzle', 'Rain barrel', 'Outdoor cushions', 'Garden hose splitter', 'Grill cover', 'Chiminea', 'Garden scissors', 'Lawn edger', 'Deck box', 'Garden mulcher', 'Hanging planters', 'Garden soil', 'Pergola kit', 'Garden mesh', 'Garden fence', 'Plant stand', 'Garden bench', 'Garden tool set', 'Fire pit', 'Garden gnome', 'Solar lights', 'Garden lanterns', 'Leaf blower', 'Outdoor rug', 'Garden hat', 'Garden apron', 'Garden shoes', 'Water feature', 'Garden decor', 'Potting bench', 'Garden tool organizer', 'Garden thermometer', 'Outdoor clock', 'Garden umbrella stand', 'Beehive', 'Outdoor storage shed', 'Garden hose hanger', 'Garden compost', 'Garden stakes', 'Garden netting', 'Garden seat cushion', 'Garden fountain', 'Garden wagon', 'Garden clogs', 'Garden trug', 'Folding outdoor table', 'Garden pond kit', 'Garden windmill', 'Garden tool bag', 'Garden tool rack', 'Outdoor broom', 'Garden tool sharpener', 'Garden hat', 'Outdoor lantern', 'Garden seat pad', 'Garden knee pads', 'Garden kneeler', 'Garden hose holder', 'Garden tool caddy', 'Garden gloves', 'Garden kneeling pad', 'Garden cart wheels'],
# "Entertainment and Leisure": ['Movie tickets', 'Popcorn', 'Soda', 'Candy', 'Arcade games tokens', 'Bowling lane rental', 'Shoe rental', 'Mini golf fees', 'Concert tickets', 'Merchandise', 'VIP lounge access', 'Theme park admission', 'Ride tickets', 'Food and beverages', 'Photo souvenirs', 'Escape room fees', 'Paintball session', 'Karaoke room rental', 'Virtual reality experience', 'Laser tag session', 'Amusement park parking', 'Go-kart race fees', 'Water park admission', 'Snack bar purchases', 'VIP seating upgrade', 'Musical instrument rental', 'Museum entrance fee', 'Art gallery tickets', 'Tour guide fees', 'Historical site admission', 'Outdoor adventure fees', 'Zip-lining charges', 'Horseback riding fees', 'Boat rental', 'Fishing gear rental', 'Concession stand purchases', 'Skating rink rental', 'Ice skate rental', 'Snow tubing fees', 'Ski lift pass', 'Snowboard rental', 'Themed event tickets', 'Casino gaming chips', 'Comedy show tickets', 'VIP meet and greet passes', 'Concession stand snacks', 'Live performance tickets', 'Stage play admission', 'Merchandise souvenirs', 'VIP lounge access', 'VIP parking pass', 'Surfing lessons', 'Parasailing fees', 'Jet ski rental', 'Beach cabana rental', 'Scuba diving lessons', 'Snorkeling gear rental', 'Golf course fees', 'Golf club rental', 'Tennis court rental', 'Fitness class fees', 'Spa services', 'Sauna session', 'Hot tub rental', 'Massage therapy', 'Pool access fees', 'Water aerobics class', 'Fitness center membership', 'Yoga class fees', 'Personal training session', 'Aerial silk class', 'Indoor climbing wall fees', 'Trampoline park admission', 'Dance class fees', 'Cooking class fees', 'Wine tasting event tickets', 'Brewery tour fees', 'Paint and sip class', 'Pottery painting session', 'Craft workshop fees', 'Photography class', 'Film screening tickets', 'Video game tournament entry', 'Board game night cover charge', 'Escape room team-building package', 'Trivia night entry', 'Concert merchandise', 'Sports event tickets', 'Autograph signing fees', 'VIP box seat rental', 'Stadium parking pass', 'Sports memorabilia purchases', 'Gaming convention admission', 'Cosplay contest entry']
# }

category_training_embeddings = {}

embeddings_model = HuggingFaceEmbeddings()

for category in category_train_test_data:
    category_training_embeddings[category] = embeddings_model.embed_documents(category_train_test_data[category])

len(category_training_embeddings["Grocery and Supermarkets"])

327

In [45]:
"""
"Categories"                        "Embedding 1"     "Embedding 2" ....
-------------------------------------------------------------------------
"Grocery and Supermarkets"
"Grocery and Supermarkets"  
"Grocery and Supermarkets"  
"Grocery and Supermarkets"  
"Restaurants and Food Services"
"Restaurants and Food Services"
"Restaurants and Food Services"
"Clothing and Apparel" 
"Health and Beauty"
"Electronics and Appliances"
"Home and Garden"
"Entertainment and Leisure"
"""

df_category_items = []
d = {'categories':[]}

# Each category.
for category in category_training_embeddings:
    # Each item in the category.
    for item in category_training_embeddings[category]:
        df_category_items.append(category)

        # Each embedding in an item.
        for i in range(len(item)):
            embedding = item[i]

            if f'embedding_{i}' in d:
                d[f'embedding_{i}'].append(embedding)
            else:
                d[f'embedding_{i}'] = []
                d[f'embedding_{i}'].append(embedding)

d['categories'] = df_category_items
    
# Training and Testing data.
category_data = pd.DataFrame(d)

# 637 Items (Training (80%): 509, Testing (20%): 128)
category_data.shape

(2636, 769)

In [48]:
# Randomly shuffles the rows.
shuffled_data = category_data.sample(frac=1)

total_rows = shuffled_data.shape[0]
train_size = int(total_rows*0.8)
 
# Split data into test and train
train = shuffled_data[0:train_size]
test = shuffled_data[train_size:]

train

Unnamed: 0,categories,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
250,Grocery and Supermarkets,0.022932,-0.059276,-0.014088,0.009429,0.035327,0.056376,-0.106598,0.039942,-0.024974,...,-0.030151,-0.010124,-0.022766,-0.015552,0.039506,0.048782,-0.027297,0.018023,-0.067031,0.006369
1503,Health and Beauty,-0.008689,-0.011421,0.044234,-0.039460,-0.014466,-0.014718,0.036519,0.017756,0.017360,...,-0.044711,-0.003131,-0.012905,-0.076379,0.012108,-0.035628,-0.035749,-0.025436,-0.039674,0.005174
1449,Health and Beauty,-0.012293,-0.043711,-0.012516,-0.031413,-0.037765,-0.004644,0.028352,0.034481,0.058440,...,-0.069078,-0.011215,0.009536,-0.030627,0.022892,-0.020202,-0.000073,-0.004375,-0.054842,-0.007859
541,Restaurants and Food Services,-0.004676,0.014751,0.003418,-0.015537,-0.022635,0.046072,-0.030878,0.092523,-0.015897,...,-0.050465,-0.004023,0.007332,-0.018828,0.021083,-0.096452,0.033650,-0.020295,-0.005324,-0.013453
1903,Home and Garden,-0.015000,0.011488,0.016470,0.004036,-0.014393,-0.011709,0.030537,-0.001961,0.054645,...,-0.007819,-0.019128,0.030721,0.065144,0.026939,0.005840,0.047857,-0.002935,-0.021771,-0.007162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2615,Entertainment and Leisure,0.035970,-0.054509,-0.010828,0.013157,-0.009331,-0.032359,-0.013092,0.015315,-0.007121,...,-0.024493,-0.002660,0.013641,0.000450,0.016358,0.084939,0.032413,-0.019211,-0.019021,-0.029408
1678,Electronics and Appliances,-0.017719,-0.081308,-0.005012,-0.037735,-0.005576,-0.032860,0.033752,-0.028651,0.039428,...,0.003794,0.004787,0.083868,-0.010669,-0.004457,-0.033078,-0.003870,0.042218,0.008238,0.016050
407,Restaurants and Food Services,0.024768,-0.000311,-0.002272,-0.060565,-0.009662,0.016409,-0.119447,0.022994,-0.047620,...,-0.030748,-0.055882,0.038782,-0.048902,0.012978,0.018615,-0.003430,-0.009020,-0.035026,-0.015602
995,Clothing and Apparel,-0.047693,0.058039,0.026723,-0.013098,-0.064800,0.004924,-0.002214,-0.012140,-0.059487,...,-0.082995,0.021528,0.021496,0.011549,0.005327,0.012046,0.035436,0.030192,-0.028385,-0.020070


In [49]:
# X is the embeddings and Y is the categories.

y_train = train['categories']
x_train = train.loc[:, train.columns != 'categories']

y_test = test['categories']
x_test = test.loc[:, test.columns != 'categories']

x_test

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
2080,-0.025748,-0.000337,0.009893,-0.011661,0.013014,-0.008676,0.027963,-0.039226,0.022762,-0.040477,...,-0.022090,0.005557,-0.015660,0.067102,0.018338,0.002708,0.078045,0.000636,-0.029308,-0.010467
192,-0.012546,0.105127,-0.005683,-0.020742,-0.006319,0.016114,-0.040195,0.033728,-0.000114,0.038361,...,-0.050425,0.003059,-0.073523,0.004985,-0.008113,0.009558,-0.038105,0.014918,0.018473,0.006930
300,0.009929,-0.033920,-0.014182,-0.002011,0.002859,-0.017492,-0.051080,0.009359,-0.015323,-0.020739,...,-0.049601,-0.016486,-0.000519,-0.022947,0.037642,-0.006315,-0.023478,0.004112,0.041135,-0.026381
2510,0.034156,-0.017166,-0.014093,0.038483,0.032493,0.003833,-0.011759,0.026814,-0.081008,0.031737,...,-0.025213,-0.021253,0.005613,-0.004191,-0.024915,0.062136,-0.048277,-0.060197,0.005693,-0.048321
2008,-0.069287,0.001901,0.006566,0.010806,0.037788,-0.020554,0.029857,-0.042941,0.023633,-0.028519,...,-0.021302,0.002114,-0.006633,0.070211,0.036321,-0.003345,0.085887,-0.011999,-0.011740,-0.000607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,-0.018577,-0.076831,-0.011638,-0.008162,-0.055313,-0.000629,-0.087827,0.022140,-0.072585,0.059997,...,-0.070026,-0.005358,-0.009508,-0.041145,0.054839,0.028299,-0.014273,0.006855,-0.017789,0.011974
1510,0.024161,0.008815,-0.020319,-0.027920,-0.043117,-0.004798,-0.047229,-0.000353,-0.064692,0.021478,...,0.006066,-0.034626,0.017501,0.003131,-0.005205,-0.001660,-0.024092,0.021054,-0.062887,-0.010192
766,0.017426,-0.051375,0.000795,0.000831,-0.063293,-0.023277,-0.089694,-0.007803,-0.100700,0.043069,...,-0.060823,0.022900,-0.010134,-0.023763,0.049706,0.059806,-0.009043,-0.026030,0.002735,0.001867
2407,0.062672,-0.008010,-0.017647,0.045529,0.033098,0.023118,0.024900,0.038976,-0.026351,-0.004710,...,-0.005569,-0.016657,0.046921,0.068156,0.000026,0.151155,-0.019736,-0.013327,0.025927,-0.014083


In [50]:
from sklearn.neighbors import KNeighborsClassifier

knn_cls = KNeighborsClassifier(n_neighbors=25)

knn_cls.fit(x_train, y_train)

In [51]:
# Measures accuracy of training dataset.
predictions = knn_cls.predict(x_test)
is_correct = list(y_test) == predictions
accuracy = (np.sum(is_correct) / len(predictions)) * 100
accuracy

87.12121212121212

In [52]:
from sklearn.metrics import confusion_matrix

# Measures accuracy of training dataset.
y_test_pred = knn_cls.predict(x_test.to_numpy())

# y_train_pred = knn_cls.predict(X_train.to_numpy())
conf_matrix = confusion_matrix(y_test, y_test_pred)
conf_matrix



array([[96,  0,  1,  0,  0,  0,  0],
       [ 3, 33,  2,  0,  2,  1,  0],
       [ 6,  2, 80,  0,  1,  1,  3],
       [ 0,  0,  0, 52,  6,  0, 16],
       [ 5,  3,  3,  7, 37,  0,  0],
       [ 0,  1,  1,  1,  1, 71,  1],
       [ 0,  0,  0,  1,  0,  0, 91]])

In [53]:
## 1. For each receipt we will take the avg(vendor embeddings, item1 embeddings, item2 embeddings, ...).
## 2. Pre-process the data to be put into the model.
## 3. Predict using the model.