# Receipt Analyzer
## 1. Data import, cleansing and categorization

In [341]:
import os
import json
import pandas as pd
import re
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import openai
from io import StringIO

# Set up your OpenAI API key (read from the environment so it is never hard-coded)
openai.api_key = os.getenv('OPENAI_API_RECEIPT_KEY')

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Define the path to the 'data' folder containing receipt images
data_folder = 'data'

# Extensions (lowercase, including the dot) of the files treated as receipt images
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff')

# Load categories from JSON file.
# The encoding is explicit so the result does not depend on the platform default.
with open(os.path.join(data_folder, 'categories.json'), 'r', encoding='utf-8') as file:
    categories = json.load(file)

# Load known supermarkets from JSON file
with open(os.path.join(data_folder, 'supermarkets.json'), 'r', encoding='utf-8') as file:
    known_supermarkets = json.load(file)['supermarkets']

# Get a list of all image files in the 'data' folder.
# The name is lowercased before the comparison so 'RECEIPT.JPG' is picked up too,
# and the dot is part of each suffix so a name merely ending in 'png' is not.
image_files = [
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if name.lower().endswith(IMAGE_EXTENSIONS)
]

# Function to preprocess image to improve OCR accuracy
def preprocess_image(image_path):
    """Open an image and prepare it for OCR.

    The image is converted to grayscale, sharpened, and its contrast is
    enhanced by a factor of 2.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The processed PIL image.
    """
    # Open inside a context manager so the underlying file handle is released
    # deterministically; `convert` returns a new image that no longer depends
    # on the open file.
    with Image.open(image_path) as source:
        image = source.convert('L')  # Convert to grayscale
    image = image.filter(ImageFilter.SHARPEN)  # Sharpen image
    return ImageEnhance.Contrast(image).enhance(2)  # Enhance contrast

# Run OCR on a single receipt image
def extract_text_from_image(image_path):
    """Return the text pytesseract reads from the preprocessed image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text, or an empty string when a FileNotFoundError is
        raised (a message is printed in that case).
    """
    try:
        return pytesseract.image_to_string(preprocess_image(image_path))
    except FileNotFoundError:
        print(f"File not found: {image_path}")
    return ""

# Map raw receipt text to a standardized supermarket name
def find_supermarket_name(text):
    """Return the name of the first known supermarket whose key occurs in *text*.

    The text is lowercased before the lookup; keys of `known_supermarkets`
    are compared as they are. 'Unknown' is returned when no key matches.
    """
    haystack = text.lower()
    return next(
        (label for needle, label in known_supermarkets.items() if needle in haystack),
        'Unknown',
    )

# A price token: optional sign, optional '$', then a plain decimal number.
# Tokens that float() would accept but that are not amounts ('nan', 'inf',
# '1e5', '1_000') are deliberately excluded.
_PRICE_RE = re.compile(r'[-+]?\$?(?:\d+\.?\d*|\.\d+)')

# A number followed by a unit. The trailing \b stops ordinary words from being
# read as units (e.g. '2 green apples' is not 2 g, '3 large eggs' is not 3 l).
_WEIGHT_RE = re.compile(r'(\d+\.?\d*)\s*(kg|grams?|gm|g|ml|l)\b', re.IGNORECASE)

# Purchase date written as two digits, two digits, four digits, separated by '/' or '-'.
_DATE_RE = re.compile(r'\b(\d{2}[/-]\d{2}[/-]\d{4})\b')


def _parse_price(token):
    """Return *token* as a float, or None when it is not a plain decimal amount."""
    if not _PRICE_RE.fullmatch(token):
        return None
    return float(token.replace('$', ''))


def _parse_weight(item):
    """Return the quantity found in *item* in kg (or litres), or None if there is none."""
    match = _WEIGHT_RE.search(item)
    if match is None:
        return None
    amount = float(match.group(1))
    # kg and l are already in the base unit; g and ml are thousandths of it.
    return amount if match.group(2).lower() in ('kg', 'l') else amount / 1000


# Function to clean and structure the extracted data
def clean_receipt_data(receipt_text):
    """Turn raw OCR text of a receipt into structured data.

    Every line whose last whitespace-separated token is a price (see
    `_PRICE_RE`) is treated as an item line; all other lines are ignored.

    Args:
        receipt_text: The full text extracted from one receipt.

    Returns:
        A tuple `(items, date_of_purchase, supermarket_name)` where `items`
        is a list of dicts with the keys 'item', 'weight', 'price' and
        'quantity' (quantity is always 1), `date_of_purchase` is the first
        date string matching `_DATE_RE` or None, and `supermarket_name` is
        the result of `find_supermarket_name`.
    """
    items = []

    for line in receipt_text.split('\n'):
        # Split on any run of whitespace so tab-separated prices are found too;
        # blank lines and single-token lines yield fewer than two parts.
        parts = line.strip().rsplit(None, 1)
        if len(parts) != 2:
            continue

        item, price_token = parts
        price = _parse_price(price_token)
        if price is None:
            # Last token is not an amount, so this is not an item-price line
            continue

        items.append({
            'item': item.strip(),
            'weight': _parse_weight(item),
            'price': price,
            'quantity': 1,  # Default quantity
        })

    # Extract additional information
    supermarket_name = find_supermarket_name(receipt_text)

    date_match = _DATE_RE.search(receipt_text)
    date_of_purchase = date_match.group(1) if date_match else None

    return items, date_of_purchase, supermarket_name

# Function to send item names to the OpenAI API and get corrected product names
def correct_item_name(items):
    """Ask the chat model to complete and correct abbreviated product names.

    Args:
        items: The item names to correct; the value is interpolated into the
            prompt as-is.

    Returns:
        The stripped text of the first completion choice. The prompt asks for
        CSV with the columns 'item' and 'item_corrected', but the reply is
        returned unparsed and unvalidated.
    """
    prompt = f"""
    Given the following list of purchased products in a supermarket or liquor store, complete and correct the product name for each item.
    Example:
        item: 'COLES FROZEN CAULIFL SOOGRAM'
        item_corrected: 'Coles Frozen Cauliflower 500GRAM'

    This is the list of items:
    {items}

    Step 1: Complete and correct the product name for each item. If you cannot correct an item, return the original item name.
    Step 2: Check that the number of returned items matches the number of input items. If an item is missing, add the corrected name to the list.
    Step 3: Return the completed and corrected product names in a CSV format with the columns 'item' and 'item_corrected', without any additional text or explanations.
    """

    # An explicitly constructed client does not pick up the module-level
    # `openai.api_key`, so hand it over here. When it is None the client falls
    # back to the OPENAI_API_KEY environment variable, as before.
    client = openai.OpenAI(api_key=openai.api_key)
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that assists in identifying product names on supermarket receipts."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500
    )

    return completion.choices[0].message.content.strip()

def chunk_list(lst, chunk_size):
    """Yield consecutive slices of *lst* holding at most *chunk_size* elements each."""
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

def normalize_text(text):
    """Normalize text by lowercasing, removing punctuation and trimming whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace removed, and without leading or trailing
        whitespace.
    """
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Strip last: removing punctuation can expose whitespace at either end
    # ('bread .' -> 'bread '), which an earlier strip would leave behind.
    return text.strip()

def categorize_item(item_corrected, categories):
    """Look up the (category, subcategory, product) triple for an item name.

    `categories` is walked as category -> subcategory -> product -> list of
    search terms. The first product with a term that occurs
    (case-insensitively) in the item name wins.

    Returns:
        The matching triple, ('Other', 'None', 'None') when the item name is
        missing, or ('Other', 'Other', 'Other') when nothing matches.
    """
    if pd.isna(item_corrected):
        return 'Other', 'None', 'None'

    haystack = item_corrected.lower()
    hits = (
        (category, subcategory, product)
        for category, subcategories in categories.items()
        for subcategory, products in subcategories.items()
        for product, terms in products.items()
        if any(term.lower() in haystack for term in terms)
    )
    return next(hits, ('Other', 'Other', 'Other'))


In [342]:
# Extract data from all receipt images
all_items = []
for image_file in image_files:
    # Extract text from the image
    receipt_text = extract_text_from_image(image_file)
    if not receipt_text:
        # Nothing was recognized for this file, so there is nothing to parse
        continue

    # Clean and structure the extracted data
    items, date_of_purchase, supermarket_name = clean_receipt_data(receipt_text)
    all_items.extend(
        {
            'file_path': image_file,
            'supermarket_name': supermarket_name,
            'date_of_purchase': date_of_purchase,
            'item': item['item'],
            'weight': item['weight'],
            'price': item['price'],
            'quantity': item['quantity'],
        }
        for item in items
    )

# Create a DataFrame. The columns are listed explicitly so the frame has the
# expected schema even when no item was extracted; otherwise the 'item'
# lookup below raises a KeyError on an empty frame.
df = pd.DataFrame(
    all_items,
    columns=[
        'file_path',
        'supermarket_name',
        'date_of_purchase',
        'item',
        'weight',
        'price',
        'quantity',
    ],
)

# Normalize item name
df['item'] = df['item'].apply(normalize_text)

In [344]:
receipt_text

"ALDI Siar\n\nA LIMITED PARTNERSHIP\nBROADWAY\n\nABN: 90 196 565 019\nTax Invoice\n\n99660 PulpFree 100%0J 2L\n\n705527 Barista Almond iL\n9053 SthPnt SvBle 750m]\n761389 Strawberries 250g\n76149 Blueberries 170g\n77706 Cucumber Baby 250g\n76083 Avocado ea\n\n76112 Banana Cav per kg\n\nC'.589kg Net @ 3.99 $/kg\nSubtotal\n\nSUBTOTAL CINCL GST)\n\n$\n6\ng\n9\n9\n4\nA\n2\n3\n\nNOrPONMHBN\nCO1W OO ODM WO’ O\nPerr Srur>\n\nNO\nBS\nNO\nco\n\n24.28\n\n"

In [340]:
clean_receipt_data(receipt_text)

([], None, 'Aldi')

In [308]:
# Convert df['item'] column to a list without duplicates (first occurrence wins)
unique_items_list = df.drop_duplicates(subset=['item'])['item'].tolist()

# Process items in chunks of 20
chunk_size = 20
corrected_items_all = []

for chunk in chunk_list(unique_items_list, chunk_size):
    improved_data = correct_item_name(chunk)

    # The reply may be wrapped in a Markdown code fence; drop the fence lines
    # so only the CSV itself is parsed.
    csv_lines = [
        line for line in improved_data.splitlines()
        if not line.strip().startswith('```')
    ]
    df_chunk = pd.read_csv(StringIO('\n'.join(csv_lines)))

    if len(df_chunk) == len(chunk):
        # One row per input item: keep the positional pairing
        corrected_items_all.extend(df_chunk['item_corrected'].tolist())
    else:
        # The model dropped or added rows, so positions can no longer be
        # trusted. Pair by the echoed 'item' value instead; inputs without a
        # row get None and are handled by the NaN check below.
        corrected_by_item = dict(zip(df_chunk['item'], df_chunk['item_corrected']))
        corrected_items_all.extend(corrected_by_item.get(item) for item in chunk)

# Create a DataFrame from the unique items list and their corrected names
df_corrected = pd.DataFrame({
    'item': unique_items_list,
    'item_corrected': corrected_items_all
})

# Calculate the percentage of NaN values (0 when there are no items at all)
nan_count = df_corrected['item_corrected'].isna().sum()
total_count = len(df_corrected)
nan_percentage = (nan_count / total_count) * 100 if total_count else 0.0

if nan_percentage <= 5:
    # Drop NaN values
    df_corrected = df_corrected.dropna(subset=['item_corrected'])
    print(f"Dismissed {nan_count} out of {total_count} items due to NaN values.")
else:
    print(f"More than 5% of the items have NaN values. Total NaN count: {nan_count}")

# Apply the categorization function to each corrected item. The result frame
# is built explicitly so the assignment also works when there are no rows.
category_columns = ['category', 'subcategory', 'product']
df_corrected[category_columns] = pd.DataFrame(
    [categorize_item(name, categories) for name in df_corrected['item_corrected']],
    index=df_corrected.index,
    columns=category_columns,
)

# Merge the original DataFrame with the corrected DataFrame on 'item'
df_merged = pd.merge(df, df_corrected, on='item', how='left')

Dismissed 0 out of 143 items due to NaN values.


## 2. Data Analysis

In [316]:
# Create a pivot table: median price per category/subcategory/product (rows)
# and supermarket (columns)
pivot_table = pd.pivot_table(
    df_merged,
    values='price',
    index=['category', 'subcategory', 'product'],
    columns='supermarket_name',
    aggfunc='median',
    fill_value=None
)

# Format numbers as dollar amounts, like $9.00, and turn missing values into None.
# Series.map is applied column by column instead of DataFrame.applymap, which
# newer pandas releases deprecate.
pivot_table = pivot_table.apply(
    lambda column: column.map(lambda x: f"${x:.2f}" if pd.notnull(x) else None)
)

# Replace any remaining NaN values with None
pivot_table = pivot_table.where(pd.notnull(pivot_table), None)

# Display the pivot table
pivot_table

Unnamed: 0_level_0,Unnamed: 1_level_0,supermarket_name,Coles,Liquorland
category,subcategory,product,Unnamed: 3_level_1,Unnamed: 4_level_1
Baby,Essentials,Diaper,$9.00,
Baby,Food,Baby Food,$1.70,
Bakery,Bread,Bread,$3.75,
Bakery,Pastries,Pastry,$5.00,
Beverages,Alcoholic,Beer,$7.20,$7.00
Beverages,Alcoholic,Spirits,$2.50,
Beverages,Non-Alcoholic,Coffee,$7.90,
Beverages,Non-Alcoholic,Juice,$5.00,
Beverages,Non-Alcoholic,Soda,$5.60,
Condiments,Seasoning,Seasoning,$2.15,
