# Basic Ingredient Extraction from Product Labels

## 1. Introduction
- Goal: Extract ingredient lists from cosmetic or food product labels using OCR (EasyOCR).
- Purpose: Simulate real-world data extraction to enhance dataset robustness.

## 2. Install EasyOCR
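Run `pip install easyocr`. The later cells also import `cv2` (opencv-python), `numpy`, `matplotlib`, `textblob`, `rapidfuzz`, and `pandas`, so install those as well.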

## 3. OCR Extraction
- Run EasyOCR on 1-2 sample images.
- Save extracted text to .txt files.

## 4. Sample Outputs
- View and analyze OCR results.

## 5. Future Work
- Suggest improvements like better OCR, post-processing, etc.


In [None]:
import os
import sys

# Set working directory to project root
# NOTE: this is a machine-specific absolute Windows path; os.chdir raises if the
# directory does not exist, so edit it before running on another machine.
# The relative paths used by the later cells (e.g. 'images/...') resolve
# against this directory.
os.chdir(r"C:\Users\User\Documents\cs599_deepLearning\harmful-ingredient-detector")
print("✅ Now in:", os.getcwd())

# Add root to Python path
# (appended at the end of sys.path, so it is searched after existing entries)
sys.path.append(os.getcwd())

### Preprocessing the Image

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# --- Preprocessing function ---
def preprocess_image(filepath, *, denoise_strength=30, clahe_clip_limit=0.5,
                     clahe_tile_grid=(8, 8)):
    """Load an image and produce a binarized version of it for OCR.

    The pipeline is: grayscale conversion -> non-local-means denoising ->
    CLAHE contrast enhancement -> Otsu binarization.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) of the image to load.
        denoise_strength: ``h`` value passed to ``cv2.fastNlMeansDenoising``.
        clahe_clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
        clahe_tile_grid: ``tileGridSize`` passed to ``cv2.createCLAHE``.

    Returns:
        A ``(img, thresh)`` tuple: the image exactly as loaded by
        ``cv2.imread`` and the single-channel thresholded image.

    Raises:
        OSError: If ``cv2.imread`` could not read or decode the file.
    """
    img = cv2.imread(str(filepath))
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # without this check the failure surfaces as an opaque cvtColor error.
        raise OSError(f"Could not read image: {filepath}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoising
    denoised = cv2.fastNlMeansDenoising(gray, h=denoise_strength)

    # Contrast Enhancement (CLAHE)
    clahe = cv2.createCLAHE(clipLimit=clahe_clip_limit,
                            tileGridSize=clahe_tile_grid)
    enhanced = clahe.apply(denoised)

    # Binarization: with THRESH_OTSU the threshold is chosen automatically,
    # so the 0 passed here is only a placeholder.
    _, thresh = cv2.threshold(enhanced, 0, 255,
                              cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return img, thresh

# --- Save Preprocessed Only ---
def save_processed_images(image_folder, processed_folder):
    """Preprocess every ``*.jpg`` in *image_folder* and save each result as PNG.

    Args:
        image_folder: Directory searched (non-recursively) for ``*.jpg`` files.
        processed_folder: Directory that receives ``<stem>.png`` files; it is
            created, including parents, when missing.

    Returns:
        None. One status line is printed per image.
    """
    processed_folder = Path(processed_folder)
    processed_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so files are processed in a reproducible order.
    for img_path in sorted(Path(image_folder).glob('*.jpg')):
        # Only the binarized image is needed here; the original is discarded.
        _, processed_img = preprocess_image(img_path)
        save_path = processed_folder / (img_path.stem + '.png')
        # cv2.imwrite reports failure through its boolean return value rather
        # than by raising, so check it before claiming the file was saved.
        if cv2.imwrite(str(save_path), processed_img):
            print(f"Saved preprocessed image: {save_path}")
        else:
            print(f"Failed to save preprocessed image: {save_path}")

# --- Save Before/After Comparison ---
def save_before_after_comparison(image_folder, save_folder):
    """Write a side-by-side original/preprocessed figure for every ``*.jpg``.

    Each image in *image_folder* is run through ``preprocess_image`` and a
    two-panel matplotlib figure is saved to *save_folder* (created if needed)
    as ``<stem>_comparison.png``.
    """
    source_images = list(Path(image_folder).glob('*.jpg'))
    save_folder = Path(save_folder)
    save_folder.mkdir(parents=True, exist_ok=True)

    for img_path in source_images:
        original_img, processed_img = preprocess_image(img_path)

        fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 6))
        fig.suptitle(f"{img_path.name}", fontsize=16)

        # (axis, picture, title, colormap); the original is converted from
        # BGR to RGB for display, the binarized image is drawn in grayscale.
        panels = (
            (left_ax, cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB), 'Original', None),
            (right_ax, processed_img, 'Preprocessed', 'gray'),
        )
        for ax, picture, title, colormap in panels:
            ax.imshow(picture, cmap=colormap)
            ax.set_title(title)
            ax.axis('off')

        save_path = save_folder / f"{img_path.stem}_comparison.png"
        fig.savefig(str(save_path), bbox_inches='tight')
        # Close the figure so figures do not accumulate across iterations.
        plt.close(fig)

        print(f"Saved comparison: {save_path}")

# --- Example Usage ---
# All three paths are relative, so they resolve against the current working
# directory at the time this cell runs.
image_folder = 'images/beauty_ingredients'                   # source *.jpg label photos
processed_folder = 'images/beauty_ingredients_preprocessed'  # binarized PNGs are written here
save_folder = 'images/comparison_plots'                      # before/after figures are written here

# Each call preprocesses every image itself, so the images are processed twice.
save_processed_images(image_folder, processed_folder)
save_before_after_comparison(image_folder, save_folder)



### Step 1: OCR Extraction

In [None]:
import os
import easyocr

# Create output folder
ocr_output_dir = "./images/ocr_outputs"
os.makedirs(ocr_output_dir, exist_ok=True)

# Re-initialize EasyOCR Reader (English only, GPU disabled)
reader = easyocr.Reader(['en'], gpu=False)

# Folder with preprocessed images
preprocessed_dir = "./images/beauty_ingredients_preprocessed"
# os.listdir returns entries in arbitrary order; sort for reproducible runs.
image_paths = [
    os.path.join(preprocessed_dir, img)
    for img in sorted(os.listdir(preprocessed_dir))
    if img.endswith(".png")
]

for img_path in image_paths:
    img_filename = os.path.basename(img_path)
    # Swap only the extension: str.replace(".png", ".txt") would also rewrite
    # any ".png" that happens to occur earlier in the file name.
    output_txt_path = os.path.join(
        ocr_output_dir, os.path.splitext(img_filename)[0] + ".txt"
    )

    print(f"\nExtracting text from {img_filename}...")
    results = reader.readtext(img_path)

    # Keep text only if OCR confidence is decent
    # (you can lower/raise the 0.4 threshold if needed).
    extracted_text = [
        text.strip()
        for (_bbox, text, confidence) in results
        if confidence > 0.4
    ]

    # Save to .txt file, one detected text fragment per line
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(extracted_text))

    print(f"Saved extracted text to {output_txt_path}")




### Step 2: Ingredient Cleaning

In [None]:
import re
from textblob import TextBlob
from pathlib import Path

def clean_extracted_text(text_list):
    """Filter and spell-correct raw OCR lines, returning the surviving lines.

    Every line is lower-cased, stripped, and reduced to the characters
    ``a-z``, ``0-9``, whitespace, ``( ) - / ,``. Lines that contain any of the
    marker substrings below, or that are shorter than three characters after
    that reduction, are dropped. The rest are passed through
    ``TextBlob(...).correct()``, whitespace-collapsed, printed and collected.

    Note that markers are matched as plain substrings, not as whole words.
    """
    skip_markers = (
        'directions', 'apply', 'store', 'cool', 'website',
        'massage', 'usa', 'net weight', 'social', 'ingredient',
    )

    cleaned_ingredients = []
    for raw_line in text_list:
        candidate = re.sub(r'[^a-z0-9\s\(\)\-\/,]', '', raw_line.lower().strip())

        has_marker = any(marker in candidate for marker in skip_markers)
        if has_marker or len(candidate) < 3:
            continue

        corrected = str(TextBlob(candidate).correct())
        corrected = re.sub(r'\s+', ' ', corrected).strip()

        cleaned_ingredients.append(corrected)
        print(f"✅ Cleaned line: {corrected}")

    return cleaned_ingredients

def save_cleaned_ocr_files(ocr_input_folder, cleaned_output_folder):
    """Clean every ``*.txt`` OCR dump in one folder and write it to another.

    Files are visited in sorted order. Each one is read as UTF-8, its lines
    are passed to ``clean_extracted_text``, and the cleaned lines are written
    (one per line) to a file of the same name inside *cleaned_output_folder*,
    which is created, including parents, when missing.
    """
    print("inside save cleaned")
    source_dir = Path(ocr_input_folder)
    target_dir = Path(cleaned_output_folder)
    target_dir.mkdir(parents=True, exist_ok=True)
    print("before loop")

    for ocr_file in sorted(source_dir.glob("*.txt")):
        print(f"📄 Processing {ocr_file.name}...")

        with open(ocr_file, "r", encoding="utf-8") as src:
            raw_lines = list(src)

        cleaned_lines = clean_extracted_text(raw_lines)

        cleaned_file_path = target_dir / ocr_file.name
        with open(cleaned_file_path, "w", encoding="utf-8") as dst:
            dst.writelines(f"{entry}\n" for entry in cleaned_lines)

        print(f"✅ Saved cleaned file: {cleaned_file_path}")

# Folder names: the input is "ocr_outputs" (plural, the same folder the OCR
# extraction cell writes to); the output is "ocr_output_cleaned" (singular).
ocr_input_folder = Path.cwd() / "./images/ocr_outputs"    # plural "outputs"
cleaned_output_folder = Path.cwd() / "./images/ocr_output_cleaned"

# Clean every OCR .txt file and write the results to the cleaned folder.
save_cleaned_ocr_files(ocr_input_folder, cleaned_output_folder)

print("Finished cleaning.")



### Normalization

In [None]:
import pandas as pd
from rapidfuzz import process, fuzz
from pathlib import Path

# 1. Load COSING
# The path is relative, so it resolves against the current working directory.
cosing_file = Path("images/COSING_Ingredients-Fragrance Inventory_v3.csv")
cosing_df = pd.read_csv(cosing_file)

# Extract Column B (INCI names)
# iloc[:, 1] selects the second column by position; missing values are dropped
# and the names lower-cased so they compare against lower-cased OCR text.
# NOTE(review): assumes that column holds strings (.str accessor) — confirm
# against the CSV layout.
inci_names = cosing_df.iloc[:, 1].dropna().str.lower().tolist()

# 2. Normalization Function
def normalize_ingredient(text, reference_list, threshold=85):
    """Map an OCR'd ingredient string onto the closest reference name.

    Args:
        text: Ingredient text; it is lower-cased and stripped before matching.
        reference_list: Candidate names to match against (expected to be
            lower-cased already, since *text* is lower-cased).
        threshold: Minimum ``fuzz.ratio`` score for a fuzzy match to replace
            the input text.

    Returns:
        A ``(name, score)`` tuple. ``name`` is the reference entry when there
        is an exact match (score ``100``) or a fuzzy match scoring at least
        *threshold*; otherwise it is the lower-cased, stripped input. When
        there is nothing to match against, the score is ``0``.
    """
    text = text.lower().strip()

    # Nothing to compare against: extractOne would return None and the tuple
    # unpacking below would raise TypeError.
    if len(reference_list) == 0:
        return text, 0

    if text in reference_list:
        return text, 100  # Perfect match

    best = process.extractOne(text, reference_list, scorer=fuzz.ratio)
    if best is None:
        return text, 0

    match, score, _ = best
    if score >= threshold:
        return match, score
    # Below threshold: keep the OCR text but still report the best score.
    return text, score

# 3. Normalize and Log
def normalize_ocr_outputs(cleaned_folder, normalized_folder, log_csv_path):
    """Normalize every cleaned OCR file and log each line's match to a CSV.

    Each ``*.txt`` file in *cleaned_folder* is read line by line; every line
    is passed to ``normalize_ingredient`` together with the module-level
    ``inci_names`` list. The normalized lines are written to a file of the
    same name in *normalized_folder*, and one summary row per line is
    collected.

    Args:
        cleaned_folder: Directory containing the cleaned ``*.txt`` files.
        normalized_folder: Output directory; created, including parents, when
            missing.
        log_csv_path: Destination of the summary CSV with the columns
            ``file``, ``ocr_extracted``, ``normalized_inci`` and
            ``similarity_score``.

    Returns:
        None.
    """
    cleaned_folder = Path(cleaned_folder)
    normalized_folder = Path(normalized_folder)
    normalized_folder.mkdir(parents=True, exist_ok=True)

    summary_columns = ["file", "ocr_extracted", "normalized_inci", "similarity_score"]
    summary_records = []  # for summary CSV

    for cleaned_file in sorted(cleaned_folder.glob("*.txt")):
        with open(cleaned_file, "r", encoding="utf-8") as f:
            lines = f.read().splitlines()

        normalized_lines = []
        for line in lines:
            normalized_text, score = normalize_ingredient(line, inci_names)
            normalized_lines.append(normalized_text)

            summary_records.append({
                "file": cleaned_file.name,
                "ocr_extracted": line,
                "normalized_inci": normalized_text,
                "similarity_score": score,
            })

        normalized_file_path = normalized_folder / cleaned_file.name
        with open(normalized_file_path, "w", encoding="utf-8") as f:
            f.writelines(f"{name}\n" for name in normalized_lines)

        print(f"✅ Normalized {cleaned_file.name}")

    # Save full summary CSV. The columns are pinned explicitly so the header
    # row is written even when no lines were processed; otherwise the CSV
    # would be written without any header.
    summary_df = pd.DataFrame(summary_records, columns=summary_columns)
    summary_df.to_csv(log_csv_path, index=False)
    print(f"📄 Full summary saved at {log_csv_path}")

# 4. Execute
# Paths are relative to the current working directory.
cleaned_folder = "./images/ocr_output_cleaned"          # input: same folder name the cleaning step writes to
normalized_folder = "./images/ocr_outputs_normalized"   # output: one normalized .txt per input file
log_csv_path = "ingredient_normalization_summary.csv"   # per-line match log, written to the working directory

normalize_ocr_outputs(cleaned_folder, normalized_folder, log_csv_path)


In [None]:
# Show the current working directory, against which the relative paths above resolve.
print(Path.cwd())