In [None]:
# Ensures the code runs: move the working directory one level up from where
# the kernel started and extend the import path.
import sys
import os

# Remember the directory the kernel started in, so re-running this cell does
# not climb one more level up each time it is executed.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))
# getcwd must be *called*; printing the bare name shows the function object.
print("Now in:", os.getcwd())

# Add the directory above the new working directory to the import path
# (intended to let Python find the 'data/' module).
# NOTE(review): chdir already moved up one level, so this is the grandparent
# of the starting directory — confirm that is the intended level.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:  # avoid duplicate entries on re-run
    sys.path.append(_parent_dir)

In [None]:
import pandas as pd
from pathlib import Path

# === Step 1: Load project datasets ===
from data.load_beauty_data import get_beauty_dataset
from preprocessing.clean_and_label import load_ingredient_blacklist, label_dataframe

# Original beauty dataset. Nothing further down in this cell reads it; it is
# loaded only so that it is available for inspection.
df_beauty = get_beauty_dataset()

# Blacklisted ingredient names, used for the membership test in Step 3.
harmful_set = load_ingredient_blacklist()
print(f"Loaded {len(harmful_set)} harmful ingredients.")

# === Step 2: Load your OCR normalized ingredients ===
# The path is relative, so it resolves against the current working directory.
normalization_summary_path = Path("ingredient_normalization_summary.csv")
df = pd.read_csv(normalization_summary_path)
print(f"Loaded {len(df)} OCR normalized ingredients.")

# === Step 3: Classify ingredients ===
def classify_ingredient(name, blacklist=None):
    """Label a single ingredient name as "Harmful", "Safe" or "Unknown".

    Args:
        name: Ingredient name to classify. Missing values (None/NaN) and
            blank strings are reported as "Unknown". Non-string values are
            converted with ``str()`` before matching instead of raising.
        blacklist: Optional container of harmful ingredient names to test
            membership against. When omitted, the module-level
            ``harmful_set`` is used, so ``Series.apply(classify_ingredient)``
            keeps working unchanged.

    Returns:
        "Unknown" for missing/blank input, "Harmful" when the lower-cased,
        stripped name is in the blacklist, otherwise "Safe".
    """
    if pd.isna(name):
        return "Unknown"

    # str() guards against numeric cells, which have no .strip()/.lower().
    cleaned = str(name).strip().lower()
    if not cleaned:
        return "Unknown"

    # NOTE(review): matching is exact and assumes the blacklist entries are
    # already lower-cased and stripped — confirm against the blacklist loader.
    known_harmful = harmful_set if blacklist is None else blacklist
    return "Harmful" if cleaned in known_harmful else "Safe"

# Label every row by running classify_ingredient over the normalized names.
df["classification"] = df["normalized_inci"].map(classify_ingredient)

# === Step 4: Save the classification results ===
# Written without the index so the CSV keeps only the data columns.
output_path = Path("ingredient_classification_results.csv")
df.to_csv(output_path, index=False)

print(f"✅ Classification results saved to: {output_path.resolve()}")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load your classification results
df = pd.read_csv("ingredient_classification_results.csv")

# 2. Quick overview
print(df.head())

# 3. Count how many rows fall under each classification label
classification_counts = df["classification"].value_counts()
print("\nClassification counts:\n", classification_counts)

# value_counts() orders labels by frequency, and an "Unknown" label can also
# be present, so colours are looked up per label rather than by position.
label_colors = {"Safe": "skyblue", "Harmful": "salmon", "Unknown": "lightgray"}
plot_colors = [
    label_colors.get(label, "lightgray") for label in classification_counts.index
]

# 4. Plot bar chart
plt.figure(figsize=(8, 6))
classification_counts.plot(kind="bar", color=plot_colors)
plt.title("Ingredient Classification Counts")
plt.xlabel("Classification")
plt.ylabel("Number of Ingredients")
plt.xticks(rotation=0)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()

# 5. Plot pie chart
# No explicit labels: the slice labels come from the Series index, so they
# always match the counts regardless of order or number of categories.
plt.figure(figsize=(6, 6))
classification_counts.plot(
    kind="pie",
    autopct="%1.1f%%",
    startangle=140,
    colors=plot_colors,
)
plt.ylabel("")  # drop the automatic y-label pandas adds to pie charts
plt.title("Ingredient Classification Percentage")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load your classification results again
df = pd.read_csv("ingredient_classification_results.csv")

# 2. Filter for only harmful ingredients
harmful_df = df[df["classification"] == "Harmful"]

# 3. Count occurrences of each harmful ingredient (at most the 10 most frequent)
harmful_counts = harmful_df["normalized_inci"].value_counts().head(10)

# 4. Plot the most frequent harmful ingredients
if harmful_counts.empty:
    # An empty Series cannot be drawn as a bar chart; report it instead.
    print("No harmful ingredients detected; nothing to plot.")
else:
    plt.figure(figsize=(10, 6))
    harmful_counts.plot(kind="barh", color="salmon")
    # Use the real number of bars so the title is right when fewer than 10 exist.
    plt.title(f"Top {len(harmful_counts)} Most Frequent Harmful Ingredients Detected")
    plt.xlabel("Count")
    plt.ylabel("Harmful Ingredient")
    plt.gca().invert_yaxis()  # So the highest bar is at the top
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# 1. Load the ingredient normalization and classification results
normalization_df = pd.read_csv("ingredient_normalization_summary.csv")  # <- your matching
classification_df = pd.read_csv("ingredient_classification_results.csv")  # <- harmful/safe labels

# 2. Combine both files.
# The labels are attached by row position, not by a key-based merge, so first
# verify that both files describe the same rows in the same order. Without
# this check a stale results file would silently mislabel rows or leave NaN.
if len(normalization_df) != len(classification_df):
    raise ValueError(
        "Row count mismatch between normalization summary "
        f"({len(normalization_df)}) and classification results "
        f"({len(classification_df)}); re-run the classification step."
    )
if not normalization_df["normalized_inci"].equals(classification_df["normalized_inci"]):
    raise ValueError(
        "'normalized_inci' values differ between the normalization summary and "
        "the classification results; re-run the classification step."
    )

final_df = normalization_df.copy()
final_df["classification"] = classification_df["classification"]

# 3. Save the combined final file
final_df.to_csv("final_ingredient_extraction_summary.csv", index=False)

print("✅ Final master CSV saved as: final_ingredient_extraction_summary.csv")
