In [None]:
# Notebook setup: run from the parent directory so the project-local imports below resolve
import sys
import os

# Move to the parent directory exactly once per kernel session. Without the
# guard, re-running this cell would climb one more directory level each time.
if "WORK_DIR" not in globals():
    os.chdir("..")
    WORK_DIR = os.getcwd()
print("Now in:", WORK_DIR)

# Add that directory to the import path so Python can find the 'data/' module.
# (Evaluating abspath("..") after the chdir pointed one level above it.)
if WORK_DIR not in sys.path:
    sys.path.append(WORK_DIR)

### Step 1: Setup & Load Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from data.load_beauty_data import get_beauty_dataset
from preprocessing.clean_and_label import load_ingredient_blacklist, label_dataframe

In [None]:
# Load the dataset and the ingredient blacklist, then pass both to
# label_dataframe; the result is the frame used by the rest of the notebook.
raw_df = get_beauty_dataset()
harmful_set = load_ingredient_blacklist()
df = label_dataframe(raw_df, harmful_set)

In [None]:
# Show the first rows of the text column and its label
preview_cols = ['ingredients_text', 'label']
df[preview_cols].head()

### Step 2: Preprocessing (TF-IDF)

In [None]:
# Turn each ingredient string into a TF-IDF vector, keeping at most 5000 terms
# and dropping scikit-learn's built-in English stop words.
# NOTE(review): the vectorizer is fit on the full dataset, before the
# train/test split below, so its IDF statistics include test rows.
ingredient_docs = df['ingredients_text']
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

X = tfidf.fit_transform(ingredient_docs)
y = df['label']

print("TF-IDF shape:", X.shape)

### Step 3: Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Step 4: Train a Logistic Regression Classifier

In [None]:
# Fit the classifier on the training split; fit() returns the estimator itself,
# so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = model.predict(X_test)

### Step 5: Evaluate Performance

In [None]:
# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix drawn as an annotated heatmap
# NOTE(review): the tick labels assume the sorted label values map to
# "Safe" first and "Harmful" second — confirm against label_dataframe.
class_names = ["Safe", "Harmful"]
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()