In [None]:
# Make the parent folder the working directory so the relative output folders
# used later ("plots/", "results/") and the local packages resolve from there.
# NOTE(review): assumes the notebook lives one level below the project root.
import sys
import os

# Guard the chdir: without it, every re-run of this cell climbs one more level.
if "PROJECT_ROOT" not in globals():
    os.chdir("..")
    PROJECT_ROOT = os.getcwd()

# getcwd must be *called*; the bare name prints the function object instead.
print("Now in:", os.getcwd())

# Add the project root to the path so Python can find the 'data/' module.
# After the chdir above, ".." is one level *above* the root; that entry is
# still appended as before so anything relying on it keeps working.
for _candidate in (PROJECT_ROOT, os.path.abspath("..")):
    if _candidate not in sys.path:
        sys.path.append(_candidate)

### Step 1: Import TensorFlow

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer

### Step 2: Use the Same TF-IDF Vectors

In [None]:
from data.load_beauty_data import get_beauty_dataset
from preprocessing.clean_and_label import load_ingredient_blacklist, label_dataframe

# Load and label dataset.
# NOTE(review): these helpers are project-local and not visible here. Later
# cells read df['ingredients_text'] and df['label'], so the labelled frame
# presumably carries both columns — confirm against the helpers.
df = get_beauty_dataset()
harmful_set = load_ingredient_blacklist()
# `df` is rebound to the labelled result, so the name no longer refers to the
# unlabelled frame after this line.
df = label_dataframe(df, harmful_set)

In [None]:
from sklearn.model_selection import train_test_split

y = df['label']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus lets
# the vocabulary and IDF weights be learned from test documents, which leaks
# test information into training and inflates the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['ingredients_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Learn vocabulary/IDF from the training text only, then reuse that fitted
# vectorizer (transform, not fit_transform) on the held-out text.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus projected into the train-fitted feature space; kept so that `X`
# is still defined for any cell that reads it.
X = tfidf.transform(df['ingredients_text'])


### Step 3: Convert to TensorFlow-friendly format

In [None]:
import numpy as np

# Densify the sparse TF-IDF matrices and copy the label containers into NumPy
# arrays so both can be handed to Keras directly.
X_train_tf, X_test_tf = (features.toarray() for features in (X_train, X_test))
y_train_tf, y_test_tf = (np.array(labels) for labels in (y_train, y_test))

### Step 4: Define and Compile the MLP

In [None]:
# Seed the Python, NumPy and backend random generators before any layer is
# created. Without this, weight initialisation and dropout masks differ on
# every run, so the curves and the report below cannot be reproduced even
# though the train/test split itself is pinned with random_state=42.
keras.utils.set_random_seed(42)

# Two-hidden-layer MLP over the TF-IDF features; the input width is taken from
# the training matrix so it follows whatever vocabulary size was produced.
model = keras.Sequential([
    layers.Input(shape=(X_train_tf.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')   # Binary classification
])

# A single sigmoid output is paired with binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

### Step 5: Train the Model

In [None]:
# Training schedule, named so the values are easy to spot and tune.
EPOCHS = 10
BATCH_SIZE = 32

# The held-out test split doubles as the validation set here: it is evaluated
# after each epoch to produce the val_* curves plotted in the next step.
history = model.fit(
    x=X_train_tf,
    y=y_train_tf,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test_tf, y_test_tf),
)

### Step 6: Visualize the Training Process (loss/accuracy plots)

In [None]:
import matplotlib.pyplot as plt

os.makedirs("plots", exist_ok=True)

# One figure, two panels: accuracy on the left, loss on the right. Holding an
# explicit figure handle means the save below targets exactly this figure.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy plot
ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('Model Accuracy Over Epochs')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss plot
ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Model Loss Over Epochs')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

fig.tight_layout()

# Save BEFORE showing: once plt.show() has rendered the figure, a later
# plt.savefig() can end up writing a new, empty canvas instead of these curves.
fig.savefig("plots/mlp_training_curves.png", dpi=300)
plt.show()

### Step 7: Evaluate the Model

In [None]:
from sklearn.metrics import classification_report

# Make sure the output folder is there before anything is written into it.
os.makedirs("results", exist_ok=True)

# Turn the model's predicted probabilities into hard 0/1 predictions by
# thresholding at 0.5.
y_pred_probs = model.predict(X_test_tf)
y_pred = (y_pred_probs > 0.5).astype(int)

report = classification_report(
    y_true=y_test_tf,
    y_pred=y_pred,
    target_names=["Safe", "Harmful"],
)

# Echo the report in the notebook output.
print(report)

# Persist the same text to disk, exactly as printed above.
with open("results/mlp_classification_report.txt", "w") as f:
    print(report, file=f, end="")