# In [None]:
"""
AI-Driven Optimization of Steelmaking Process Using Real-Time Sensor Data (Jan 2025 - Mar 2025)


This pipeline demonstrates:
 - Loading and preprocessing real-time steelmaking sensor data
 - Training ML models (RandomForest, GradientBoosting, optionally XGBoost)
 - Predicting steel quality (classification)
 - Optimizing key parameters (temperature, composition, flow) using a surrogate model
 - Visualizing results (feature importance, parameter optimization)

Usage:
 python steelmaking_ai_optimization.py --data_file steel_sensor_data.csv --label_col quality
"""

import argparse
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False

# -------------------- Data Prep --------------------
def load_and_prepare_data(file, label_col, test_size=0.2, scale=True):
    """Read the sensor CSV and return a stratified train/test split.

    Args:
        file: Path or buffer handed to ``pandas.read_csv``.
        label_col: Name of the target column; every other column is treated
            as a feature.
        test_size: Share of rows held out for testing, forwarded to
            ``train_test_split``.
        scale: When true, standardise the features with a ``StandardScaler``
            fitted on the training split only (the test split is merely
            transformed, so no test statistics leak into training).

    Returns:
        ``(X_train, X_test, y_train, y_test, feature_names)``. The feature
        matrices are the scaler's output when ``scale`` is true and the
        splits' ``.values`` otherwise; ``feature_names`` lists the feature
        columns in CSV order.
    """
    frame = pd.read_csv(file)
    features = frame.drop(columns=[label_col])
    target = frame[label_col]
    feature_names = list(features.columns)

    # Stratify on the target so both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, stratify=target, random_state=42
    )

    if not scale:
        return X_train.values, X_test.values, y_train, y_test, feature_names

    standardizer = StandardScaler()
    X_train = standardizer.fit_transform(X_train)
    X_test = standardizer.transform(X_test)
    return X_train, X_test, y_train, y_test, feature_names

# -------------------- Training --------------------
def train_models(X_train, y_train):
    """Fit every available classifier on the training split.

    A random forest and a gradient-boosting classifier are always trained;
    an XGBoost classifier is added when the module-level ``XGB_AVAILABLE``
    flag is set.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Dict mapping a display name to its fitted estimator, in the order
        the models were trained.
    """
    candidates = [
        (
            "RandomForest",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=None,
                class_weight="balanced",
                random_state=42,
            ),
        ),
        (
            "GradientBoosting",
            GradientBoostingClassifier(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=5,
                random_state=42,
            ),
        ),
    ]

    if XGB_AVAILABLE:
        booster_params = {
            "n_estimators": 400,
            "max_depth": 6,
            "learning_rate": 0.1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "eval_metric": "mlogloss",
            "random_state": 42,
            "n_jobs": -1,
        }
        candidates.append(("XGBoost", xgb.XGBClassifier(**booster_params)))

    fitted = {}
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        fitted[label] = estimator
    return fitted

# -------------------- Evaluation --------------------
def evaluate_model(model, X_test, y_test, name):
    """Score a fitted classifier on the test split and plot its confusion matrix.

    Prints the accuracy and the full classification report, then shows the
    confusion matrix as an annotated heatmap.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
        name: Display name used in the printed header and the plot title.

    Returns:
        The accuracy on the test split.
    """
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    print(f"\n=== {name} Evaluation ===")
    print("Accuracy:", round(acc, 4))
    print(classification_report(y_test, preds))

    # Pin the label order explicitly (sorted union of true and predicted
    # labels) so the heatmap ticks can show the real class labels instead of
    # bare row/column positions.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(preds)]))
    cm = confusion_matrix(y_test, preds, labels=labels)

    # Always draw on a fresh figure: with backends where plt.show() does not
    # close figures, the heatmap would otherwise land on the previous plot.
    plt.figure()
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    return acc

# -------------------- Optimization --------------------
def optimize_parameters(model, feature_names, bounds, n_samples=1000, random_state=None):
    """
    Randomly search the parameter space, using the trained model as a surrogate.

    ``n_samples`` candidate settings are drawn uniformly inside ``bounds``,
    scored with ``model.predict`` and the first candidate that reaches the
    highest predicted label is returned.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Feature names in the column order the model was
            trained on. When non-empty, ``bounds`` must cover exactly these
            features and the candidates are laid out in this order. Pass an
            empty list or ``None`` to keep the ``bounds`` dict order
            unchecked.
        bounds: dict {feature: (min, max)}. No scaling is applied here, so
            the ranges must be expressed in the same units as the data the
            model was trained on.
        n_samples: Number of random candidates to evaluate (at least 1).
        random_state: Optional seed for reproducible sampling. ``None``
            keeps drawing from NumPy's global random state.

    Returns:
        ``pandas.Series`` holding the suggested value for each feature.

    Raises:
        ValueError: If ``n_samples`` is below 1, ``bounds`` is empty, or
            ``bounds`` does not match ``feature_names``.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")
    if not bounds:
        raise ValueError("bounds must define at least one feature range")

    # The model may read columns by position, so names are the only way to
    # guarantee each sampled range reaches the feature it was meant for.
    ordered = None
    if feature_names is not None and len(feature_names) > 0:
        ordered = list(feature_names)
        expected = set(ordered)
        missing = [feat for feat in ordered if feat not in bounds]
        unknown = [feat for feat in bounds if feat not in expected]
        if missing or unknown:
            raise ValueError(
                "bounds must cover exactly the model features; "
                f"missing bounds for {missing}, unknown features {unknown}"
            )

    # Unseeded calls keep using the global RNG so callers that rely on
    # np.random.seed(...) see the same draws as before.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    samples = {
        feat: rng.uniform(low, high, n_samples)
        for feat, (low, high) in bounds.items()
    }
    df_samples = pd.DataFrame(samples)
    if ordered is not None:
        df_samples = df_samples[ordered]

    # np.asarray lets list-returning models take the same path as arrays
    # instead of silently falling back to the first sample.
    preds = np.asarray(model.predict(df_samples))
    # assume higher quality class is max label; ties resolve to the first hit
    best_pos = int(preds.argmax())
    optimal = df_samples.iloc[best_pos]

    print("\nSuggested optimal parameters:")
    print(optimal)
    return optimal

# -------------------- Visualization --------------------
def plot_feature_importances(model, feature_names, name, top_n=10):
    """Bar-plot the most important features of a fitted model.

    Models without a ``feature_importances_`` attribute are skipped silently.

    Args:
        model: Fitted estimator, ideally exposing ``feature_importances_``.
        feature_names: Feature names indexed like ``feature_importances_``.
        name: Display name used in the plot title.
        top_n: Maximum number of features to show; fewer bars are drawn when
            the model has fewer features.
    """
    if not hasattr(model, "feature_importances_"):
        return

    importances = np.asarray(model.feature_importances_)
    # Clamp to the number of features actually available; otherwise the bar
    # positions and heights differ in length and matplotlib raises.
    shown = min(top_n, len(importances))
    indices = np.argsort(importances)[::-1][:shown]

    plt.figure(figsize=(8, 6))
    plt.bar(range(shown), importances[indices], align="center")
    plt.xticks(range(shown), [feature_names[i] for i in indices], rotation=45, ha="right")
    plt.title(f"{name} Top {shown} Features")
    plt.tight_layout()
    plt.show()

# -------------------- Main --------------------
def main(args):
    """Run the full pipeline: load data, train, evaluate, then optimise.

    Args:
        args: Parsed CLI namespace providing ``data_file`` and ``label_col``.
    """
    # Every model trained here is a tree ensemble, which is insensitive to
    # feature standardisation. Training on raw values keeps the model's input
    # space in physical units, matching the optimisation bounds below.
    X_train, X_test, y_train, y_test, feature_names = load_and_prepare_data(
        args.data_file, args.label_col, scale=False
    )

    models = train_models(X_train, y_train)
    # Start below any reachable accuracy so a model is always selected, even
    # when every candidate scores 0.
    best_model, best_acc = None, float("-inf")

    for name, model in models.items():
        acc = evaluate_model(model, X_test, y_test, name)
        plot_feature_importances(model, feature_names, name)
        if acc > best_acc:
            best_model, best_acc = model, acc

    print(f"Best model achieved accuracy: {round(best_acc, 4)}")

    # Optimization demo (example bounds)
    example_bounds = {
        "temperature": (1500, 1700),
        "composition_C": (0.02, 0.1),
        "flow_rate": (100, 300),
    }
    # The model needs a value for every feature it was trained on. Features
    # without an example range are searched over their observed training
    # range, and the dict follows the training column order.
    observed = np.asarray(X_train)
    lows, highs = observed.min(axis=0), observed.max(axis=0)
    bounds = {
        feat: example_bounds.get(feat, (lows[i], highs[i]))
        for i, feat in enumerate(feature_names)
    }
    optimize_parameters(best_model, feature_names, bounds)


if __name__ == "__main__":
    # Script entry point: both options are mandatory string arguments.
    cli = argparse.ArgumentParser(description="AI-Driven Steelmaking Optimization")
    for flag, help_text in (
        ("--data_file", "Path to sensor data CSV"),
        ("--label_col", "Target column for quality"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    main(cli.parse_args())