In [None]:
!pip install lightgbm



In [None]:
pip install tensorflow



In [None]:
pip install pandas numpy scikit-learn lightgbm streamlit matplotlib seaborn

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [None]:
!pip install tensorflow==2.15.1 pandas numpy matplotlib seaborn scipy scikit-learn lightgbm streamlit pyngrok



Collecting tensorflow==2.15.1
  Downloading tensorflow-2.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow==2.15.1)
  Downloading ml_dtypes-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.15.1)
  Downloading protobuf-4.25.6-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.1)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_

# Plastic Injection Moulding Quality Prediction

## Overview
This project aims to predict the quality of plastic injection moulded parts using machine learning models. The dataset (`Dataset.csv`) contains features related to the moulding process (e.g., `Melt temperature`, `Mold temperature`, `time_to_fill`) and a target variable `quality` (1=Waste, 2=Acceptable, 3=Target, 4=Inefficient). The project is divided into four sections:

- **Section 1 (15%)**: Data Preprocessing & Exploratory Data Analysis (EDA)
- **Section 2 (15%)**: Hypothesis Testing & ANOVA
- **Section 3 (40%)**: Machine Learning Model Development (including ANN, which is mandatory)
- **Section 4 (20%)**: Interactive Dashboard Development using Streamlit

## Environment Setup
The notebook uses TensorFlow for the ANN model, which has caused issues locally on Windows (Python 3.9) due to version compatibility and DLL errors. An earlier local setup used TensorFlow 2.12.0; the install cell above pins TensorFlow 2.15.1. If DLL errors persist, run this notebook in WSL or Google Colab, where TensorFlow worked previously.

## Section 0: Import Libraries
### Purpose
Import all necessary libraries for data handling, visualization, statistical analysis, and machine learning.

### Thought Process
- `pandas` and `numpy` for data manipulation.
- `matplotlib` and `seaborn` for visualization.
- `scipy.stats` for ANOVA.
- `sklearn` for preprocessing, model training, and evaluation.
- `lightgbm` for the LightGBM model.
- `tensorflow` for the ANN model (mandatory).
- `warnings` to suppress unnecessary warnings.
- Set a random seed with `np.random.seed(42)` for reproducibility.

### Expected Observation
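Libraries should import successfully with the TensorFlow version pinned in the install cell (2.15.1). If the import fails with a NumPy binary-incompatibility `ValueError` (as in the output below), restart the kernel after running the install cells and re-run this cell.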

In [None]:
# Section 0: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, skew, kurtosis
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.base import BaseEstimator, ClassifierMixin
import warnings
# Silence library warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# NumPy's global seed alone does not control TensorFlow, which keeps its own
# generator; seed it as well so the ANN's weight initialisation is repeatable.
np.random.seed(42)
tf.random.set_seed(42)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Section 0: Load Dataset
### Purpose
Load the `Dataset.csv` file, which contains the features and target variable `quality`.

### Thought Process
- Use `latin1` encoding to handle potential encoding issues with the CSV file.
- The dataset should have columns like `Melt temperature`, `Mold temperature`, etc., and `quality` as the target.

### Expected Observation
Dataset should load successfully. Ensure the file is in the same directory as this notebook.

In [None]:
# Section 0: Load Dataset
# Read the raw CSV into the working frame; latin1 decoding accepts any byte sequence.
DATASET_PATH = 'Dataset.csv'
data = pd.read_csv(DATASET_PATH, encoding='latin1')

## Section 1: Data Preprocessing & EDA (15%)
### Dataset Overview
#### Purpose
Understand the dataset's structure, check for missing values, and explore the data distribution.

#### Thought Process
- Check the shape to understand the number of rows and columns.
- Check data types to ensure features are numerical.
- Check for missing values to decide if imputation is needed.

#### Expected Observation
The dataset should have multiple features and a `quality` target. Expect no missing values based on prior runs, but confirm here.

In [None]:
# Section 1: Data Preprocessing & EDA (15%)
## Dataset Overview
# Print the frame's dimensions, per-column dtypes and per-column null counts.
print("### Dataset Overview")
overview_items = (
    ("Shape:", data.shape),
    ("\nData Types:\n", data.dtypes),
    ("\nMissing Values:\n", data.isnull().sum()),
)
for heading, value in overview_items:
    print(heading, value)

### Feature Engineering
#### Purpose
Create a new feature to potentially improve model performance.

#### Thought Process
- Create a new feature `pressure_ratio` by dividing `APVs - Specific injection pressure peak value` by `APSs - Specific back pressure peak value`.
- This ratio may capture the relative pressure dynamics, potentially improving model performance.

#### Expected Observation
The new `pressure_ratio` feature should be added to the dataset, which may help models capture interactions between pressure-related features.

In [None]:
## Feature Engineering
print("\n### Feature Engineering")
# pressure_ratio = injection pressure peak / back pressure peak, computed row by row.
injection_peak = data['APVs - Specific injection pressure peak value']
back_pressure_peak = data['APSs - Specific back pressure peak value']
data['pressure_ratio'] = injection_peak.div(back_pressure_peak)
print("New feature 'pressure_ratio' added to the dataset.")
print("Sample values of 'pressure_ratio':\n", data['pressure_ratio'].head())

### Statistical Summary
#### Purpose
Compute basic statistics (mean, std, min, max) and check skewness and kurtosis to understand data distribution.

#### Thought Process
- Skewness indicates asymmetry (positive/negative skew).
- Kurtosis indicates the 'tailedness' of the distribution (high kurtosis = heavy tails).

#### Expected Observation
Features like `Melt temperature` may show varying skewness and kurtosis, indicating potential non-normality. This informs the need for standardization later.

In [None]:
## Statistical Summary
# Print describe() for every column, then skewness and kurtosis for each feature.
print("\n### Statistical Summary")
stats = data.describe()
print(stats)
# Exclude the target by name. Slicing off the last column would drop whichever column
# happens to be last (the engineered 'pressure_ratio' once it has been appended), not
# necessarily 'quality'.
feature_cols = data.columns.drop('quality', errors='ignore')
for col in feature_cols:
    print(f"{col} - Skewness: {skew(data[col]):.2f}, Kurtosis: {kurtosis(data[col]):.2f}")

In [None]:
## Data Preprocessing
# Load the dataset with encoding handling: try UTF-8 first, fall back to latin1.
try:
    data = pd.read_csv('Dataset.csv', encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv('Dataset.csv', encoding='latin1')

# Reloading replaces `data` with the raw file contents, which discards the engineered
# 'pressure_ratio' column. Re-create it here so the plots and models that follow
# still see the feature.
data['pressure_ratio'] = (
    data['APVs - Specific injection pressure peak value']
    / data['APSs - Specific back pressure peak value']
)

# Separate features and target
X = data.drop('quality', axis=1)
y = data['quality']

# Scale the features.
# NOTE: this scaler is fit on the full dataset; the train/test split cell further down
# fits a fresh scaler on the training rows only, and that one is used for modelling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("Scaled features (X_scaled) shape:", X_scaled.shape)

### Visualizations
#### Purpose
Visualize key features to understand their distributions and relationships with the target.

#### Thought Process
- Histogram with KDE of the engineered `pressure_ratio` feature to check its distribution.
- Boxplots of `Melt temperature`, `Mold temperature`, `time_to_fill`, and `pressure_ratio` against `quality` to see how each feature varies across quality levels.
- Correlation heatmap of all columns to spot strongly related features.

#### Expected Observation
The histogram may reveal a non-normal (e.g., skewed) distribution for `pressure_ratio`. The boxplots should show whether each feature varies noticeably across quality levels, suggesting its predictive power, and the heatmap should highlight strongly correlated feature pairs.

In [None]:
## Additional EDA Visualizations
print("\n### Additional EDA Visualizations")

# Histogram (with KDE overlay) of the engineered pressure_ratio feature
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['pressure_ratio'], kde=True, color='blue', ax=hist_ax)
hist_ax.set_title('Distribution of Pressure Ratio')
hist_ax.set_xlabel('Pressure Ratio')
hist_ax.set_ylabel('Frequency')
plt.show()

# Boxplots of key features by quality, one panel per feature in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Feature Distributions by Quality', fontsize=16)

boxplot_panels = [
    ('Melt temperature', 'Melt Temperature by Quality'),
    ('Mold temperature', 'Mold Temperature by Quality'),
    ('time_to_fill', 'Time to Fill by Quality'),
    ('pressure_ratio', 'Pressure Ratio by Quality'),
]
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for panel_ax, (feature, panel_title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(x='quality', y=feature, data=data, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Leave the top 5% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Correlation Heatmap over every column of the frame
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, ax=heat_ax)
heat_ax.set_title('Correlation Heatmap of Features')
plt.show()

### Data Cleaning
#### Purpose
Identify and quantify outliers using the Interquartile Range (IQR) method.

#### Thought Process
- Outliers are defined as points below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
- We’ll print the number of outliers per column but not remove them, as tree-based models (e.g., RandomForest) are robust to outliers.

#### Expected Observation
Some features may have outliers (e.g., `APVs - Specific injection pressure peak value`). Since we’re using robust models, we’ll keep them.

In [None]:
## Data Cleaning
# Count, per column, the values lying outside the 1.5*IQR fences (nothing is removed).
print("### Data Cleaning")
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (data.lt(lower_fence) | data.gt(upper_fence)).sum()
print("Outliers per column:\n", outliers)

### Train/Test Split and Standardization
#### Purpose
Split the data into training and testing sets, and standardize the features.

#### Thought Process
- Split the data into training (70%) and testing (30%) sets using `train_test_split` with `random_state` for reproducibility.
- Standardize features using `StandardScaler` to ensure all features are on the same scale, which is important for models like KNN and ANN.

#### Expected Observation
Data should be split into 70% training and 30% testing sets. Features should be standardized (mean=0, std=1) to improve model performance.

In [None]:
## Train/Test Split and Standardization
# Features are every column except the target; 30% of the rows are held out for testing.
X = data.drop(columns=['quality'])
y = data['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Section 2: Hypothesis Testing & ANOVA (15%)
### Purpose
Use ANOVA to test if each feature varies significantly across the four quality levels (1-4).

### Thought Process
- **Null Hypothesis (H0)**: The feature’s mean is the same across all quality levels.
- **Alternative Hypothesis (H1)**: The feature’s mean differs across quality levels.
- A low p-value (< 0.05) indicates statistical significance.

### Expected Observation
Features with p-values < 0.05 (e.g., `Melt temperature`) should be statistically significant predictors of quality, confirming their importance.

In [None]:
# Section 2: Hypothesis Testing & ANOVA (15%)
print("\n## Section 2: Hypothesis Testing & ANOVA")
print("### ANOVA Results")
quality_levels = range(1, 5)  # Quality: 1-4
# One-way ANOVA per feature: one sample group per quality level.
for col in X.columns:
    feature_values = X[col]
    groups = [feature_values[y == level] for level in quality_levels]
    f_stat, p_val = f_oneway(*groups)
    print(f"{col}: F-Statistic = {f_stat:.2f}, p-value = {p_val:.4f}")

## Section 3: Machine Learning Model Development (40%)
### Purpose
Train and evaluate multiple models to predict `quality`, including RandomForest, ExtraTrees, KNN, LightGBM, and ANN (mandatory).

### Thought Process
- Use `GridSearchCV` with 3-fold cross-validation to tune hyperparameters.
- Evaluate models using accuracy, precision, recall, F1-score, and ROC-AUC.

### Expected Observation
Models should train successfully. LightGBM previously showed the highest accuracy (0.926667). ANN performance will depend on the dataset.

### Evaluation Function
#### Purpose
Define a function to compute multiple metrics for consistent evaluation across models.

#### Thought Process
- Compute accuracy, precision, recall, and F1-score for overall performance.
- Compute ROC-AUC for multi-class classification performance.

#### Expected Observation
The evaluation function should be defined to compute all required metrics, ensuring fair comparison across models.

In [None]:
## Evaluation Function
def evaluate_model(y_true, y_pred, y_prob=None):
    """Compute classification metrics for one model.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        y_prob: Optional per-class probability matrix; when given, a
            one-vs-rest ROC-AUC is added to the result.

    Returns:
        dict mapping metric name ('Accuracy', 'Precision', 'Recall',
        'F1-Score' and, if y_prob was supplied, 'ROC-AUC') to its value.
        Precision, recall and F1 are weighted averages over the classes.
    """
    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}
    weighted_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(y_true, y_pred, average='weighted')
    if y_prob is None:
        return metrics
    metrics['ROC-AUC'] = roc_auc_score(y_true, y_prob, multi_class='ovr')
    return metrics

### ANN Wrapper for scikit-learn Compatibility
#### Purpose
Create a wrapper class for the ANN to make it compatible with `GridSearchCV` for consistent model comparison.

#### Thought Process
- The ANN has two hidden layers (64 and 32 neurons) with ReLU activation, and an output layer with softmax for 4 classes.
- Compile with Adam optimizer and sparse categorical crossentropy loss.

#### Expected Observation
The `ANNClassifier` wrapper should allow the ANN to be used with `GridSearchCV`, ensuring it’s evaluated like other models.

In [None]:
## ANN Wrapper for scikit-learn Compatibility
class ANNClassifier(ClassifierMixin, BaseEstimator):
    """Scikit-learn compatible wrapper around a small Keras feed-forward classifier.

    The network has two ReLU hidden layers (64 and 32 units) and a softmax output
    with one unit per class seen in `fit`. Mixins are listed before `BaseEstimator`,
    as scikit-learn recommends, so the classifier mixin is not shadowed.

    Parameters:
        input_dim: Number of input features.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.

    Attributes set by `fit`:
        classes_: Sorted array of the distinct labels seen during training.
        model: The fitted Keras `Sequential` model (None before fitting).
    """

    def __init__(self, input_dim, epochs=20, batch_size=32):
        self.input_dim = input_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None

    def fit(self, X, y):
        """Build a fresh network and train it on (X, y); returns self."""
        # sparse_categorical_crossentropy needs labels 0..n_classes-1, so encode
        # whatever labels arrive and remember the originals for predict().
        self.classes_, y_encoded = np.unique(np.asarray(y), return_inverse=True)
        self.model = Sequential([
            Dense(64, activation='relu', input_shape=(self.input_dim,)),
            Dense(32, activation='relu'),
            # One output unit per observed class instead of a hard-coded 4.
            Dense(len(self.classes_), activation='softmax')
        ])
        self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        self.model.fit(X, y_encoded, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        y_prob = self.predict_proba(X)
        # Map argmax positions back to the original labels (identity for labels 0..k-1).
        return self.classes_[np.argmax(y_prob, axis=1)]

    def predict_proba(self, X):
        """Return the softmax class probabilities, columns ordered like `classes_`."""
        return self.model.predict(X, verbose=0)

### Model Training and Evaluation
#### Purpose
Define models and hyperparameters, then train and evaluate them using `GridSearchCV`.

#### Thought Process
- Models: RandomForest, ExtraTrees, KNN, LightGBM, and ANN.
- Use 3-fold cross-validation to tune hyperparameters.
- Adjust `y_train` and `y_pred` for 0-based indexing (TensorFlow expects 0-3, but dataset has 1-4).

#### Expected Observation
LightGBM previously showed the highest accuracy (0.926667). ANN performance will depend on the dataset, but expect it to be competitive.

In [None]:
## Model Training and Evaluation
# Each entry maps a model name to (estimator, hyper-parameter grid for GridSearchCV).
models = {
    'RandomForest': (RandomForestClassifier(), {'n_estimators': [50, 100], 'max_depth': [10, 20]}),
    'ExtraTrees': (ExtraTreesClassifier(), {'n_estimators': [50, 100], 'max_depth': [10, 20]}),
    'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
    'LightGBM': (LGBMClassifier(), {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}),
    'ANN': (ANNClassifier(input_dim=X_train_scaled.shape[1]), {'epochs': [20], 'batch_size': [32]})
}

results = {}
# Test-set predictions per model (labels 1-4). The confusion-matrix plots in the
# Model Comparison cell read this dict, so it has to be filled here.
model_predictions = {}
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for name, (model, params) in models.items():
    print(f"\n### Training {name}")
    grid = GridSearchCV(model, params, cv=kf, scoring='accuracy')
    # Train on 0-based labels (0-3); shift predictions back to the dataset's 1-4 codes.
    grid.fit(X_train_scaled, y_train - 1)
    y_pred = grid.predict(X_test_scaled) + 1
    y_prob = grid.predict_proba(X_test_scaled)
    model_predictions[name] = y_pred
    results[name] = evaluate_model(y_test, y_pred, y_prob)
    print(f"Best Params: {grid.best_params_}")
    print(f"Results: {results[name]}")

### Feature Importance (Random Forest)
#### Purpose
Use RandomForest to compute feature importance, identifying which features contribute most to quality prediction.

#### Thought Process
- Train a RandomForest model with the best hyperparameters found (`n_estimators=100`, `max_depth=20`).
- Plot feature importance as a horizontal bar chart.

#### Expected Observation
Features like `Melt temperature` and `pressure_ratio` may rank high, aligning with ANOVA results, indicating their predictive power.

In [None]:
## Feature Importance (Random Forest)
# Fit a forest on the scaled training data (0-based labels) and rank its importances.
rf = RandomForestClassifier(n_estimators=100, max_depth=20)
rf.fit(X_train_scaled, y_train - 1)
feat_importance = pd.Series(rf.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
# Ascending sort puts the most important feature at the top of the horizontal bars.
feat_importance.sort_values().plot.barh()
plt.title('Feature Importance (Random Forest)')
plt.show()

### Model Comparison
#### Purpose
Compile the evaluation results into a table for easy comparison across models.

#### Thought Process
- Include all models: RandomForest, ExtraTrees, KNN, LightGBM, and ANN.
- Metrics: Accuracy, Precision, Recall, F1-Score, ROC-AUC.

#### Expected Observation
LightGBM previously outperformed others (Accuracy: 0.926667). ANN should be competitive, but its performance depends on the dataset complexity.

In [None]:
## Model Comparison
# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\n### Model Comparison\n", results_df)

# Bar Chart for Model Accuracy
plt.figure(figsize=(8, 5))
sns.barplot(x='Accuracy', y=results_df.index, data=results_df, palette='viridis')
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.ylabel('Model')
plt.show()

# Confusion Matrices for Each Model
# NOTE(review): `model_predictions` (model name -> predicted test labels, 1-4) is not
# created in this cell; it must already exist when this cell runs.
quality_codes = [1, 2, 3, 4]
quality_labels = ['Waste', 'Acceptable', 'Target', 'Inefficient']

# Size the grid from the number of models instead of assuming it fits in 3x2.
n_models = len(model_predictions)
n_cols = 2
n_rows = max(1, (n_models + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
fig.suptitle('Confusion Matrices for Each Model', fontsize=16)
flat_axes = axes.ravel()

for ax, (model_name, y_pred) in zip(flat_axes, model_predictions.items()):
    # Passing `labels` keeps the matrix 4x4 and aligned with the tick labels even
    # when a class is absent from y_test or from a model's predictions.
    cm = confusion_matrix(y_test, y_pred, labels=quality_codes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=quality_labels, yticklabels=quality_labels)
    ax.set_title(f'Confusion Matrix: {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Remove any unused subplot(s) at the end of the grid
for unused_ax in flat_axes[n_models:]:
    fig.delaxes(unused_ax)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

## Section 4: Interactive Dashboard Development (20%)
### Purpose
Create a Streamlit dashboard for interactive quality prediction.

### Thought Process
- Allow users to input key features via sliders.
- Use the trained RandomForest model for prediction.
- Display prediction probabilities, feature importance, and confusion matrix.

### Expected Observation
The dashboard should allow users to input key features and see the predicted quality, along with visualizations for interpretability.

In [None]:
# Section 4: Interactive Dashboard Development (20%)
# Builds the source of a standalone Streamlit app as a string and writes it to
# 'dashboard.py' next to the notebook.
print("\n## Section 4: Interactive Dashboard Development")
dashboard_code = """
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Same file name (including case) that the notebook reads, so the app also works on
# case-sensitive file systems.
DATA_PATH = 'Dataset.csv'
APV_COL = 'APVs - Specific injection pressure peak value'
APS_COL = 'APSs - Specific back pressure peak value'
quality_map = {1: 'Waste', 2: 'Acceptable', 3: 'Target', 4: 'Inefficient'}


@st.cache_resource
def load_and_train():
    # Streamlit re-runs this script on every widget interaction; caching keeps the
    # data, scaler and model from being rebuilt (and re-randomised) each time.
    try:
        data = pd.read_csv(DATA_PATH, encoding='utf-8')
    except UnicodeDecodeError:
        data = pd.read_csv(DATA_PATH, encoding='latin1')

    data['pressure_ratio'] = data[APV_COL] / data[APS_COL]
    X = data.drop('quality', axis=1)
    y = data['quality']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fixed seed so the same inputs always produce the same prediction.
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
    rf_model.fit(X_scaled, y - 1)
    return X, y, scaler, X_scaled, rf_model


X, y, scaler, X_scaled, rf_model = load_and_train()

# Dashboard
st.title('Plastic Injection Moulding Quality Predictor')


def feature_slider(label, column, default):
    # Slider spanning the observed range of `column`; the default is clamped into
    # that range because Streamlit rejects a default outside [min, max].
    low = float(X[column].min())
    high = float(X[column].max())
    return st.slider(label, low, high, min(max(default, low), high))


# User Inputs
melt_temp = feature_slider('Melt Temperature (°C)', 'Melt temperature', 106.0)
mold_temp = feature_slider('Mold Temperature (°C)', 'Mold temperature', 81.0)
time_to_fill = feature_slider('Time to Fill (s)', 'time_to_fill', 6.5)
shot_volume = feature_slider('Shot Volume (cm³)', 'SVo - Shot volume', 18.7)

# Prepare input: start from the dataset median of every feature, then override the
# slider-controlled features by name so the row does not depend on column order.
input_data = X.median().to_frame().T
input_data['Melt temperature'] = melt_temp
input_data['Mold temperature'] = mold_temp
input_data['time_to_fill'] = time_to_fill
input_data['SVo - Shot volume'] = shot_volume
input_data['pressure_ratio'] = input_data[APV_COL] / input_data[APS_COL]
input_data = input_data[X.columns]
input_scaled = scaler.transform(input_data)

# Prediction (the model was trained on 0-based labels, so shift back to 1-4)
pred = int(rf_model.predict(input_scaled)[0]) + 1
probs = rf_model.predict_proba(input_scaled)[0]
st.write(f'Predicted Quality: **{quality_map[pred]}**')

# Layout for plots
col1, col2, col3 = st.columns(3)

# Probability Bar Chart
with col1:
    st.subheader('Prediction Probabilities')
    fig, ax = plt.subplots(figsize=(5, 3))
    # Label bars from the classes the model actually learned so names and
    # probabilities always line up.
    class_names = [quality_map[int(c) + 1] for c in rf_model.classes_]
    ax.bar(class_names, probs)
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)
    plt.close(fig)

# Feature Importance
with col2:
    st.subheader('Feature Importance')
    fig, ax = plt.subplots(figsize=(5, 3))
    feat_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
    feat_importance.sort_values().plot(kind='barh', ax=ax)
    ax.set_title('Feature Importance')
    st.pyplot(fig)
    plt.close(fig)

# Confusion Matrix
with col3:
    st.subheader('Confusion Matrix')
    # NOTE: computed on the same rows the model was fitted on, so it shows fit
    # quality rather than held-out performance.
    y_pred_full = rf_model.predict(X_scaled) + 1
    cm = confusion_matrix(y, y_pred_full)
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    st.pyplot(fig)
    plt.close(fig)
"""

# Write the dashboard file
with open('dashboard.py', 'w', encoding='utf-8') as f:
    f.write(dashboard_code)
print("Dashboard code saved as 'dashboard.py'.")

### Launch Streamlit Dashboard
#### Purpose
Launch the Streamlit dashboard from Jupyter Notebook.

#### Thought Process
- Use `subprocess` to run the Streamlit app.
- The dashboard should open at `http://localhost:8501`.

#### Expected Observation
The dashboard should launch at `http://localhost:8501`, providing an interactive interface for quality prediction.

In [None]:
## Launch Streamlit Dashboard
# Start 'dashboard.py' with Streamlit as a background child process of the kernel.
import subprocess
import sys
import time

print("Launching Streamlit dashboard...")
try:
    # Send the server's output to a log file. Unread PIPEs can fill up and block
    # the child process, and they hide the reason for a failed start.
    with open("streamlit.log", "w", encoding="utf-8") as log_file:
        process = subprocess.Popen([sys.executable, "-m", "streamlit", "run", "dashboard.py"],
                                   stdout=log_file, stderr=subprocess.STDOUT)
    # Give the server a moment to start (or to fail) before reporting status.
    time.sleep(3)
    if process.poll() is None:
        print("Dashboard should be running. Open your browser to: http://localhost:8501")
        # Interrupting the kernel does not stop a Popen child; it must be terminated explicitly.
        print("To stop the dashboard, run 'process.terminate()' in a new cell.")
    else:
        print(f"Streamlit exited early with code {process.returncode}; see 'streamlit.log' for details.")
        print("Try running 'streamlit run dashboard.py' manually in a terminal in the same directory.")
except OSError as e:
    print(f"Failed to launch dashboard: {e}")
    print("Try running 'streamlit run dashboard.py' manually in a terminal in the same directory.")