<a href="https://colab.research.google.com/github/guptamadhav/industrial_ai/blob/main/Kiln_Drying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!git clone https://github.com/guptamadhav/industrial_ai.git

Cloning into 'industrial_ai'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 1), reused 4 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (8/8), 22.66 KiB | 2.27 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [74]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                              classification_report, confusion_matrix, roc_auc_score)

In [89]:
MODELS_DIR  = '/content/industrial_ai/models'
PLOTS_DIR   = '/content/industrial_ai/plots'

In [40]:
data = pd.read_csv('/content/industrial_ai/data_schedules/kiln_drying_dataset.csv')

In [41]:
data.head()

Unnamed: 0,species,board_size,thickness_mm,density_kg_m3,fiber_saturation_pct,initial_MC_pct,schedule,airflow_m_s,final_MC_pct,total_drying_time_hr,total_energy_kWh,avg_kiln_temp_C,avg_WBD_C,avg_RH_pct,avg_EMC_pct,reached_target,nlga_compliant
0,Fir,2x10,38,471,27.0,73.9,T10-D5,4.2,18.95,28.6,245.9,94.5,23.7,38.4,2.91,1,1
1,Fir,4x4,89,471,27.0,79.2,T10-D5,4.5,18.69,165.9,1384.1,93.5,23.0,39.6,3.03,1,1
2,Pine,4x4,89,512,26.0,94.8,T8-D4,2.87,60.94,209.5,1160.1,65.6,6.5,72.7,7.18,0,0
3,Pine,2x8,38,512,26.0,127.0,T8-D4,4.95,18.76,93.9,593.8,72.0,10.6,63.9,6.11,1,1
4,Pine,2x10,38,512,26.0,107.9,T10-D5,3.69,18.96,52.4,392.0,85.0,17.3,50.4,4.27,1,1


In [29]:
data.describe()

Unnamed: 0,thickness_mm,density_kg_m3,fiber_saturation_pct,initial_MC_pct,airflow_m_s,final_MC_pct,total_drying_time_hr,total_energy_kWh,avg_kiln_temp_C,avg_WBD_C,avg_RH_pct,avg_EMC_pct,reached_target,nlga_compliant,species_enc,schedule_enc,board_enc
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,48.2,462.515,26.985,96.9448,3.50737,26.20534,101.8037,628.9139,75.6621,12.6176,59.4985,5.54113,0.813,0.49,0.99,1.365,2.003
std,20.410208,45.264983,0.815745,15.873982,0.866478,17.368864,60.734177,331.516153,9.350747,4.678521,10.059358,1.402639,0.390107,0.50015,0.812749,0.808968,1.413856
min,38.0,402.0,26.0,53.3,2.0,17.2,23.6,208.8,54.0,3.0,34.9,2.56,0.0,0.0,0.0,0.0,0.0
25%,38.0,402.0,26.0,85.5,2.78,18.63,56.0,398.75,68.375,9.2,52.8,4.54,1.0,0.0,0.0,1.0,1.0
50%,38.0,471.0,27.0,95.0,3.51,19.01,76.55,503.35,76.4,12.5,59.15,5.41,1.0,0.0,1.0,2.0,2.0
75%,38.0,512.0,28.0,106.6,4.27,19.58,133.15,762.35,82.225,15.4,65.225,6.3475,1.0,1.0,2.0,2.0,3.0
max,89.0,512.0,28.0,148.9,5.0,100.97,211.2,1664.6,97.3,26.0,85.0,9.32,1.0,1.0,2.0,2.0,4.0


In [9]:
# encode categorical variables as integers
le_species  = LabelEncoder()
le_schedule = LabelEncoder()
le_board    = LabelEncoder()

In [10]:
data['species_enc']  = le_species.fit_transform(data['species'])
data['schedule_enc'] = le_schedule.fit_transform(data['schedule'])
data['board_enc']    = le_board.fit_transform(data['board_size'])

In [11]:
FEATURE_COLS = [
    'species_enc',           # wood species
    'board_enc',             # board size
    'thickness_mm',          # board thickness
    'density_kg_m3',         # species density
    'fiber_saturation_pct',  # FSP
    'initial_MC_pct',        # green MC at batch load : strongest predictor
    'schedule_enc',          # kiln schedule used
    'airflow_m_s',           # fan airspeed
]

FEATURE_LABELS = [
    'Board Size',
    'Thickness (mm)',
    'Density (kg/m³)',
    'Fiber Saturation (%)',
    'Initial MC (%)',
    'Kiln Schedule',
    'Airflow (m/s)',
]

In [12]:
X = data[FEATURE_COLS].values
print(f"    Features ({len(FEATURE_COLS)}): {FEATURE_COLS}")

    Features (8): ['species_enc', 'board_enc', 'thickness_mm', 'density_kg_m3', 'fiber_saturation_pct', 'initial_MC_pct', 'schedule_enc', 'airflow_m_s']


Targets

In [13]:
y_MC     = data['final_MC_pct'].values          # regression target 1 : Final MC
y_time   = data['total_drying_time_hr'].values  # regression target 2 : Total Drying Time
y_energy = data['total_energy_kWh'].values      # regression target 3 : Total cost
y_nlga   = data['nlga_compliant'].values        # classification target : NLGA passed or not

# Split Dataset into Train, Test
**4 splits for 4 different models :**
1. final_MC_pct          (regression)  -- primary target
2. total_drying_time_hr  (regression)  -- scheduling target
3. total_energy_kWh      (regression)  -- cost target
4. nlga_compliant        (classification) -- pass/fail probability

In [14]:
train_x, test_x, train_y_MC, test_y_MC = train_test_split(X, y_MC, test_size=0.2, random_state=42, stratify = y_nlga)
_, _, train_y_time, test_y_time = train_test_split(X, y_time, test_size=0.2, random_state=42, stratify = y_nlga)
_, _, train_y_energy, test_y_energy = train_test_split(X, y_energy, test_size=0.2, random_state=42, stratify = y_nlga)
_, _, train_y_nlga, test_y_nlga = train_test_split(X, y_nlga, test_size=0.2, random_state=42, stratify = y_nlga)

In [90]:
print(f"Train: {train_x.shape[0]} runs   Test: {test_x.shape[0]} runs")
print(f"NLGA compliance in train: {train_y_energy.mean()*100:.1f}%")
print(f"NLGA compliance in test:  {test_y_energy.mean()*100:.1f}%")

Train: 800 runs   Test: 200 runs
NLGA compliance in train: 62852.8%
NLGA compliance in test:  63045.9%


# hyperparameters:

*   **n_estimators** : number of trees. More = better fit, slower training.100-200 .
*   **max_depth**   : depth of each tree. Deeper = more interactions captured.
*   **learning_rate**: shrinkage applied to each tree's contribution.Lower = more conservative, need more trees.
*   **subsample**    : fraction of training data used per tree (stochastic GB). 0.8 adds regularization, improves generalization.
*   **min_samples_leaf**: minimum samples in a leaf. Prevents fitting to noise.







In [16]:
# Regression for MC, energy and cost
PARAMS_REG = dict(
    n_estimators   = 200,
    max_depth      = 4,
    learning_rate  = 0.08,
    subsample      = 0.8,
    min_samples_leaf = 5,
    random_state   = 42,
)

# Classification for nlga compliance
PARAMS_CLF = dict(
    n_estimators   = 200,
    max_depth      = 4,
    learning_rate  = 0.08,
    subsample      = 0.8,
    min_samples_leaf = 5,
    random_state   = 42,
)


# Train Models


In [91]:
def train_regressor(X_train, y_train, name, params):
  print(f"\nTraining : {name}")
  model = GradientBoostingRegressor(**params)
  model.fit(X_train, y_train)
  # using R2 Cross Validation : given the features, can we predict the outcome well.
  cv_r2 = cross_val_score(model, X_train, y_train, cv = 5, scoring = 'r2')
  print(f"    CV R²: {cv_r2.mean():.4f} ± {cv_r2.std():.4f}")
  return model

def train_classifier(X_train, y_train, name, params):
  print(f"\nTraining : {name}")
  model = GradientBoostingClassifier(**params)
  model.fit(X_train, y_train)
  # using Area Under the ROC curve :  how well the model separates pass from fail batches across all possible decision thresholds
  cv_auc = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
  print(f"    CV AUC: {cv_auc.mean():.4f} ± {cv_auc.std():.4f}")
  return model

# save models
def save_pkl(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    print(f"Saved: {path}")


In [92]:
model_MC     = train_regressor(train_x, train_y_MC, "Final MC Predictor", PARAMS_REG)
model_time   = train_regressor(train_x, train_y_time,  "Drying Time Predictor", PARAMS_REG)
model_energy = train_regressor(train_x, train_y_energy,  "Energy Predictor", PARAMS_REG)
model_nlga   = train_classifier(train_x, train_y_nlga, "NLGA Compliance Classifier", PARAMS_CLF)


Training : Final MC Predictor
    CV R²: 0.9927 ± 0.0023

Training : Drying Time Predictor
    CV R²: 0.9941 ± 0.0042

Training : Energy Predictor
    CV R²: 0.9929 ± 0.0038

Training : NLGA Compliance Classifier
    CV AUC: 0.6689 ± 0.0246


In [94]:
save_pkl(model_MC, f'{MODELS_DIR}/model_final_MC.pkl')
save_pkl(model_time, f'{MODELS_DIR}/model_drying_time.pkl')
save_pkl(model_energy, f'{MODELS_DIR}/model_energy.pkl')
save_pkl(model_nlga, f'{MODELS_DIR}/model_nlga_classifier.pkl')

Saved: /content/industrial_ai/models/model_final_MC.pkl
Saved: /content/industrial_ai/models/model_drying_time.pkl
Saved: /content/industrial_ai/models/model_energy.pkl
Saved: /content/industrial_ai/models/model_nlga_classifier.pkl


# Evaluating on Test set

*   **R²**
*   **MAE** : Mean Absolute Error
*   **RMSE** : Root Mean Square Error. (penalizes big errors more), useful when large mistakes are costly




In [23]:
def eval_regressor(model, X_test, y_test, name, unit):
  y_pred = model.predict(X_test)
  r2   = r2_score(y_test, y_pred)
  mae  = mean_absolute_error(y_test, y_pred)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print(f"\n  {name}:")
  print(f"    R²   = {r2:.4f}  ")
  print(f"    MAE  = {mae:.2f} {unit}")
  print(f"    RMSE = {rmse:.2f} {unit}")
  return y_pred, r2, mae, rmse

In [33]:
pred_y_MC, r2_mc, mae_mc, rmse_mc = eval_regressor(model_MC, test_x, test_y_MC, "Final MC (%)", "%")
pred_y_time, r2_t, mae_t, rmse_t = eval_regressor(model_time, test_x, test_y_time,  "Drying Time", "hrs")
pred_y_energy, r2_e, mae_e, rmse_e = eval_regressor(model_energy, test_x, test_y_energy,  "Energy", "kWh")



  Final MC (%):
    R²   = 0.9930  
    MAE  = 0.83 %
    RMSE = 1.50 %

  Drying Time:
    R²   = 0.9930  
    MAE  = 2.43 hrs
    RMSE = 5.16 hrs

  Energy:
    R²   = 0.9851  
    MAE  = 18.78 kWh
    RMSE = 41.36 kWh


In [60]:
# Classification
pred_y_nlga      = model_nlga.predict(test_x)
yn_prob      = model_nlga.predict_proba(test_x)[:, 1]
auc          = roc_auc_score(test_y_nlga, yn_prob)
print(f"\n  NLGA Compliance Classifier:")
print(f"    AUC-ROC = {auc:.4f}  (1.0 = perfect; >0.90 = strong)")
print(f"\n  Classification Report:")
print(classification_report(test_y_nlga, yn_pred,
      target_names=['Non-compliant (>19%)', 'Compliant (<=19%)']))


  NLGA Compliance Classifier:
    AUC-ROC = 0.6720  (1.0 = perfect; >0.90 = strong)

  Classification Report:
                      precision    recall  f1-score   support

Non-compliant (>19%)       0.63      0.62      0.62       102
   Compliant (<=19%)       0.61      0.62      0.62        98

            accuracy                           0.62       200
           macro avg       0.62      0.62      0.62       200
        weighted avg       0.62      0.62      0.62       200



# Plots

In [95]:
def plot_actual_vs_pred(y_true, y_pred, title, unit, filename, r2, mae):
    fig, ax = plt.subplots(figsize=(7, 6))
    lims = [min(y_true.min(), y_pred.min()) * 0.95,
            max(y_true.max(), y_pred.max()) * 1.05]
    ax.plot(lims, lims, '--', color='#999', linewidth=1.5, label='Perfect prediction')
    ax.scatter(y_true, y_pred, alpha=0.5, s=25, color='#2E7D32', edgecolors='none')
    ax.set_xlabel(f'Actual {unit}', fontsize=12)
    ax.set_ylabel(f'Predicted {unit}', fontsize=12)
    ax.set_title(f'{title}\nR² = {r2:.3f}   MAE = {mae:.2f} {unit}', fontsize=13, fontweight='bold')
    ax.set_xlim(lims); ax.set_ylim(lims)
    ax.legend(fontsize=10)
    plt.tight_layout()
    ax.plot()
    plt.savefig(f'{PLOTS_DIR}/{filename}', dpi=150)
    plt.show()
    plt.close()
    print(f"Saved: {PLOTS_DIR}/{filename}")

In [96]:
plot_actual_vs_pred(test_y_MC, pred_y_MC, 'Final MC Prediction', '%', 'pred_MC.png', r2_mc, mae_mc)
plot_actual_vs_pred(test_y_time, pred_y_time, 'Drying Time Prediction', 'hrs', 'pred_time.png', r2_t, mae_t)
plot_actual_vs_pred(test_y_energy, pred_y_energy, 'Energy Prediction', 'kWh', 'pred_energy.png', r2_e, mae_e)

Saved: /content/industrial_ai/plots/pred_MC.png
Saved: /content/industrial_ai/plots/pred_time.png
Saved: /content/industrial_ai/plots/pred_energy.png


In [97]:
# generate confusion matric of NLGA class
fig, ax = plt.subplots(figsize=(5, 4))
cm = confusion_matrix(test_y_nlga, pred_y_nlga)
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=ax,
            xticklabels=['Fail (>19%)', 'Pass (≤19%)'],
            yticklabels=['Fail (>19%)', 'Pass (≤19%)'])
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title(f'NLGA Compliance Confusion Matrix\nAUC = {auc:.3f}',
             fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
plt.savefig(f'{PLOTS_DIR}/confusion_matrix.png', dpi=150)
plt.close()

In [88]:
# Residuals plot for primary target : MC, Moisture Control
residuals = test_y_MC - pred_y_MC # difference between predicted MC and actual MC
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(yMC_pred, residuals, alpha=0.5, s=25, color='#2E7D32', edgecolors='none')
ax.axhline(0, color='#999', linestyle='--', linewidth=1.5)
ax.set_xlabel('Predicted Final MC (%)', fontsize=12)
ax.set_ylabel('Residual (Actual − Predicted) %', fontsize=12)
ax.set_title('Residual Plot — Final MC Prediction\n(Random scatter = good; patterns = model bias)',
             fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{PLOTS_DIR}/residuals_MC.png', dpi=150)
plt.close()