# Water Quality Prediction with TensorFlow

## Professional Machine Learning Project

This notebook demonstrates a complete machine learning pipeline for predicting water quality for consumption based on three key sensor measurements:

- **TDS (Total Dissolved Solids)**: Measures dissolved inorganic and organic substances (mg/L)
- **Turbidity**: Measures water clarity/cloudiness (NTU - Nephelometric Turbidity Units)
- **pH**: Measures acidity/alkalinity levels (0-14 scale)

### Project Objectives
1. Generate realistic synthetic water quality data
2. Build a TensorFlow neural network for classification
3. Predict water quality categories: Poor, Acceptable, Good, Excellent
4. Provide actionable insights for water consumption safety

## 1. Import Required Libraries

Import all necessary libraries for machine learning, data analysis, and visualization.

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import class_weight

# Seed NumPy and TensorFlow with the same value so that repeated runs draw
# the same random numbers.
RANDOM_SEED = 42
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(RANDOM_SEED)

# Plot appearance: seaborn-like style sheet and a 12x8 default figure size.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Report the runtime environment.
gpu_devices = tf.config.list_physical_devices('GPU')
print("TensorFlow version:", tf.__version__)
print("GPU Available:", gpu_devices)
print("Libraries imported successfully!")

## 2. Generate Synthetic Water Quality Dataset

Create a realistic synthetic dataset based on WHO and EPA water quality standards.

In [None]:
# Water quality classes: integer class id -> human-readable label,
# listed from worst (0) to best (3).
QUALITY_LABELS = dict(enumerate(('Poor', 'Acceptable', 'Good', 'Excellent')))

def determine_quality_score(tds, turbidity, ph):
    """Map one set of sensor readings to a discrete quality class.

    Each reading receives a sub-score from 0 (worst) to 3 (best). The three
    sub-scores are combined as a weighted mean (pH 0.4, TDS 0.3,
    turbidity 0.3) and the mean is bucketed into a class id.

    Args:
        tds: Total dissolved solids reading.
        turbidity: Turbidity reading.
        ph: pH reading.

    Returns:
        int: 0 (Poor), 1 (Acceptable), 2 (Good) or 3 (Excellent).
    """
    def upper_bound_score(reading, limits):
        # ``limits`` holds the ascending upper bounds for scores 3, 2 and 1;
        # a reading above all of them scores 0.
        for score, limit in zip((3, 2, 1), limits):
            if reading <= limit:
                return score
        return 0

    # pH is scored by nested bands around the optimal 7.0-7.5 window.
    ph_score = 0
    ph_bands = ((7.0, 7.5), (6.5, 8.5), (6.0, 9.0))
    for score, (low, high) in zip((3, 2, 1), ph_bands):
        if low <= ph <= high:
            ph_score = score
            break

    # Lower is better for both TDS and turbidity.
    tds_score = upper_bound_score(tds, (300, 600, 900))
    turbidity_score = upper_bound_score(turbidity, (1, 4, 10))

    # Weighted mean; weights and scores are both ordered pH, TDS, turbidity.
    combined = np.average(
        [ph_score, tds_score, turbidity_score],
        weights=[0.4, 0.3, 0.3],
    )

    # Bucket the weighted mean, checking thresholds from best to worst.
    for quality, threshold in ((3, 2.5), (2, 1.5), (1, 0.5)):
        if combined >= threshold:
            return quality
    return 0

def generate_realistic_sample(quality_target=None):
    """Draw one synthetic water sample aimed at a given quality class.

    Args:
        quality_target: Class id to aim for (3 Excellent, 2 Good,
            1 Acceptable; any other value is treated as Poor). When None,
            a class is drawn with probabilities 0.15 / 0.25 / 0.35 / 0.25
            for classes 0..3.

    Returns:
        tuple: ``(tds, turbidity, ph)`` after clipping to
        [50, 3000], [0.1, 50] and [4.0, 12.0] respectively.
    """
    if quality_target is None:
        quality_target = np.random.choice([0, 1, 2, 3], p=[0.15, 0.25, 0.35, 0.25])

    # Per-class distribution parameters. ``ph_params`` stays None for the
    # Poor class, whose pH comes from a two-sided draw instead.
    ph_params = None
    if quality_target == 3:  # Excellent
        ph_params, tds_params, turbidity_scale = (7.25, 0.15), (200, 50), 0.5
    elif quality_target == 2:  # Good
        ph_params, tds_params, turbidity_scale = (7.0, 0.4), (450, 100), 2.0
    elif quality_target == 1:  # Acceptable
        ph_params, tds_params, turbidity_scale = (6.8, 0.8), (750, 150), 6.0
    else:  # Poor
        tds_params, turbidity_scale = (1200, 300), 15.0

    # Draw order is pH, then TDS, then turbidity.
    if ph_params is None:
        too_acidic = np.random.normal(5.5, 0.5)
        too_alkaline = np.random.normal(9.5, 0.5)
        raw_ph = np.random.choice([too_acidic, too_alkaline])
    else:
        raw_ph = np.random.normal(*ph_params)
    raw_tds = np.random.normal(*tds_params)
    raw_turbidity = np.random.exponential(turbidity_scale)

    # Clip each reading to its allowed range before returning.
    return (
        np.clip(raw_tds, 50, 3000),
        np.clip(raw_turbidity, 0.1, 50),
        np.clip(raw_ph, 4.0, 12.0),
    )

# Generate dataset
# Builds a list of row dicts (one per sample), shuffles it in place and wraps
# it in the DataFrame `df`. All randomness comes from the global NumPy RNG.
print("Generating water quality dataset...")
n_samples = 8000
data = []

# Share of samples to draw with each target class
quality_distribution = [0.15, 0.25, 0.35, 0.25]  # Poor, Acceptable, Good, Excellent

for quality in range(4):
    # int() truncates, so the per-class counts may not add up to n_samples
    # if a product is not a whole number
    n_samples_quality = int(n_samples * quality_distribution[quality])
    
    for _ in range(n_samples_quality):
        # `quality` only steers the distributions the readings are drawn from
        tds, turbidity, ph = generate_realistic_sample(quality)
        
        # The stored label is recomputed from the drawn readings by the
        # scoring rule; it is not forced to equal the target `quality`
        actual_quality = determine_quality_score(tds, turbidity, ph)
        
        # Label noise: with probability 0.1 replace the label by a random pick
        # between the neighbouring classes, clamped to 0..3 (so at class 0 or
        # 3 the pick may leave the label unchanged)
        if np.random.random() < 0.1:  # 10% noise
            actual_quality = np.random.choice([max(0, actual_quality-1),
                                             min(3, actual_quality+1)])
        
        # Readings are stored rounded to two decimals
        data.append({
            'tds': round(tds, 2),
            'turbidity': round(turbidity, 2),
            'ph': round(ph, 2),
            'quality': actual_quality,
            'quality_label': QUALITY_LABELS[actual_quality]
        })

# Shuffle the rows in place (they were appended class by class), then build
# the DataFrame
np.random.shuffle(data)
df = pd.DataFrame(data)

print(f"Dataset created with {len(df)} samples")
print(f"Quality distribution:\n{df['quality_label'].value_counts()}")
print("\nFirst 5 rows:")
# Last expression of the cell: the first rows of df
df.head()

## 3. Data Preprocessing and Exploration

Explore the dataset structure, visualize distributions, and analyze relationships between features.

In [None]:
# Dataset overview: shape, column dtypes and null counts, followed by the
# summary statistics table as the cell's final expression.
missing_per_column = df.isnull().sum()
overview_lines = [
    "=== Dataset Information ===",
    f"Shape: {df.shape}",
    f"\nData types:\n{df.dtypes}",
    f"\nMissing values:\n{missing_per_column}",
    "\nBasic statistics:",
]
print("\n".join(overview_lines))
df.describe()

In [None]:
# Visualize feature distributions by quality
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

features = ['tds', 'turbidity', 'ph']
colors = ['red', 'orange', 'lightblue', 'green']
# Canonical class order: colors[k] always belongs to quality_order[k].
quality_order = ['Poor', 'Acceptable', 'Good', 'Excellent']

# One histogram panel per feature (first three panels), classes overlaid
for ax, feature in zip(axes.flat, features):
    for quality, color in zip(quality_order, colors):
        subset = df[df['quality_label'] == quality]
        ax.hist(subset[feature], alpha=0.7, label=quality, bins=30, color=color)

    ax.set_xlabel(feature.upper())
    ax.set_ylabel('Frequency')
    ax.set_title(f'{feature.upper()} Distribution by Quality')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Quality distribution pie chart (fourth panel)
ax = axes[1, 1]
# value_counts() sorts by frequency, which would pair the wedges with the
# wrong entries of `colors`; reindex to the canonical order so each class
# keeps the same color it has in the histograms.
quality_counts = df['quality_label'].value_counts().reindex(quality_order, fill_value=0)
ax.pie(quality_counts.values, labels=quality_counts.index, autopct='%1.1f%%',
       colors=colors, startangle=90)
ax.set_title('Overall Quality Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
plt.figure(figsize=(10, 8))

# Pairwise correlations between the three features and the numeric label
numeric_columns = ['tds', 'turbidity', 'ph', 'quality']
numeric_df = df[numeric_columns].copy()
correlation_matrix = numeric_df.corr()

# Annotated heatmap, color scale centered on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.3f',
    cbar_kws={'label': 'Correlation Coefficient'},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Text summary: each feature's correlation with the label column
print("Correlation insights:")
quality_correlations = correlation_matrix['quality']
for feature in numeric_columns[:-1]:
    print(f"- {feature.upper()} correlation with quality: {quality_correlations[feature]:.3f}")

In [None]:
# Pairwise feature relationships
# sns.pairplot creates its own figure, so no plt.figure() call is made here
# (calling it would only leave an empty extra figure behind).

# Fix the class order and map each class to its color explicitly; without
# this the list palette is assigned in whatever order the labels happen to
# appear in the data.
quality_order = ['Poor', 'Acceptable', 'Good', 'Excellent']
quality_palette = dict(zip(quality_order, ['red', 'orange', 'lightblue', 'green']))

# Create pairplot
plot_df = df[['tds', 'turbidity', 'ph', 'quality_label']].copy()
g = sns.pairplot(plot_df, hue='quality_label', hue_order=quality_order,
                 diag_kind='hist',
                 plot_kws={'alpha': 0.6}, diag_kws={'alpha': 0.7},
                 palette=quality_palette)

g.fig.suptitle('Feature Relationships by Water Quality', y=1.02, fontsize=16)
plt.show()

## 4. Feature Engineering and Scaling

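Prepare the data for machine learning: split it into stratified train/validation/test sets (60% / 20% / 20%), standardize the features with a scaler fitted on the training set only, one-hot encode the labels, and compute class weights.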

In [None]:
# Prepare features and target
feature_columns = ['tds', 'turbidity', 'ph']
X, y = df[feature_columns].copy(), df['quality'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
target_counts = pd.Series(y).value_counts().sort_index()
print(f"\nTarget distribution:\n{target_counts}")

# Two-stage stratified split: hold out 20% of all rows for testing, then
# 25% of the remainder for validation.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Report the size of each split as a share of the full dataset
print("\nData split:")
total_rows = len(df)
named_splits = (
    ("Training set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
)
for split_name, split_features in named_splits:
    n_rows = split_features.shape[0]
    print(f"{split_name}: {n_rows} samples ({n_rows/total_rows*100:.1f}%)")

In [None]:
# Feature scaling
scaler = StandardScaler()

# Fit the scaler on the training data only, then apply the same transform to
# the validation and test sets.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert labels to one-hot vectors for the categorical cross-entropy loss
y_train_cat = to_categorical(y_train, num_classes=4)
y_val_cat = to_categorical(y_val, num_classes=4)
y_test_cat = to_categorical(y_test, num_classes=4)

print("Feature scaling completed")
print(f"Scaled training features shape: {X_train_scaled.shape}")
print(f"Categorical training labels shape: {y_train_cat.shape}")

# Calculate class weights for imbalanced dataset
present_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=present_classes,
    y=y_train
)
# Key each weight by its actual class id. enumerate() would key by position,
# which only equals the class id when every class 0..N-1 occurs in y_train.
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(present_classes, class_weights)
}
print(f"\nClass weights: {class_weight_dict}")

## 5. Build TensorFlow Neural Network Model

Design and compile a neural network architecture optimized for water quality classification.

In [None]:
def build_water_quality_model(input_dim=3, num_classes=4):
    """Create and compile the dense classifier.

    The network is three ReLU Dense blocks (64, 32 and 16 units), each with
    L2 kernel regularization (0.001) and followed by BatchNormalization and
    Dropout, then a softmax output layer.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes (units of the softmax layer).

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001,
        categorical cross-entropy loss, accuracy metric).
    """
    # (units, dropout rate, layer name) for each hidden block, in order
    block_specs = [
        (64, 0.3, 'input_layer'),
        (32, 0.3, 'hidden_1'),
        (16, 0.2, 'hidden_2'),
    ]

    stack = []
    for position, (units, drop_rate, layer_name) in enumerate(block_specs):
        # Only the first Dense layer declares the input dimensionality
        first_layer_kwargs = {'input_dim': input_dim} if position == 0 else {}
        stack.append(Dense(units, activation='relu',
                           kernel_regularizer=l2(0.001), name=layer_name,
                           **first_layer_kwargs))
        stack.append(BatchNormalization())
        stack.append(Dropout(drop_rate))

    # Output layer
    stack.append(Dense(num_classes, activation='softmax', name='output_layer'))

    model = Sequential(stack)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Instantiate the network with one input per scaled feature column
n_features = X_train_scaled.shape[1]
model = build_water_quality_model(input_dim=n_features)

# Layer-by-layer summary
print("Neural Network Architecture:")
model.summary()

# Architecture diagram; kept as the cell's final expression
diagram_options = dict(show_shapes=True, show_layer_names=True, dpi=150)
tf.keras.utils.plot_model(model, **diagram_options)

## 6. Train the Model

Train the neural network with callbacks for optimal performance.

In [None]:
# Create training callbacks
# Stop after 10 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
)
# Multiply the learning rate by 0.2 after 5 stagnant epochs, down to 0.0001
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001, verbose=1
)
callbacks = [early_stopping, lr_reducer]

print("Starting model training...")
print("This may take a few minutes...")

# Train the model
fit_options = dict(
    validation_data=(X_val_scaled, y_val_cat),
    epochs=100,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_scaled, y_train_cat, **fit_options)

print("\nTraining completed!")

In [None]:
# Plot training history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: loss curves; right panel: accuracy curves
curves = history.history
panels = ((ax1, 'loss', 'Loss'), (ax2, 'accuracy', 'Accuracy'))
for axis, metric_key, metric_name in panels:
    axis.plot(curves[metric_key], label=f'Training {metric_name}', linewidth=2)
    axis.plot(curves[f'val_{metric_key}'], label=f'Validation {metric_name}', linewidth=2)
    axis.set_title(f'Model {metric_name} During Training', fontsize=14)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(metric_name)
    axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print final metrics (the values recorded for the last epoch that ran)
final_train_acc = curves['accuracy'][-1]
final_val_acc = curves['val_accuracy'][-1]
final_train_loss = curves['loss'][-1]
final_val_loss = curves['val_loss'][-1]

print("\nFinal Training Metrics:")
print(f"Training Accuracy: {final_train_acc:.4f}")
print(f"Validation Accuracy: {final_val_acc:.4f}")
print(f"Training Loss: {final_train_loss:.4f}")
print(f"Validation Loss: {final_val_loss:.4f}")

## 7. Model Evaluation and Metrics

Evaluate model performance on the test set and analyze classification metrics.

In [None]:
# Make predictions on test set: class probabilities, then the most likely class
y_test_pred_proba = model.predict(X_test_scaled)
y_test_pred = y_test_pred_proba.argmax(axis=1)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification report with the human-readable class names
class_names = [QUALITY_LABELS[class_id] for class_id in range(4)]
print("\n=== Classification Report ===")
report = classification_report(
    y_test, y_test_pred, target_names=class_names, digits=4
)
print(report)

In [None]:
# Confusion matrix visualization
class_ids = list(range(4))
class_names = [QUALITY_LABELS[i] for i in class_ids]

# Pin the label set so the matrix is always 4x4 (rows = actual class,
# columns = predicted class), even if a class is missing from both the test
# labels and the predictions.
cm = confusion_matrix(y_test, y_test_pred, labels=class_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            cbar_kws={'label': 'Number of Samples'})
plt.title('Confusion Matrix - Water Quality Prediction', fontsize=16)
plt.xlabel('Predicted Quality', fontsize=12)
plt.ylabel('Actual Quality', fontsize=12)
plt.tight_layout()
plt.show()

# Calculate per-class accuracy: correctly predicted samples of a class
# divided by all actual samples of that class
print("\nPer-class accuracy:")
for i in class_ids:
    row_total = cm[i, :].sum()
    # Guard against a class with no test samples (would divide by zero)
    class_accuracy = cm[i, i] / row_total if row_total > 0 else 0
    print(f"{QUALITY_LABELS[i]}: {class_accuracy:.4f}")

## 8. Real-time Prediction Function

Create a function for making real-time predictions on new sensor readings.

In [None]:
def predict_water_quality(tds, turbidity, ph, model, scaler):
    """
    Classify one water sample from its three sensor readings.

    Readings outside the accepted ranges are rejected before the model is
    called; pH is checked first, then TDS, then turbidity.

    Args:
        tds (float): Total Dissolved Solids (mg/L), accepted range 0-5000
        turbidity (float): Turbidity (NTU), accepted range 0-100
        ph (float): pH level, accepted range 4.0-12.0
        model: Trained model; ``predict`` must return one row of class
            probabilities per input row
        scaler: Fitted scaler; ``transform`` is applied to a single-row
            array ordered TDS, turbidity, pH

    Returns:
        dict: ``{"error": message}`` for an out-of-range reading, otherwise
        the keys ``quality_class``, ``quality_label``, ``confidence``,
        ``probabilities`` and ``recommendation``.
    """
    # Validate inputs, one reading at a time
    ph_in_range = 4.0 <= ph <= 12.0
    if not ph_in_range:
        return {"error": f"pH {ph} is outside valid range (4.0-12.0)"}

    tds_in_range = 0 <= tds <= 5000
    if not tds_in_range:
        return {"error": f"TDS {tds} is outside valid range (0-5000 mg/L)"}

    turbidity_in_range = 0 <= turbidity <= 100
    if not turbidity_in_range:
        return {"error": f"Turbidity {turbidity} is outside valid range (0-100 NTU)"}

    # Scale the single sample and run the model on it
    features = np.array([[tds, turbidity, ph]])
    pred_proba = model.predict(scaler.transform(features), verbose=0)
    class_probabilities = pred_proba[0]
    pred_class = np.argmax(pred_proba, axis=1)[0]
    confidence = class_probabilities[pred_class]

    # Recommendation: a low-confidence result overrides the per-class advice;
    # any class other than 1-3 falls back to the "Poor" message
    advice_by_class = {
        3: "✅ Excellent water quality. Safe for consumption.",
        2: "✅ Good water quality. Generally safe for consumption.",
        1: "⚠️ Acceptable water quality. Monitor regularly.",
    }
    if confidence < 0.7:
        recommendation = "Low confidence prediction. Consider additional testing."
    else:
        recommendation = advice_by_class.get(
            pred_class,
            "❌ Poor water quality. Treatment required before consumption.",
        )

    probabilities_by_label = {
        QUALITY_LABELS[i]: float(class_probabilities[i]) for i in range(4)
    }
    return {
        'quality_class': int(pred_class),
        'quality_label': QUALITY_LABELS[pred_class],
        'confidence': float(confidence),
        'probabilities': probabilities_by_label,
        'recommendation': recommendation
    }

# Test the prediction function on one hand-picked sample per quality level
print("=== Testing Prediction Function ===")

# (TDS, turbidity, pH, description)
test_cases = [
    (250, 0.8, 7.2, "Excellent quality water"),
    (450, 2.5, 7.0, "Good quality water"),
    (800, 5.0, 6.8, "Acceptable water"),
    (1500, 15.0, 5.5, "Poor quality water"),
]

for tds, turbidity, ph, description in test_cases:
    result = predict_water_quality(tds, turbidity, ph, model, scaler)

    print(f"\n{description}:")
    print(f"Input: TDS={tds}, Turbidity={turbidity}, pH={ph}")

    # Rejected inputs carry only an "error" key
    if "error" in result:
        print(f"Error: {result['error']}")
        continue

    print(f"Predicted: {result['quality_label']} (Confidence: {result['confidence']:.1%})")
    print(f"Recommendation: {result['recommendation']}")

## 9. Visualize Results and Feature Importance

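Visualize the model's prediction confidence on the test set and show how the predicted class changes as each feature is varied while the other two are held at fixed default values. Note that this is a one-at-a-time sensitivity sweep, not a formal feature-importance measure.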

In [None]:
# Prediction confidence analysis: the confidence of a prediction is the
# probability assigned to its predicted class
all_pred_proba = model.predict(X_test_scaled)
all_pred_classes = all_pred_proba.argmax(axis=1)
confidence_scores = all_pred_proba.max(axis=1)

# Plot confidence distribution by predicted quality class
plt.figure(figsize=(12, 8))

class_names = ['Poor', 'Acceptable', 'Good', 'Excellent']
for class_id, (quality, color) in enumerate(zip(class_names, colors)):
    confidences = confidence_scores[all_pred_classes == class_id]
    plt.hist(confidences, alpha=0.7, label=f'{quality} (n={len(confidences)})',
             bins=20, color=color)

plt.xlabel('Prediction Confidence')
plt.ylabel('Frequency')
plt.title('Prediction Confidence Distribution by Quality Class')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

average_confidence = np.mean(confidence_scores)
high_confidence_pct = np.sum(confidence_scores > 0.9)/len(confidence_scores)*100
print(f"Average confidence: {average_confidence:.4f}")
print(f"Predictions with >90% confidence: {high_confidence_pct:.1f}%")

In [None]:
# Feature impact visualization
# Create a grid to show how each feature affects predictions

def visualize_feature_impact(feature_name, feature_idx, model, scaler):
    """Sweep one feature over a fixed range and record the predicted class.

    The other two features are held at the defaults TDS=300, turbidity=2.0,
    pH=7.0. All sweep samples are scaled and predicted in a single batch
    instead of one ``model.predict`` call per value.

    Args:
        feature_name: 'tds', 'turbidity' or 'ph'; selects the sweep range.
        feature_idx: Column of that feature in the model input
            (0 = TDS, 1 = turbidity, 2 = pH).
        model: Trained model; ``predict`` must return one row of class
            probabilities per input row.
        scaler: Fitted scaler applied to the raw samples before prediction.

    Returns:
        tuple: ``(feature_range, predictions)`` where ``feature_range`` is
        the array of 100 swept values and ``predictions`` is a list with the
        predicted class id for each value.

    Raises:
        KeyError: If ``feature_name`` is not one of the known features.
    """
    # Set default values (typical good water)
    default_values = [300, 2.0, 7.0]  # TDS, Turbidity, pH

    # Define feature ranges
    feature_ranges = {
        'tds': np.linspace(100, 1500, 100),
        'turbidity': np.linspace(0.1, 20, 100),
        'ph': np.linspace(5.0, 9.0, 100)
    }

    feature_range = feature_ranges[feature_name]

    # One row per swept value: start from the defaults everywhere, then
    # overwrite the column under study.
    samples = np.tile(np.asarray(default_values, dtype=float),
                      (len(feature_range), 1))
    samples[:, feature_idx] = feature_range

    # Single batched prediction for the whole sweep
    samples_scaled = scaler.transform(samples)
    pred_proba = model.predict(samples_scaled, verbose=0)
    predictions = list(np.argmax(pred_proba, axis=1))

    return feature_range, predictions

# Visualize impact of each feature: one scatter panel per feature, showing
# the predicted class for every swept value
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
features = ['tds', 'turbidity', 'ph']
feature_labels = ['TDS (mg/L)', 'Turbidity (NTU)', 'pH Level']

# Predicted class id -> marker color (same for every panel)
colors_map = {0: 'red', 1: 'orange', 2: 'lightblue', 3: 'green'}
class_ticks = [0, 1, 2, 3]
class_tick_labels = ['Poor', 'Acceptable', 'Good', 'Excellent']

for i, (feature, label) in enumerate(zip(features, feature_labels)):
    panel = axes[i]
    feature_range, predictions = visualize_feature_impact(feature, i, model, scaler)
    pred_colors = list(map(colors_map.__getitem__, predictions))

    panel.scatter(feature_range, predictions, c=pred_colors, alpha=0.7, s=20)
    panel.set_xlabel(label)
    panel.set_ylabel('Predicted Quality Class')
    panel.set_title(f'Impact of {label} on Water Quality')
    panel.set_yticks(class_ticks)
    panel.set_yticklabels(class_tick_labels)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Model performance summary
def generate_model_summary(model, X_test, y_test, y_pred, history):
    """Print a performance summary for the trained classifier.

    Args:
        model: Trained Keras model.
        X_test: Scaled test features passed to ``model.evaluate``.
        y_test: Integer class labels of the test set.
        y_pred: Integer class predictions for the test set.
        history: History object returned by ``model.fit``.

    Returns:
        None. The summary is written to stdout.
    """
    num_classes = len(QUALITY_LABELS)
    class_ids = list(range(num_classes))

    print("="*60)
    print("            WATER QUALITY PREDICTION MODEL SUMMARY")
    print("="*60)

    # Model architecture info
    total_params = model.count_params()
    # Count every trainable weight array (kernels, biases, BatchNorm
    # gamma/beta), not just the first array of each layer.
    trainable_params = sum(int(np.prod(tuple(weight.shape)))
                           for weight in model.trainable_weights)

    print(f"\n📊 MODEL ARCHITECTURE:")
    print(f"   • Total parameters: {total_params:,}")
    print(f"   • Trainable parameters: {trainable_params:,}")
    print(f"   • Number of layers: {len(model.layers)}")

    # Performance metrics; the one-hot width is pinned so it matches the
    # model output even if the highest class is absent from y_test
    test_loss, test_acc = model.evaluate(
        X_test, to_categorical(y_test, num_classes=num_classes), verbose=0
    )

    print(f"\n🎯 PERFORMANCE METRICS:")
    print(f"   • Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"   • Test Loss: {test_loss:.4f}")

    # Training info
    epochs_trained = len(history.history['loss'])
    best_val_acc = max(history.history['val_accuracy'])

    print(f"\n🏃 TRAINING INFO:")
    print(f"   • Epochs trained: {epochs_trained}")
    print(f"   • Best validation accuracy: {best_val_acc:.4f}")

    # Quality-specific performance: correct predictions of a class divided by
    # the actual samples of that class. Labels are pinned so the matrix
    # always has one row per class.
    print(f"\n💧 QUALITY-SPECIFIC PERFORMANCE:")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    for i in class_ids:
        class_samples = cm[i, :].sum()
        class_acc = cm[i, i] / class_samples if class_samples > 0 else 0
        print(f"   • {QUALITY_LABELS[i]}: {class_acc:.4f} accuracy ({class_samples} samples)")

    print("\n" + "="*60)

generate_model_summary(model, X_test_scaled, y_test, y_test_pred, history)

In [None]:
# Interactive prediction demonstration
print("🔬 INTERACTIVE WATER QUALITY TESTING")
print("="*50)

# Demo samples representing different scenarios:
# (TDS, turbidity, pH, "<emoji> <plain description>")
demo_samples = [
    (200, 0.5, 7.3, "🏔️ Mountain spring water"),
    (350, 1.8, 7.1, "🏠 Filtered tap water"),
    (650, 4.2, 6.9, "🚰 City tap water"),
    (1100, 8.5, 6.2, "🌊 Well water"),
    (1800, 18.0, 5.2, "🏭 Contaminated source")
]

# Marker shown next to each predicted quality label
quality_emoji = {'Poor': '🔴', 'Acceptable': '🟡', 'Good': '🔵', 'Excellent': '🟢'}

# Rows for the summary table (a list of dicts despite the name)
results_df = []

for tds, turbidity, ph, description in demo_samples:
    result = predict_water_quality(tds, turbidity, ph, model, scaler)

    print(f"\n{description}")
    print(f"📈 Sensors: TDS={tds} mg/L, Turbidity={turbidity} NTU, pH={ph}")

    if "error" not in result:
        emoji = quality_emoji[result['quality_label']]

        print(f"{emoji} Quality: {result['quality_label']} (Confidence: {result['confidence']:.1%})")
        print(f"💡 {result['recommendation']}")

        # Drop the leading emoji for the table: everything after the first
        # space is the plain description. Fall back to the full text when
        # there is no space to split on.
        _, _, plain_description = description.partition(' ')

        # Store for summary table
        results_df.append({
            'Description': plain_description.strip() or description,
            'TDS': tds,
            'Turbidity': turbidity,
            'pH': ph,
            'Predicted_Quality': result['quality_label'],
            'Confidence': f"{result['confidence']:.1%}"
        })
    else:
        print(f"❌ Error: {result['error']}")

# Create summary table
results_summary = pd.DataFrame(results_df)
print("\n" + "="*50)
print("📋 PREDICTION SUMMARY TABLE")
print("="*50)
print(results_summary.to_string(index=False))

In [None]:
# Save the trained model and preprocessor
print("💾 Saving trained model and preprocessor...")

# Make sure the output directory exists
import os
os.makedirs('../models', exist_ok=True)

# TensorFlow model
model.save('../models/water_quality_model.h5')
print("✅ Model saved to ../models/water_quality_model.h5")

# Preprocessor (the fitted scaler)
import joblib
joblib.dump(scaler, '../models/water_quality_model_preprocessor.pkl')
print("✅ Preprocessor saved to ../models/water_quality_model_preprocessor.pkl")

# Metadata: feature order, label mapping, version, training size and test accuracy
feature_info = dict(
    feature_names=feature_columns,
    quality_labels=QUALITY_LABELS,
    model_version='1.0',
    training_samples=len(X_train),
    test_accuracy=float(test_accuracy),
)

import json
with open('../models/model_info.json', 'w') as info_file:
    json.dump(feature_info, info_file, indent=2)
print("✅ Model info saved to ../models/model_info.json")

print("\n🎉 Model training and saving completed successfully!")
print("\n📝 Next steps:")
next_steps = (
    "   1. Use the saved model for real-time predictions",
    "   2. Deploy the model in a production environment",
    "   3. Monitor model performance with new data",
    "   4. Retrain periodically with updated datasets",
)
for step in next_steps:
    print(step)