# Air Filter Performance Analysis using Machine Learning

This notebook analyzes air filter performance using machine learning: a separate random forest regression model is trained for each filter class. We'll analyze different types of filters, predict their efficiency, and provide maintenance recommendations.

## Table of Contents
1. Data Loading and Preprocessing
2. Exploratory Data Analysis
3. Feature Engineering
4. Model Development
5. Model Evaluation
6. Performance Predictions
7. Maintenance Recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases removed the old names, so use whichever name is available here.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

## 1. Data Loading and Preprocessing

In [None]:
# Load the data
print("Loading data...")
df = pd.read_csv('cleaned_air_filter_data.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Display basic information
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

## 2. Exploratory Data Analysis

In [None]:
# Box plot: spread of efficiency within each filter class
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='filter_class', y='efficiency', ax=ax)
ax.set_title('Efficiency Distribution by Filter Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Scatter plot: efficiency against filter age, one series per filter class
fig, ax = plt.subplots(figsize=(12, 6))
for filter_type in df['filter_class'].unique():
    class_mask = df['filter_class'] == filter_type
    filter_data = df[class_mask]
    ax.scatter(
        filter_data['filter_age_days'],
        filter_data['efficiency'],
        alpha=0.5,
        label=filter_type,
    )
ax.set_xlabel('Filter Age (days)')
ax.set_ylabel('Efficiency')
ax.set_title('Efficiency vs Filter Age')
ax.legend()
plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` extended with calendar and ratio columns.

    The input frame is not modified. The copy gains, in this order:

    - ``hour``, ``day_of_week``, ``month``: taken from ``timestamp``.
    - ``pm_ratio``: ``inlet_pm25 / inlet_pm10`` where ``inlet_pm10 > 0``, else 0.
    - ``pressure_efficiency_ratio``: ``efficiency / pressure_drop_pa`` where
      ``pressure_drop_pa > 0``, else 0.
    - ``load_age_ratio``: ``load_factor / filter_age_days`` where
      ``filter_age_days > 0``, else ``load_factor`` itself.

    Each ratio column is then clipped to its own 1st-99th percentile range.
    """
    enriched = df.copy()

    # Calendar components of the observation time
    when = enriched['timestamp'].dt
    for name, values in (
        ('hour', when.hour),
        ('day_of_week', when.dayofweek),
        ('month', when.month),
    ):
        enriched[name] = values

    # (new column, numerator column, denominator column, value used when the
    # denominator is not strictly positive)
    ratio_specs = (
        ('pm_ratio', 'inlet_pm25', 'inlet_pm10', 0),
        ('pressure_efficiency_ratio', 'efficiency', 'pressure_drop_pa', 0),
        ('load_age_ratio', 'load_factor', 'filter_age_days', enriched['load_factor']),
    )
    for name, numerator, denominator, fallback in ratio_specs:
        bottom = enriched[denominator]
        enriched[name] = np.where(bottom > 0, enriched[numerator] / bottom, fallback)

    # Limit the influence of extreme ratios by clipping to the 1st/99th percentiles
    for name, _, _, _ in ratio_specs:
        lower_bound = enriched[name].quantile(0.01)
        upper_bound = enriched[name].quantile(0.99)
        enriched[name] = enriched[name].clip(lower_bound, upper_bound)

    return enriched

# Create enhanced features
df_enhanced = create_features(df)

# Define features for modeling.
# 'pressure_efficiency_ratio' is deliberately NOT used as a model input: it is
# computed as efficiency / pressure_drop_pa, and pressure_drop_pa is itself a
# feature, so including it would let the model reconstruct the target
# (efficiency) directly from its inputs (target leakage). The column is still
# present in df_enhanced for descriptive analysis.
features = [
    'filter_age_days', 'load_factor', 'pressure_drop_pa',
    'inlet_pm25', 'inlet_pm10', 'hour', 'day_of_week', 'month',
    'pm_ratio', 'load_age_ratio'
]

# Fail early, with a clear message, if a modeling column is missing
missing_features = [name for name in features if name not in df_enhanced.columns]
assert not missing_features, f"missing expected columns: {missing_features}"

print("Enhanced features created:")
print(features)

## 4. Model Development

In [None]:
def train_and_evaluate_model(X, y, model_name="Random Forest"):
    """Fit a random forest on a hold-out split and score it on the held-out part.

    The data is split once into 80% training / 20% test rows (fixed seed 42);
    no cross-validation is performed here.

    Parameters:
        X: feature matrix.
        y: target values aligned row-by-row with ``X``.
        model_name: label for the model; not used by the function body.

    Returns:
        A tuple ``(model, metrics, (X_test, y_test, y_pred))`` where ``model``
        is the fitted ``RandomForestRegressor``, ``metrics`` is a dict with the
        keys ``'R²'``, ``'RMSE'`` and ``'MAE'`` computed on the test rows, and
        the inner tuple holds the test features, test targets and predictions.
    """
    # Hold out 20% of the rows for evaluation
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the forest on the training rows only
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_fit, y_fit)

    # Score the predictions for the held-out rows
    holdout_pred = forest.predict(X_holdout)
    scores = {}
    scores['R²'] = r2_score(y_holdout, holdout_pred)
    scores['RMSE'] = np.sqrt(mean_squared_error(y_holdout, holdout_pred))
    scores['MAE'] = mean_absolute_error(y_holdout, holdout_pred)

    return forest, scores, (X_holdout, y_holdout, holdout_pred)

# Fit one model per filter class and keep everything later cells need
results = {}
for filter_type in df['filter_class'].unique():
    print(f"\nTraining model for {filter_type}...")

    # Rows belonging to this filter class
    class_mask = df_enhanced['filter_class'] == filter_type
    filter_data = df_enhanced[class_mask].copy()

    # Model inputs and target
    X, y = filter_data[features], filter_data['efficiency']

    # Scale the inputs and restore the column names on the scaled array
    scaler = RobustScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)

    # Fit, score, and store the artefacts under the filter class name
    model, metrics, test_data = train_and_evaluate_model(X_scaled, y)
    results[filter_type] = dict(
        model=model,
        metrics=metrics,
        test_data=test_data,
        scaler=scaler,
    )

    print("Model Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

## 5. Model Evaluation

In [None]:
def _panel_grid(n_panels, n_cols=2, panel_size=7.5):
    """Create a subplot grid with at least ``n_panels`` axes.

    The number of rows is derived from ``n_panels`` so the grid no longer
    assumes exactly four filter classes (four panels still give a 2x2 grid of
    15x15 inches). Axes beyond ``n_panels`` are hidden.

    Returns the figure and a flat array of axes.
    """
    n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row
    fig, grid = plt.subplots(
        n_rows, n_cols,
        figsize=(panel_size * n_cols, panel_size * n_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    flat_axes = grid.ravel()
    for spare_ax in flat_axes[n_panels:]:
        spare_ax.set_visible(False)
    return fig, flat_axes


# Plot actual vs predicted values for each filter type
fig, axes = _panel_grid(len(results))

for ax, (filter_type, result) in zip(axes, results.items()):
    X_test, y_test, y_pred = result['test_data']

    ax.scatter(y_test, y_pred, alpha=0.5)
    # Dashed diagonal marks a perfect prediction (predicted == actual)
    diagonal = [y_test.min(), y_test.max()]
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_xlabel('Actual Efficiency')
    ax.set_ylabel('Predicted Efficiency')
    ax.set_title(f'{filter_type}')

plt.tight_layout()
plt.show()

# Plot feature importance for each filter type
fig, axes = _panel_grid(len(results))

for ax, (filter_type, result) in zip(axes, results.items()):
    importance = pd.DataFrame({
        'feature': features,
        'importance': result['model'].feature_importances_
    }).sort_values('importance', ascending=True)

    ax.barh(importance['feature'], importance['importance'])
    ax.set_title(f'{filter_type} - Feature Importance')

plt.tight_layout()
plt.show()

## 6. Performance Predictions

In [None]:
def predict_efficiency_over_time(model, scaler, filter_data, max_age=60):
    """Predict filter efficiency for ages ``0 .. max_age - 1`` days.

    A synthetic frame is built in which ``filter_age_days`` is swept over the
    age range while the other inputs are held at the medians observed in
    ``filter_data`` (time features are fixed at hour 12, day_of_week 3,
    month 6). ``load_age_ratio`` depends on age, so it is recomputed for every
    age from the median load (``load / age`` for ``age > 0``, else ``load``)
    and clipped to the range observed in ``filter_data`` instead of being
    frozen at its median.

    Parameters:
        model: fitted estimator exposing ``predict``.
        scaler: fitted transformer exposing ``transform``. If it exposes
            ``feature_names_in_``, those names determine which columns are
            used and in which order; otherwise the notebook-level ``features``
            list is used.
        filter_data: frame with the columns ``load_factor``,
            ``pressure_drop_pa``, ``inlet_pm25``, ``inlet_pm10``, ``pm_ratio``,
            ``pressure_efficiency_ratio`` and ``load_age_ratio``.
        max_age: number of days to simulate (exclusive upper bound on age).

    Returns:
        ``(test_ages, predicted_efficiencies)``: the simulated ages and the
        model's prediction for each of them. Both are empty when
        ``max_age <= 0``.
    """
    test_ages = np.arange(0, max_age)
    if test_ages.size == 0:
        # Nothing to simulate; avoid handing an empty frame to the scaler
        return test_ages, np.empty(0)

    median_load = filter_data['load_factor'].median()

    # Keep the age-dependent ratio consistent with the age being simulated.
    # np.maximum guards the division at age 0, where the ratio is defined as
    # the load itself.
    raw_ratio = np.where(test_ages > 0,
                         median_load / np.maximum(test_ages, 1),
                         median_load)
    observed_ratio = filter_data['load_age_ratio']
    # Stay inside the range of ratios present in filter_data; pandas ignores a
    # NaN bound, so an all-missing column simply means no clipping.
    load_age_ratio = (
        pd.Series(raw_ratio, dtype=float)
        .clip(observed_ratio.min(), observed_ratio.max())
        .to_numpy()
    )

    # Scalars are broadcast to the length of the age array
    test_data = pd.DataFrame({
        'filter_age_days': test_ages,
        'load_factor': median_load,
        'pressure_drop_pa': filter_data['pressure_drop_pa'].median(),
        'inlet_pm25': filter_data['inlet_pm25'].median(),
        'inlet_pm10': filter_data['inlet_pm10'].median(),
        'hour': 12,
        'day_of_week': 3,
        'month': 6,
        'pm_ratio': filter_data['pm_ratio'].median(),
        'pressure_efficiency_ratio': filter_data['pressure_efficiency_ratio'].median(),
        'load_age_ratio': load_age_ratio
    })

    # Use exactly the columns (and order) the scaler was fitted with when it
    # recorded them; fall back to the notebook-level feature list otherwise.
    feature_order = getattr(scaler, 'feature_names_in_', None)
    if feature_order is None:
        feature_order = features
    feature_order = list(feature_order)

    # Scale features
    test_data_scaled = scaler.transform(test_data[feature_order])

    # A model fitted on a named frame expects names at predict time as well
    if hasattr(model, 'feature_names_in_'):
        test_data_scaled = pd.DataFrame(test_data_scaled, columns=feature_order)

    # Predict efficiencies
    predicted_efficiencies = model.predict(test_data_scaled)

    return test_ages, predicted_efficiencies

# One predicted efficiency-vs-age curve per filter class on a shared axis
fig, ax = plt.subplots(figsize=(12, 6))

for filter_type, result in results.items():
    class_mask = df_enhanced['filter_class'] == filter_type
    filter_data = df_enhanced[class_mask]
    ages, efficiencies = predict_efficiency_over_time(
        result['model'], result['scaler'], filter_data
    )
    ax.plot(ages, efficiencies, label=filter_type)

# Horizontal reference line at an efficiency of 0.95
ax.axhline(y=0.95, color='r', linestyle='--', label='95% Efficiency Threshold')
ax.set_xlabel('Filter Age (days)')
ax.set_ylabel('Predicted Efficiency')
ax.set_title('Predicted Efficiency Over Time')
ax.legend()
ax.grid(True)
plt.show()

## 7. Maintenance Recommendations

In [None]:
print("Maintenance Recommendations:")
print("==========================")

for filter_type, result in results.items():
    filter_data = df_enhanced[df_enhanced['filter_class'] == filter_type]
    ages, efficiencies = predict_efficiency_over_time(result['model'],
                                                     result['scaler'],
                                                     filter_data)

    # Replacement point: first simulated age at which the predicted efficiency
    # falls below 95% of the predicted efficiency at age 0
    initial_efficiency = efficiencies[0]
    threshold = initial_efficiency * 0.95
    below_threshold = np.flatnonzero(efficiencies < threshold)

    print(f"\n{filter_type}:")
    print(f"Initial Efficiency: {initial_efficiency:.3f}")

    if below_threshold.size:
        # Index by position in the prediction arrays rather than by the age
        # value, so the lookup stays valid whatever ages were simulated
        replacement_pos = below_threshold[0]
        replacement_age = ages[replacement_pos]
        print(f"Recommended Replacement Age: {replacement_age} days")
        print(f"Efficiency at Replacement: {efficiencies[replacement_pos]:.3f}")
    else:
        # The prediction never crosses the threshold inside the simulated
        # horizon. Report that explicitly instead of indexing one element past
        # the end of the prediction array (which raised IndexError).
        replacement_age = len(ages)
        print(f"Recommended Replacement Age: beyond {ages[-1]} days "
              "(predicted efficiency stays above 95% of initial)")
        print(f"Efficiency at Day {ages[-1]}: {efficiencies[-1]:.3f}")

    # Additional recommendations: the three features the model relies on most
    importance = pd.DataFrame({
        'feature': features,
        'importance': result['model'].feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nKey factors affecting performance:")
    for _, row in importance.head(3).iterrows():
        print(f"- {row['feature']}: {row['importance']:.3f}")