# Air Filter Performance Analysis

This notebook provides a comprehensive analysis of air filter performance data across different filter types and locations. It covers:
1. Data Cleaning and Preprocessing
2. Basic Data Analysis
3. Performance Analysis
4. PM Removal Efficiency
5. Cost-Effectiveness Analysis
6. Recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old 'seaborn' alias, so select whichever
# name this installation actually provides instead of failing on import.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

# Suppress warnings
# NOTE: this silences every warning category for the rest of the session,
# including pandas/numpy diagnostics raised by the cells below.
import warnings
warnings.filterwarnings('ignore')

## 1. Data Cleaning and Preprocessing

In [None]:
# Read the original data
df = pd.read_csv('air_filter_data.csv')
print("Original data shape:", df.shape)

# Columns screened for outliers
numerical_cols = ['filter_age_days', 'load_factor', 'pressure_drop_pa', 'efficiency',
                  'inlet_pm25', 'outlet_pm25', 'inlet_pm10', 'outlet_pm10']


def remove_iqr_outliers(data, columns, k=1.5):
    """Drop rows whose value in any of ``columns`` lies outside the IQR fences.

    Columns are processed one after another, so the quartiles used for each
    column are computed on the rows that survived the preceding columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Numeric columns to screen.
    k : float, default 1.5
        Fence multiplier: rows are kept when the value lies within
        ``[Q1 - k * IQR, Q3 + k * IQR]`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, so that later column
        assignments do not write into a slice of ``data``.
    """
    kept = data
    for col in columns:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        spread = q3 - q1
        kept = kept[kept[col].between(q1 - k * spread, q3 + k * spread)]
    return kept.copy()


# Remove NA values, then remove outliers using the IQR method
df_clean = remove_iqr_outliers(df.dropna(), numerical_cols)

print("\nCleaned data shape:", df_clean.shape)
print("\nRows removed:", df.shape[0] - df_clean.shape[0])

## 2. Basic Data Analysis

In [None]:
# Numeric overview of the cleaned data set
print("Summary Statistics:")
summary_table = df_clean.describe()
display(summary_table)

# Box plot: spread of efficiency within each filter class
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_clean, x='filter_class', y='efficiency', ax=ax)
ax.set_title('Efficiency Distribution by Filter Type')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 3. Performance Analysis

In [None]:
# Calculate efficiency decline rates
def calculate_efficiency_decline(data, filter_type):
    """Return the linear trend of efficiency over filter age, scaled by 100.

    A first-degree least-squares fit of ``efficiency`` against
    ``filter_age_days`` is computed for the rows whose ``filter_class``
    equals ``filter_type``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``filter_class``, ``filter_age_days`` and ``efficiency``.
    filter_type : object
        Value of ``filter_class`` selecting the rows to fit.

    Returns
    -------
    float
        Fitted slope multiplied by 100. The value is negative when
        efficiency falls with age. ``nan`` is returned when the subset has
        fewer than two distinct ages, because a slope is undefined there.
    """
    subset = data.loc[data['filter_class'] == filter_type,
                      ['filter_age_days', 'efficiency']].dropna()
    ages = subset['filter_age_days'].to_numpy(dtype=float)
    efficiencies = subset['efficiency'].to_numpy(dtype=float)

    # np.polyfit raises on empty input and is rank-deficient when every
    # sample shares the same age; report "no trend available" instead.
    if np.unique(ages).size < 2:
        return float('nan')

    slope, _intercept = np.polyfit(ages, efficiencies, 1)
    return slope * 100  # Convert to percentage

# Map every filter class present in the cleaned data to its fitted decline rate
decline_rates = {
    class_name: calculate_efficiency_decline(df_clean, class_name)
    for class_name in df_clean['filter_class'].unique()
}

# Scatter the raw observations per filter class and overlay a fitted line
fig, ax = plt.subplots(figsize=(12, 6))
for class_name in df_clean['filter_class'].unique():
    subset = df_clean.loc[df_clean['filter_class'] == class_name]
    ages = subset['filter_age_days']
    ax.scatter(ages, subset['efficiency'], alpha=0.1, label=f'{class_name} (data)')

    # First-degree least-squares fit, evaluated on 100 evenly spaced ages
    trend = np.poly1d(np.polyfit(ages, subset['efficiency'], 1))
    age_grid = np.linspace(ages.min(), ages.max(), 100)
    ax.plot(age_grid, trend(age_grid), '--', label=f'{class_name} (trend)')

ax.set_xlabel('Filter Age (days)')
ax.set_ylabel('Efficiency')
ax.set_title('Filter Efficiency Trends Over Time')
ax.legend()
fig.tight_layout()
plt.show()

# Report the fitted rate for each filter class, one per line
report_lines = ["\nEfficiency Decline Rates (% per day):"]
report_lines.extend(f"{name}: {slope:.3f}%" for name, slope in decline_rates.items())
print("\n".join(report_lines))

## 4. PM Removal Efficiency

In [None]:
# Calculate PM removal efficiency
def _removal_percent(inlet, outlet):
    """Return the percentage of the inlet concentration removed by the filter.

    Rows with a zero inlet reading yield NaN rather than +/-inf, so that
    they are skipped by the group means below instead of dominating them.
    """
    nonzero_inlet = inlet.where(inlet != 0)
    return (inlet - outlet) / nonzero_inlet * 100


# ``assign`` rebinds ``df_clean`` to a new frame carrying the two extra
# columns; this keeps the cell re-runnable and avoids writing into a slice.
df_clean = df_clean.assign(
    pm25_removal=_removal_percent(df_clean['inlet_pm25'], df_clean['outlet_pm25']),
    pm10_removal=_removal_percent(df_clean['inlet_pm10'], df_clean['outlet_pm10']),
)

# Mean removal efficiency per filter class, one column per particle size
removal_data = (
    df_clean.groupby('filter_class')[['pm25_removal', 'pm10_removal']]
    .mean()
    .rename(columns={'pm25_removal': 'PM2.5', 'pm10_removal': 'PM10'})
)

# Plot PM removal efficiency.
# DataFrame.plot opens its own figure unless it is handed an Axes, which
# would leave a blank 12x6 figure behind; draw on an explicit Axes instead.
fig, ax = plt.subplots(figsize=(12, 6))
removal_data.plot(kind='bar', ax=ax, rot=45)
ax.set_title('PM Removal Efficiency by Filter Type')
ax.set_ylabel('Removal Efficiency (%)')
ax.legend(title='Particle Size')
fig.tight_layout()
plt.show()

## 5. Cost-Effectiveness Analysis

In [None]:
# Define relative costs
# Keyed by the `filter_class` labels: the cost-metrics loop below looks up
# `filter_costs[filter_type]` for every class present in the cleaned data,
# so a class missing from this mapping raises KeyError there.
# NOTE(review): units are not stated; presumably a unitless relative index — confirm.
filter_costs = {
    'HEPA': 100,
    'Activated Carbon': 80,
    'Electrostatic': 70,
    'Pre-Filter': 30
}

# Calculate optimal replacement age
def calculate_optimal_replacement(data, filter_type, efficiency_threshold=0.95):
    """Return the youngest filter age at which efficiency is below the cutoff.

    The cutoff is ``efficiency_threshold`` times the highest efficiency
    observed for ``filter_type``. If no observation of that filter class
    falls below the cutoff, the oldest observed age is returned instead.
    """
    subset = data.loc[data['filter_class'] == filter_type]
    cutoff = subset['efficiency'].max() * efficiency_threshold

    degraded_ages = subset.loc[subset['efficiency'] < cutoff, 'filter_age_days']
    if degraded_ages.empty:
        return subset['filter_age_days'].max()
    return degraded_ages.min()

# Build the per-class cost summary: replacement age, cost per day of service,
# and that daily cost divided by the class's mean efficiency
cost_metrics = {}
for class_name in df_clean['filter_class'].unique():
    replacement_age = calculate_optimal_replacement(df_clean, class_name)
    daily_cost = filter_costs[class_name] / replacement_age

    class_rows = df_clean['filter_class'] == class_name
    mean_efficiency = df_clean[class_rows]['efficiency'].mean()

    cost_metrics[class_name] = {
        'optimal_age': replacement_age,
        'cost_per_day': daily_cost,
        'cost_per_efficiency': daily_cost / mean_efficiency,
    }

# Plot cost comparison
# One row per filter class (in the order the metrics were computed), one
# column per cost metric.
cost_df = pd.DataFrame(
    {
        'Cost per Day': [m['cost_per_day'] for m in cost_metrics.values()],
        'Cost per Efficiency': [m['cost_per_efficiency'] for m in cost_metrics.values()],
    },
    index=list(cost_metrics),
)

# DataFrame.plot opens its own figure unless it is handed an Axes, which
# would leave a blank 10x6 figure behind; draw on an explicit Axes instead.
fig, ax = plt.subplots(figsize=(10, 6))
cost_df.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Cost-Effectiveness Comparison')
ax.set_ylabel('Relative cost')
fig.tight_layout()
plt.show()

# Text report of the cost summary, one paragraph per filter class
print("\nCost Analysis Results:")
for class_name, summary in cost_metrics.items():
    age = summary['optimal_age']
    daily = summary['cost_per_day']
    per_efficiency = summary['cost_per_efficiency']
    print(
        f"\n{class_name}:",
        f"Optimal replacement age: {age:.1f} days",
        f"Cost per day: {daily:.2f}",
        f"Cost per efficiency unit: {per_efficiency:.2f}",
        sep="\n",
    )

## 6. Recommendations

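Based on our analysis, here are the key recommendations. Note that the replacement intervals listed below are static text and are not recomputed by the notebook; check them against the optimal replacement ages printed in Section 5 whenever the input data changes.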

1. **For Critical Applications (Hospitals)**:
   - Use HEPA filters
   - Replace every 32 days
   - Highest efficiency but also highest cost

2. **For Commercial Spaces (Shopping Malls)**:
   - Use Electrostatic filters
   - Replace every 28 days
   - Good balance of cost and performance

3. **For Industrial Settings**:
   - Use Pre-Filters
   - Replace every 15 days
   - Most cost-effective solution

4. **For Specialized Applications**:
   - Use Activated Carbon filters
   - Replace every 34 days
   - Best stability and good overall performance