In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


In [13]:
import os


In [14]:

# Location of the station-level daily CSV on the local machine.
filepath = r'C:/Users/madhu/Downloads/archive/station_day.csv'

# Read it into the working station frame.
station_df = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
# Read the two hourly CSVs (station-level first, then city-level).
station_hour, city_hour = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:/Users/madhu/Downloads/archive/station_hour.csv',
        r'C:/Users/madhu/Downloads/archive/city_hour.csv',
    )
)

In [None]:
# Load the two daily datasets that every later cell works on.
#
# station_df has to come from station_day.csv: the following cells index it by
# a 'Date' column and plot it next to the daily city frame. Reading the hourly
# station file into this name would replace the daily frame loaded earlier.
# NOTE(review): the hourly export is presumed to expose 'Datetime' rather than
# 'Date' - confirm against the CSV header.
station_df = pd.read_csv(r'C:/Users/madhu/Downloads/archive/station_day.csv')
city_day_df = pd.read_csv(r'C:/Users/madhu/Downloads/archive/city_day.csv')


In [None]:
# Report how many values are missing in each column before imputing.
print(station_df.isnull().sum())
print(city_day_df.isnull().sum())

# Fill gaps in the numeric columns with the mean of that column.
# numeric_only=True is required: on pandas >= 2.0, DataFrame.mean() raises a
# TypeError as soon as the frame contains text columns. Non-numeric columns
# are left untouched by this step.
# Rebinding (instead of inplace=True) keeps the cell a plain expression that
# yields the same result when re-run.
station_df = station_df.fillna(station_df.mean(numeric_only=True))
city_day_df = city_day_df.fillna(city_day_df.mean(numeric_only=True))

In [None]:
# Parse the 'Date' column of both frames into datetime values
# (station frame first, then the city frame).
for frame in (station_df, city_day_df):
    frame['Date'] = pd.to_datetime(frame['Date'])


In [None]:
# Inner-join the station rows with the city rows that share the same 'Date'.
# NOTE(review): if either frame holds several rows per date (e.g. one per
# station / city) this is a many-to-many join - confirm that is intended.
merged_df = station_df.merge(city_day_df, how='inner', on='Date')


In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
station_df.info()
city_day_df.info()

# Summary statistics of the numeric columns.
print(station_df.describe())
print(city_day_df.describe())


In [None]:
# Histogram + KDE for one pollutant per frame, each shown as its own figure.
for frame, column, title in (
    (station_df, 'PM2.5', 'PM2.5 Distribution'),
    (city_day_df, 'PM10', 'PM10 Distribution'),
):
    sns.histplot(frame[column], kde=True, bins=30)
    plt.title(title)
    plt.show()


In [None]:
# Plot AQI trends over time.
# Each frame is first reduced to one mean AQI value per date. Plotting the raw
# rows as a line connects consecutive rows in file order, which zig-zags back
# and forth in time whenever several rows share a date.
# NOTE(review): assumes the frames may hold several rows per date (one per
# station / city); with unique dates the aggregation is a no-op apart from
# sorting by date.
station_daily_aqi = station_df.groupby('Date')['AQI'].mean()
city_daily_aqi = city_day_df.groupby('Date')['AQI'].mean()

plt.figure(figsize=(10, 5))
plt.plot(station_daily_aqi.index, station_daily_aqi, label='Station AQI (daily mean)', alpha=0.7)
plt.plot(city_daily_aqi.index, city_daily_aqi, label='City AQI (daily mean)', alpha=0.7)
plt.legend()
plt.title('AQI Trends Over Time')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.show()


In [None]:
# Correlation heatmaps, one figure per dataset.
# numeric_only=True restricts the matrix to numeric columns; on pandas >= 2.0
# DataFrame.corr() raises when the frame contains text columns.
for frame, title in (
    (station_df, 'Correlation Heatmap (Station)'),
    (city_day_df, 'Correlation Heatmap (City)'),
):
    plt.figure(figsize=(12, 8))
    sns.heatmap(frame.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(title)
    plt.show()


In [None]:
# Pollutant concentration against AQI, one scatter figure per frame.
for frame, pollutant, title in (
    (station_df, 'PM2.5', 'PM2.5 vs AQI'),
    (city_day_df, 'PM10', 'PM10 vs AQI'),
):
    sns.scatterplot(data=frame, x=pollutant, y='AQI', alpha=0.7)
    plt.title(title)
    plt.show()


In [None]:
# Boxplot of AQI by calendar month.
# No cell creates a 'Month' column after the frames are loaded, so it is
# derived here from 'Date'. pd.to_datetime is a no-op on an already-parsed
# column and keeps this cell working if 'Date' still holds strings.
# .assign returns a new frame, so the shared frames are not modified.
station_by_month = station_df.assign(Month=pd.to_datetime(station_df['Date']).dt.month)
city_by_month = city_day_df.assign(Month=pd.to_datetime(city_day_df['Date']).dt.month)

sns.boxplot(data=station_by_month, x='Month', y='AQI')
plt.title('Monthly AQI Distribution (Station)')
plt.show()

sns.boxplot(data=city_by_month, x='Month', y='AQI')
plt.title('Monthly AQI Distribution (City)')
plt.show()


In [None]:
# Mean concentration of every pollutant, per dataset, drawn as bar charts.
pollutants = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]
station_avg = station_df.loc[:, pollutants].mean()
city_avg = city_day_df.loc[:, pollutants].mean()

# One figure for the station averages ...
station_avg.plot.bar(figsize=(10, 5), title='Average Pollutants (Station)')
plt.show()

# ... and one for the city averages.
city_avg.plot.bar(figsize=(10, 5), title='Average Pollutants (City)')
plt.show()


In [None]:
from sklearn.feature_selection import mutual_info_regression

# Numerical pollutant columns scored against the AQI target.
features = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
X = station_df[features]
y = station_df['AQI']

# Mutual information between each feature and AQI.
# The estimator is randomised, so a fixed random_state is passed to make the
# ranking reproducible between runs (42 matches the seed used by the
# modelling cells).
feature_importances = mutual_info_regression(X, y, random_state=42)
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, most informative feature first.
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance for AQI')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the resulting (rows, columns) of both feature matrices.
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features and target used by the ANN and by the model cells that follow.
X = station_df[['PM2.5', 'PM10', 'NO', 'NO2', 'CO', 'SO2', 'O3']]  # Features
y = station_df['AQI']  # Target

# Train-test split (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the ANN exactly once. The network used to be fitted twice
# in this cell - first on the split left over from the previous cell, then
# again after X/y were redefined - and the first fit was discarded unused.
# The scaler is part of the pipeline because MLPs are sensitive to feature
# scale; it is fitted on the training rows only. ann_model.predict() keeps
# accepting the raw feature columns.
ann_model = make_pipeline(
    StandardScaler(),
    MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
)
ann_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
ann_preds = ann_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, ann_preds)
rmse = np.sqrt(mean_squared_error(y_test, ann_preds))
print(f"ANN - MAE: {mae:.2f}, RMSE: {rmse:.2f}")



In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Initialize and train the SVM model.
# SVMs are not scale invariant, so the features are standardised inside the
# pipeline (scaler fitted on the training rows only). svm_model.predict() and
# .score() keep accepting the raw feature columns.
# NOTE(review): SVR fit time grows faster than quadratically with the number
# of rows - consider sub-sampling if this cell is too slow.
svm_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=1.0, epsilon=0.1),
)
svm_model.fit(X_train, y_train)

# Evaluate the SVM model; .score() is the R^2 coefficient of determination.
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)
print(f"SVM Training Score: {svm_train_score:.2f}, Test Score: {svm_test_score:.2f}")


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Test-set predictions of both fitted models.
ann_preds = ann_model.predict(X_test)
svm_preds = svm_model.predict(X_test)

# Mean absolute error, then root mean squared error, for ANN and SVM in turn.
ann_mae, svm_mae = (
    mean_absolute_error(y_test, preds) for preds in (ann_preds, svm_preds)
)
ann_rmse, svm_rmse = (
    np.sqrt(mean_squared_error(y_test, preds)) for preds in (ann_preds, svm_preds)
)

print(f"ANN - MAE: {ann_mae:.2f}, RMSE: {ann_rmse:.2f}")
print(f"SVM - MAE: {svm_mae:.2f}, RMSE: {svm_rmse:.2f}")


In [None]:
# Stage 1: the ANN's predictions become a single-column feature matrix.
# NOTE(review): the training column is predicted for the same rows the ANN was
# fitted on; out-of-fold predictions would give a less optimistic second stage.
hybrid_X_train = np.reshape(ann_model.predict(X_train), (-1, 1))
hybrid_X_test = np.reshape(ann_model.predict(X_test), (-1, 1))

# Stage 2: an RBF support-vector regressor fitted on that single column.
svm_hybrid = SVR(C=1.0, epsilon=0.1, kernel='rbf')
svm_hybrid.fit(hybrid_X_train, y_train)

# Score the two-stage model on the held-out rows.
hybrid_preds = svm_hybrid.predict(hybrid_X_test)
hybrid_mae = mean_absolute_error(y_test, hybrid_preds)
hybrid_mse = mean_squared_error(y_test, hybrid_preds)
hybrid_rmse = np.sqrt(hybrid_mse)

print(f"Hybrid Model - MAE: {hybrid_mae:.2f}, RMSE: {hybrid_rmse:.2f}")


In [None]:
# One row per model: name, mean absolute error, root mean squared error.
results = pd.DataFrame(
    [
        ('ANN', ann_mae, ann_rmse),
        ('SVM', svm_mae, svm_rmse),
        ('Hybrid', hybrid_mae, hybrid_rmse),
    ],
    columns=['Model', 'MAE', 'RMSE'],
)

print(results)

# Grouped bar chart of both error measures, indexed by model name.
error_by_model = results.set_index('Model')[['MAE', 'RMSE']]
error_by_model.plot.bar(figsize=(8, 5))
plt.title('Model Performance Comparison')
plt.ylabel('Error')
plt.show()


In [None]:
from sklearn.impute import KNNImputer

# Select relevant columns for imputation; AQI_Bucket is kept as the last column.
knn_features = ['AQI', 'PM2.5', 'PM10', 'NO', 'NO2', 'CO', 'SO2', 'O3']
knn_data = station_df[knn_features + ['AQI_Bucket']].copy()

# Encode AQI_Bucket as an ordinal code. Missing labels stay NaN because
# Series.map yields NaN for anything that is not a key of the mapping.
knn_data['AQI_Bucket'] = knn_data['AQI_Bucket'].map({
    'Good': 0, 'Satisfactory': 1, 'Moderate': 2, 'Poor': 3,
    'Very Poor': 4, 'Severe': 5
})

# Impute using KNN
imputer = KNNImputer(n_neighbors=5)
knn_data_imputed = imputer.fit_transform(knn_data)

# The imputer drops columns that are entirely NaN, which would shift the
# position of the bucket column read below - fail loudly instead.
assert knn_data_imputed.shape[1] == knn_data.shape[1], "KNNImputer dropped an all-NaN column"

# Decode back to original categories.
# An imputed code is the average of the neighbours' codes and is generally
# fractional, so it is rounded to the nearest bucket (int() would truncate
# toward the lower bucket) and clipped to the valid 0..5 range.
aqi_bucket_mapping = {0: 'Good', 1: 'Satisfactory', 2: 'Moderate', 3: 'Poor', 4: 'Very Poor', 5: 'Severe'}
bucket_codes = np.clip(np.rint(knn_data_imputed[:, -1]), 0, 5).astype(int)
station_df['AQI_Bucket'] = [aqi_bucket_mapping[code] for code in bucket_codes]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Columns the classifier uses to predict the AQI bucket.
bucket_features = ['AQI', 'PM2.5', 'PM10', 'NO', 'NO2', 'CO', 'SO2', 'O3']

# Split the rows by whether the bucket label is present. The labelled part is
# copied so that encoding the target below does not write into a slice of
# station_df.
missing_mask = station_df['AQI_Bucket'].isna()
non_missing = station_df.loc[~missing_mask].copy()
missing = station_df.loc[missing_mask]

# Encode categorical target
target_mapping = {'Good': 0, 'Satisfactory': 1, 'Moderate': 2, 'Poor': 3, 'Very Poor': 4, 'Severe': 5}
non_missing['AQI_Bucket'] = non_missing['AQI_Bucket'].map(target_mapping)

X = non_missing[bucket_features]
y = non_missing['AQI_Bucket']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# The validation split is the only check on the classifier, so report it.
print(f"AQI_Bucket classifier validation accuracy: {model.score(X_val, y_val):.2f}")

# Predict for missing values.
# When no bucket is missing (for example after an earlier imputation step has
# already filled the column), predict() would be called on an empty frame and
# raise, so that case is skipped.
X_missing = missing[bucket_features]
reverse_mapping = {v: k for k, v in target_mapping.items()}
if missing.empty:
    predictions = np.array([], dtype=int)
    print("No missing AQI_Bucket values to impute.")
else:
    predictions = model.predict(X_missing)
    # Map predictions back to categories
    station_df.loc[missing_mask, 'AQI_Bucket'] = [reverse_mapping[val] for val in predictions]
