In [15]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Load the dataset. The 'date' column is parsed as datetimes and becomes the
# index, which the season derivation and the time-series plot below rely on.
file_path = 'delhiaqi.csv'
data = pd.read_csv(file_path, parse_dates=['date'], index_col='date')

# Report missing values per column before any imputation is applied.
print("Missing values before handling:")
print(data.isnull().sum())

# Forward-fill missing values (carry the last valid observation forward).
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated and
# is slated to raise in a future pandas release.
data = data.ffill()

# Descriptive statistics for every numeric column.
print("\nDescriptive Statistics:")
print(data.describe())

# Add a 'season' column based on the month.
# month % 12 turns December (12) into 0 so that Dec/Jan/Feb are contiguous;
# integer division by 3 then buckets the months into four groups (0-3) and
# the +1 shifts them onto the 1-4 keys used by the mapping below.
data['season'] = data.index.month % 12 // 3 + 1
data['season'] = data['season'].map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'})

# Seasonal Variation Analysis using Plotly: one PM2.5 box per season.
seasonal_boxplot = px.box(data, x='season', y='pm2_5', title='Seasonal Variation in PM2.5 Levels',
                          labels={'season': 'Season', 'pm2_5': 'PM2.5'})
seasonal_boxplot.update_layout(yaxis_title='PM2.5', xaxis_title='Season')
# auto_open=True asks Plotly to open the written file in a browser as well.
seasonal_boxplot.write_html('seasonal_variation_pm25.html', auto_open=True)

# Time Series Analysis using Plotly: PM2.5 against the datetime index.
time_series_plot = px.line(data, x=data.index, y='pm2_5', title='PM2.5 Levels Over Time',
                           labels={'date': 'Date', 'pm2_5': 'PM2.5'})
time_series_plot.update_layout(xaxis_title='Date', yaxis_title='PM2.5')
time_series_plot.write_html('time_series_pm25.html', auto_open=True)

# Correlation Matrix Analysis using Plotly.
# Keep only float64/int64 columns so the string-valued 'season' column added
# above is excluded before computing pairwise correlations.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()

correlation_heatmap = px.imshow(correlation_matrix, text_auto=True, color_continuous_scale='Viridis',
                                title='Correlation Matrix of Pollutants')
correlation_heatmap.update_layout(xaxis_title='Pollutants', yaxis_title='Pollutants')
correlation_heatmap.write_html('correlation_matrix_heatmap.html', auto_open=True)

# Summarise which files were written.
print("\nPlots saved as HTML files:")
print("1. Seasonal Variation: 'seasonal_variation_pm25.html'")
print("2. Time Series: 'time_series_pm25.html'")
print("3. Correlation Matrix: 'correlation_matrix_heatmap.html'")


Missing values before handling:
co       0
no       0
no2      0
o3       0
so2      0
pm2_5    0
pm10     0
nh3      0
dtype: int64

Descriptive Statistics:
                 co          no         no2          o3         so2  \
count    561.000000  561.000000  561.000000  561.000000  561.000000   
mean    3814.942210   51.181979   75.292496   30.141943   64.655936   
std     3227.744681   83.904476   42.473791   39.979405   61.073080   
min      654.220000    0.000000   13.370000    0.000000    5.250000   
25%     1708.980000    3.380000   44.550000    0.070000   28.130000   
50%     2590.180000   13.300000   63.750000   11.800000   47.210000   
75%     4432.680000   59.010000   97.330000   47.210000   77.250000   
max    16876.220000  425.580000  263.210000  164.510000  511.170000   

             pm2_5         pm10         nh3  
count   561.000000   561.000000  561.000000  
mean    358.256364   420.988414   26.425062  
std     227.359117   271.287026   36.563094  
min      60.100000


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.




Plots saved as HTML files:
1. Seasonal Variation: 'seasonal_variation_pm25.html'
2. Time Series: 'time_series_pm25.html'
3. Correlation Matrix: 'correlation_matrix_heatmap.html'


# Cross-validated hyperparameter tuning (XGBoost)

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load the dataset with the 'date' column parsed as datetimes and used as the index.
file_path = 'delhiaqi.csv'
data = pd.read_csv(file_path, parse_dates=['date'], index_col='date')

# Handle missing values by carrying the last valid observation forward.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated and
# is slated to raise in a future pandas release.
data = data.ffill()

# Feature engineering (example: using all columns except 'pm2_5' as features)
features = data.drop(columns=['pm2_5'])
target = data['pm2_5']

# Split the data into training (80%) and testing (20%) sets with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default; because the frame
# is indexed by date, adjacent timestamps can land on both sides of the
# split, which may make the hold-out metrics optimistic. Consider a
# chronological split or TimeSeriesSplit -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV (2 x 2 x 3 = 12 candidates, 5 folds each).
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}
# The estimator seed is pinned (same value as the split seed) so the search
# stays reproducible if sampling parameters are later added to the grid.
grid_search = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the full training set with the best parameters; fitting it
# again would only repeat that work.
best_model = grid_search.best_estimator_

# Make predictions on the held-out test set.
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is expressed in the same units as the target
r2 = r2_score(y_test, y_pred)

# Print metrics
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Mean Absolute Error (MAE): 13.0963
Mean Squared Error (MSE): 328.1647
Root Mean Squared Error (RMSE): 18.1153
R-squared (R²): 0.9910
