In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/content/air_quality_health_impact_data.csv')

In [3]:
data.head()

Unnamed: 0,RecordID,AQI,PM10,PM2_5,NO2,SO2,O3,Temperature,Humidity,WindSpeed,RespiratoryCases,CardiovascularCases,HospitalAdmissions,HealthImpactScore,HealthImpactClass
0,1,187.270059,295.853039,13.03856,6.639263,66.16115,54.62428,5.150335,84.424344,6.137755,7,5,1,97.244041,0.0
1,2,475.357153,246.254703,9.984497,16.318326,90.499523,169.621728,1.543378,46.851415,4.521422,10,2,0,100.0,0.0
2,3,365.996971,84.443191,23.11134,96.317811,17.87585,9.006794,1.169483,17.806977,11.157384,13,3,0,100.0,0.0
3,4,299.329242,21.020609,14.273403,81.234403,48.323616,93.161033,21.925276,99.473373,15.3025,8,8,1,100.0,0.0
4,5,78.00932,16.987667,152.111623,121.235461,90.866167,241.795138,9.217517,24.906837,14.534733,9,0,1,95.182643,0.0


In [4]:
#check the Null Values in the dataset
data.isnull().sum()

RecordID               0
AQI                    0
PM10                   0
PM2_5                  0
NO2                    0
SO2                    0
O3                     0
Temperature            0
Humidity               0
WindSpeed              0
RespiratoryCases       0
CardiovascularCases    0
HospitalAdmissions     0
HealthImpactScore      0
HealthImpactClass      0
dtype: int64

In [5]:
#checking the shape of the dataset
data.shape

(5811, 15)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5811 entries, 0 to 5810
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   RecordID             5811 non-null   int64  
 1   AQI                  5811 non-null   float64
 2   PM10                 5811 non-null   float64
 3   PM2_5                5811 non-null   float64
 4   NO2                  5811 non-null   float64
 5   SO2                  5811 non-null   float64
 6   O3                   5811 non-null   float64
 7   Temperature          5811 non-null   float64
 8   Humidity             5811 non-null   float64
 9   WindSpeed            5811 non-null   float64
 10  RespiratoryCases     5811 non-null   int64  
 11  CardiovascularCases  5811 non-null   int64  
 12  HospitalAdmissions   5811 non-null   int64  
 13  HealthImpactScore    5811 non-null   float64
 14  HealthImpactClass    5811 non-null   float64
dtypes: float64(11), int64(4)
memory usage:

In [41]:
# Features and Target Variables

X = data.iloc[:, 1:-2].values  # Select all columns except the first and the last
Y = data.iloc[:, -2].values    # Select the second last column

In [42]:
X

array([[187.27005942, 295.85303919,  13.03856044, ...,   7.        ,
          5.        ,   1.        ],
       [475.3571532 , 246.25470278,   9.98449713, ...,  10.        ,
          2.        ,   0.        ],
       [365.99697091,  84.44319074,  23.11133977, ...,  13.        ,
          3.        ,   0.        ],
       ...,
       [314.84179763,  41.89269906, 184.70855139, ...,  12.        ,
          2.        ,   3.        ],
       [208.0804732 , 165.53378512, 199.17725515, ...,   6.        ,
          2.        ,   3.        ],
       [ 83.26925675,  82.21626223, 119.96824423, ...,  14.        ,
          2.        ,   2.        ]])

In [43]:
Y

array([ 97.24404109, 100.        , 100.        , ..., 100.        ,
       100.        ,  81.66829811])

In [44]:
#Splitting the dataset

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [45]:
X_train.shape, X_test.shape

((4648, 12), (1163, 12))

In [46]:
# Standardize the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
Y_train.shape

(4648,)

###Build the LINEAR REGRESSION model


In [48]:
#Build the LINEAR REGRESSION model

linear_model = LinearRegression()
linear_model.fit(X_train_scaled, Y_train)

In [49]:
linear_pred = linear_model.predict(X_test_scaled)

In [50]:
linear_mae = mean_absolute_error(Y_test, linear_pred)
print("MAE", linear_mae)

linear_mse = mean_squared_error(Y_test, linear_pred)
print("MSE",linear_mae)

linear_r2 = r2_score(Y_test, linear_pred)
print("R2 Score", linear_r2)

MAE 7.354111310534881
MSE 7.354111310534881
R2 Score 0.5053377752606641


##Building the SVR

In [51]:
from sklearn.svm import SVR
SVR = SVR()
SVR.fit(X_train_scaled, Y_train)

In [52]:
SVR_pred = SVR.predict(X_test_scaled)

In [55]:
SVR_mae = mean_absolute_error(Y_test, SVR_pred)
print("MAE", SVR_mae)

SVR_mse = mean_squared_error(Y_test, SVR_pred)
print("MSE",SVR_mae)

SVR_r2 = r2_score(Y_test, SVR_pred)
print("R2 Score", SVR_r2)

MAE 4.708029871981457
MSE 4.708029871981457
R2 Score 0.5575655272162192


##Random forest regression

In [57]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, Y_train)

In [58]:
rf_pred = rf_model.predict(X_test_scaled)

In [59]:
rf_mae = mean_absolute_error(Y_test, rf_pred)
print("MAE", rf_mae)

rf_mse = mean_squared_error(Y_test, rf_pred)
print("MSE",rf_mae)

rf_r2 = r2_score(Y_test, rf_pred)
print("R2 Score", rf_r2)

MAE 1.569198567637857
MSE 1.569198567637857
R2 Score 0.9454070921231519


## Saving the RF model


In [61]:
import joblib
joblib.dump(rf_model, 'AIR_QUALITY_PREDICTION_MODEL.pkl')

['AIR_QUALITY_PREDICTION_MODEL.pkl']