In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the dataset is later read through a relative path rather than
# from under /content/drive — confirm whether this mount is actually needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Machine Learning Project : Air Quality Prediction
---
# Importing modules :

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Loading the dataset into a pandas dataframe :
- ### Here our **target variable** is **'count'**.

In [None]:
# Location and encoding of the raw CSV, kept in one place so they are easy to change.
# NOTE(review): the previewed city names look garbled under latin-1 — confirm the
# file's real encoding before relying on the 'City' values.
CSV_PATH = "Air_Quality_Data_2022.csv"
CSV_ENCODING = 'latin-1'

data = pd.read_csv(CSV_PATH, encoding=CSV_ENCODING)

# Preview the first five rows.
data.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,28-03-2022,SK,Koï¿½ï¿,co,92,3.8,10.6,4.9,13.19
1,03-05-2022,SK,Koï¿½ï¿,co,92,3.1,9.9,4.2,5.59
2,15-05-2022,SK,Koï¿½ï¿,co,92,2.9,19.0,3.8,31.46
3,04-09-2022,SK,Koï¿½ï¿,co,92,2.5,8.5,2.9,22.18
4,13-10-2022,SK,Koï¿½ï¿,co,92,2.5,12.8,4.9,37.85


# Checking more details about our dataframe :

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple.
data.shape

(1048571, 9)

In [None]:
# Column labels of the loaded dataframe.
data.columns

Index(['Date', 'Country', 'City', 'Specie', 'count', 'min', 'max', 'median',
       'variance'],
      dtype='object')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): negative values show up for 'min', 'max' and 'median' — confirm
# whether they are valid readings or sensor/ingestion artefacts.
data.describe()

Unnamed: 0,count,min,max,median,variance
count,1048571.0,1048571.0,1048571.0,1048571.0,1048571.0
mean,134.7271,106.0892,135.3171,117.9887,3102.38
std,206.5507,291.8964,294.057,293.2422,51625.86
min,2.0,-3259.1,-2635.7,-2671.1,0.0
25%,44.0,1.0,9.8,3.8,19.96
50%,72.0,5.0,26.9,14.0,110.34
75%,144.0,22.0,82.0,42.4,759.84
max,2616.0,1048.2,3221.0,2064.0,19952400.0


In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048571 entries, 0 to 1048570
Data columns (total 9 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   Date      1048571 non-null  object 
 1   Country   1048571 non-null  object 
 2   City      1048571 non-null  object 
 3   Specie    1048571 non-null  object 
 4   count     1048571 non-null  int64  
 5   min       1048571 non-null  float64
 6   max       1048571 non-null  float64
 7   median    1048571 non-null  float64
 8   variance  1048571 non-null  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 72.0+ MB


### Only numeric columns are used for regression: 'Country' and 'City' are one-hot encoded below, while 'Date' and 'Specie' are dropped.
---
# Checking null values :

In [None]:
# Number of missing (null) entries in each column.
data.isnull().sum()

Date        0
Country     0
City        0
Specie      0
count       0
min         0
max         0
median      0
variance    0
dtype: int64

# So there are no null values in our data.
---
## Converting the **'Country'** and **'City'** columns into numerical representations using one-hot encoding, ensuring the new columns have integer data types :

In [None]:
# Categorical columns to expand into one indicator (0/1) column per category.
categorical_columns = ['Country', 'City']

# NOTE: `data` is reassigned here, so the source columns are consumed; re-running
# this cell without reloading the CSV raises a KeyError.
data = pd.get_dummies(data, columns=categorical_columns, dtype=int)

# Preview the widened frame.
data.head()

Unnamed: 0,Date,Specie,count,min,max,median,variance,Country_AU,Country_BA,Country_BD,...,City_Yokohama,City_Zabrze,City_Zagreb,City_Zaragoza,City_Zarqa,City_Zenica,City_ýýita,City_ýýzmir,City_ýýzmit,City_ýýýýdýý
0,28-03-2022,co,92,3.8,10.6,4.9,13.19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,03-05-2022,co,92,3.1,9.9,4.2,5.59,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15-05-2022,co,92,2.9,19.0,3.8,31.46,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,04-09-2022,co,92,2.5,8.5,2.9,22.18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13-10-2022,co,92,2.5,12.8,4.9,37.85,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Separating features and target variable :

In [None]:
# Target: the 'count' column.
y = data['count']

# Features: everything except the target and the non-numeric 'Date' / 'Specie' columns.
x = data.drop(columns=['Date', 'count', 'Specie'])

# Splitting our data into training and testing sets :

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling the features :

In [None]:
# Learn per-feature mean and standard deviation from the training split only,
# so no statistics from the test split leak into the scaler.
scaler = StandardScaler()
scaler.fit(x_train)

# Apply the training-derived scaling to both splits.
# NOTE: both variables are overwritten with their scaled versions, so this cell
# must not be re-run without re-creating the split first.
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Linear Regression :

In [None]:
# Fit an ordinary least-squares baseline on the scaled training features
# and predict on the held-out test split.
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

# Evaluating the model :
# RMSE is computed as the square root of the MSE instead of passing
# `squared = False`: that argument was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError.
rmse = mean_squared_error(y_test, y_pred_lr) ** 0.5
r2 = r2_score(y_test, y_pred_lr)

# NOTE: for regressors `.score()` returns the R-squared value, so the
# "Accuracy" line and the "R-squared value" line report the same number.
print("Linear Regression Accuracy :", lr.score(x_test, y_test))
print("Root Mean Squared Error (RMSE) :",rmse)
print("R-squared value :", r2)

Linear Regression Accuracy : 0.7998358093726134
Root Mean Squared Error (RMSE) : 92.8630397804649
R-squared value : 0.7998358093726134




# Decision Tree Regressor :

In [None]:
# Fit a decision tree on the scaled training features and predict on the test split.
# A fixed `random_state` makes the fitted tree — and therefore every metric
# derived from `dtr` / `y_pred_dtr` in later cells — reproducible across re-runs;
# without it, separate runs can yield slightly different trees and scores.
dtr = DecisionTreeRegressor(random_state = 42)
dtr.fit(x_train, y_train)
y_pred_dtr = dtr.predict(x_test)

# NOTE: for regressors `.score()` returns the R-squared value, not a classification accuracy.
print("Decision Tree Regressor Accuracy :", dtr.score(x_test, y_test))

Decision Tree Regressor Accuracy : 0.9315929468429429


## Evaluating Decision Tree Regressor :

In [None]:
# RMSE is computed as the square root of the MSE instead of passing
# `squared = False`: that argument was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError.
rmse = mean_squared_error(y_test, y_pred_dtr) ** 0.5
r2 = r2_score(y_test, y_pred_dtr)

print("Root Mean Squared Error (RMSE) :",rmse)
print("R-squared value :", r2)

Root Mean Squared Error (RMSE) : 53.81748781245293
R-squared value : 0.9327724265984698




# Random Forest Regressor :

In [None]:
# Fit a random forest on the scaled training features and predict on the test split.
# A fixed `random_state` pins the bootstrap samples and per-tree randomness so
# the reported metrics are reproducible across re-runs.
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(x_train, y_train)
y_pred_rfr = rfr.predict(x_test)

# NOTE: for regressors `.score()` returns the R-squared value, not a classification accuracy.
print("Random Forest Regressor Accuracy :", rfr.score(x_test, y_test))

Random Forest Regressor Accuracy : 0.9605288558236772


## Evaluating the Random Forest Regressor :

In [None]:
# RMSE is computed as the square root of the MSE instead of passing
# `squared = False`: that argument was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError.
rmse = mean_squared_error(y_test, y_pred_rfr) ** 0.5
r2 = r2_score(y_test, y_pred_rfr)

print("Root Mean Squared Error (RMSE) :",rmse)
print("R-squared value :", r2)

Root Mean Squared Error (RMSE) : 41.2372372366389
R-squared value : 0.9605288558236772




## So the Random Forest Regressor is the best of the three models on our data, with the highest R-squared (≈ 0.96) and the lowest RMSE (≈ 41.2) on the test set.