In [None]:
import datetime
import json
import os
import time
from urllib.parse import urlencode

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive in the Colab filesystem; the raw CSV is read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Preparation


In [None]:
# Load the raw pollution export. As the displayed rows 0-2 show, its first rows
# hold header fragments (substance, metric, 'date') rather than measurements.
df_polution_data = pd.read_csv('/content/drive/MyDrive/Daft/df_polution_f.csv')
df_polution_data

Unnamed: 0,Column1,Column110,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,...,Column22,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31
0,,monitor,co,co,co,no,no,no,no2,no2,...,pm2_5,pm4,pm4,pm4,tsp,tsp,tsp,o3,o3,o3
1,,,mean,max,min,mean,max,min,mean,max,...,min,mean,max,min,mean,max,min,mean,max,min
2,date,,,,,,,,,,...,,,,,,,,,,
3,2024-05-09,DCC-AQ1,0.405,0.47,0.31,22.4925,38.96,7.94,107.7375,134.64,...,,,,,,,,,,
4,2024-05-10,DCC-AQ1,0.2733333333333334,0.35,0.22,14.8775,63.23,0.8,59.97833333333333,157.89,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4836,2024-11-05,DCC-AQ11,0.285,0.52,0.13,,,,32.54208333333333,46.31,...,6.14,11.212,16.28,7.09,,,,,,
4837,2024-11-06,DCC-AQ11,0.23916666666666667,0.48,0.15,,,,25.205416666666665,54.72,...,,,,,,,,,,
4838,2024-11-07,DCC-AQ11,0.18166666666666667,0.22,0.16,,,,14.658333333333333,22.75,...,,,,,,,,,,
4839,2024-11-08,DCC-AQ11,0.21041666666666667,0.26,0.16,,,,17.629583333333333,29.75,...,,,,,,,,,,


In [None]:
# The real header is spread over the first two rows: row 0 names the substance,
# row 1 names the statistic. Join them as "<substance>#<metric>", leaving out
# the first column, which becomes 'Date'.
substances = df_polution_data.iloc[0, 1:].fillna('')
metrics = df_polution_data.iloc[1, 1:].fillna('')
combined_headers = substances + "#" + metrics

headers = ['Date', *combined_headers.tolist()]

# Rows 0-2 are header fragments, so the measurements start at row 3.
polution_data = (
    df_polution_data
    .iloc[3:]
    .set_axis(headers, axis=1)
    .reset_index(drop=True)
)
polution_data

Unnamed: 0,Date,monitor#,co#mean,co#max,co#min,no#mean,no#max,no#min,no2#mean,no2#max,...,pm2_5#min,pm4#mean,pm4#max,pm4#min,tsp#mean,tsp#max,tsp#min,o3#mean,o3#max,o3#min
0,2024-05-09,DCC-AQ1,0.405,0.47,0.31,22.4925,38.96,7.94,107.7375,134.64,...,,,,,,,,,,
1,2024-05-10,DCC-AQ1,0.2733333333333334,0.35,0.22,14.8775,63.23,0.8,59.97833333333333,157.89,...,,,,,,,,,,
2,2024-05-11,DCC-AQ1,0.27875,0.34,0.22,4.066666666666666,22.32,0.28,40.437916666666666,87.62,...,,,,,,,,,,
3,2024-05-12,DCC-AQ1,0.295,0.38,0.25,2.1666666666666665,12.72,0.21,31.714583333333334,94.98,...,,,,,,,,,,
4,2024-05-13,DCC-AQ1,0.23458333333333334,0.3,0.16,1.7,5.1,-0.04,16.925,28.72,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833,2024-11-05,DCC-AQ11,0.285,0.52,0.13,,,,32.54208333333333,46.31,...,6.14,11.212,16.28,7.09,,,,,,
4834,2024-11-06,DCC-AQ11,0.23916666666666667,0.48,0.15,,,,25.205416666666665,54.72,...,,,,,,,,,,
4835,2024-11-07,DCC-AQ11,0.18166666666666667,0.22,0.16,,,,14.658333333333333,22.75,...,,,,,,,,,,
4836,2024-11-08,DCC-AQ11,0.21041666666666667,0.26,0.16,,,,17.629583333333333,29.75,...,,,,,,,,,,


In [None]:
# Reshape from wide to long: one row per (Date, monitor, substance#metric) value.
polution_data1 = pd.melt(
    polution_data,
    id_vars=['Date', 'monitor#'],
    var_name='Substance_Metric',
    value_name='Value',
)

# Break the combined "<substance>#<metric>" label into its two parts and
# replace the combined column with them.
split_columns = polution_data1['Substance_Metric'].str.split('#', expand=True)
polution_data1 = (
    polution_data1
    .assign(Substance=split_columns[0], Metric=split_columns[1])
    .drop(columns=['Substance_Metric'])
)
print(polution_data1)

              Date  monitor#                Value Substance Metric
0       2024-05-09   DCC-AQ1                0.405        co   mean
1       2024-05-10   DCC-AQ1   0.2733333333333334        co   mean
2       2024-05-11   DCC-AQ1              0.27875        co   mean
3       2024-05-12   DCC-AQ1                0.295        co   mean
4       2024-05-13   DCC-AQ1  0.23458333333333334        co   mean
...            ...       ...                  ...       ...    ...
145135  2024-11-05  DCC-AQ11                  NaN        o3    min
145136  2024-11-06  DCC-AQ11                  NaN        o3    min
145137  2024-11-07  DCC-AQ11                  NaN        o3    min
145138  2024-11-08  DCC-AQ11                  NaN        o3    min
145139  2024-11-09  DCC-AQ11                  NaN        o3    min

[145140 rows x 5 columns]


In [None]:
# Drop the trailing '#' left on the id column by the header join.
# (A bare `value_counts()` call whose result was never displayed or stored
# has been removed from this cell.)
polution_data1 = polution_data1.rename(columns={'monitor#': 'monitor'})
polution_data1

Unnamed: 0,Date,monitor,Value,Substance,Metric
0,2024-05-09,DCC-AQ1,0.405,co,mean
1,2024-05-10,DCC-AQ1,0.2733333333333334,co,mean
2,2024-05-11,DCC-AQ1,0.27875,co,mean
3,2024-05-12,DCC-AQ1,0.295,co,mean
4,2024-05-13,DCC-AQ1,0.23458333333333334,co,mean
...,...,...,...,...,...
145135,2024-11-05,DCC-AQ11,,o3,min
145136,2024-11-06,DCC-AQ11,,o3,min
145137,2024-11-07,DCC-AQ11,,o3,min
145138,2024-11-08,DCC-AQ11,,o3,min


In [None]:
# Keep only rows with no missing field, e.g. drop melted rows whose 'Value' is NaN.
polution_data2 = polution_data1.dropna()
polution_data2

Unnamed: 0,Date,monitor,Value,Substance,Metric
0,2024-05-09,DCC-AQ1,0.405,co,mean
1,2024-05-10,DCC-AQ1,0.2733333333333334,co,mean
2,2024-05-11,DCC-AQ1,0.27875,co,mean
3,2024-05-12,DCC-AQ1,0.295,co,mean
4,2024-05-13,DCC-AQ1,0.23458333333333334,co,mean
...,...,...,...,...,...
142991,2024-11-05,DCC-AQ69,-1.4,o3,min
142992,2024-11-06,DCC-AQ69,-1.78,o3,min
142993,2024-11-07,DCC-AQ69,5.63,o3,min
142994,2024-11-08,DCC-AQ69,1.58,o3,min


In [None]:
# Convert 'Value' to float. `assign` returns a new frame, which avoids the
# SettingWithCopyWarning that in-place column assignment triggers on the
# result of `dropna()`.
polution_data2 = polution_data2.assign(Value=polution_data2['Value'].astype(float))
polution_data2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62778 entries, 0 to 142995
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       62778 non-null  object 
 1   monitor    62778 non-null  object 
 2   Value      62778 non-null  float64
 3   Substance  62778 non-null  object 
 4   Metric     62778 non-null  object 
dtypes: float64(1), object(4)
memory usage: 4.9+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polution_data2['Value'] = polution_data2['Value'].astype(float)


In [None]:
# Number of remaining measurements per substance.
polution_data2['Substance'].value_counts()

Unnamed: 0_level_0,count
Substance,Unnamed: 1_level_1
pm2_5,13635
pm10,13617
pm1,12111
pm4,6561
no2,5298
tsp,4389
no,2868
so2,2439
o3,987
co,873


## Dataset with detector locations


In [None]:
import requests
import json
import pandas as pd
import time
import datetime
from dateutil.relativedelta import relativedelta
import folium
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt

In [None]:
#Setting the date parameters: the current moment and the moment three months earlier
import time
import datetime
from dateutil.relativedelta import relativedelta
# NOTE: `d` is the wall-clock time of the run, so every value below changes
# from run to run.
d = datetime.datetime.now()
print(d)
currentTimeDate = d - relativedelta(months=3)
currentTime = currentTimeDate.strftime('%Y-%m-%d')
print(currentTime,type(currentTimeDate))
# Unix timestamps (local time) of "now" and of the three-month lookback point.
unixtime = time.mktime(d.timetuple())
qq = round(time.mktime(currentTimeDate.timetuple()))
print(round(unixtime))
print(qq)

2024-12-08 10:03:06.824220
2024-09-08 <class 'datetime.datetime'>
1733652186
1725789786


In [None]:
# The API credentials are read from the environment so that they are not stored
# in the notebook. Set SONITUS_USERNAME and SONITUS_PASSWORD before running.
username = os.environ.get('SONITUS_USERNAME')
pwd = os.environ.get('SONITUS_PASSWORD')
if not username or not pwd:
    raise RuntimeError(
        'Set the SONITUS_USERNAME and SONITUS_PASSWORD environment variables '
        'before querying the monitors API.'
    )

# The endpoint takes the credentials as query parameters; build the URL from the
# variables (urlencode escapes them) instead of repeating the values inline.
url = 'https://data.smartdublin.ie/sonitus-api/api/monitors?' + urlencode(
    {'username': username, 'password': pwd}
)

In [None]:
# Execute the request. The timeout keeps the cell from hanging indefinitely
# if the service does not answer.
r = requests.post(url, headers={'accept': '*/*'}, timeout=30)
# Stop here on an HTTP error (e.g. rejected credentials) rather than failing
# later while building a DataFrame from an error payload.
r.raise_for_status()
result = r.json()
monitors_data = pd.DataFrame(result)
# Make the coordinates numeric (presumably they arrive as strings - confirm).
monitors_data[['latitude','longitude']] = monitors_data[['latitude','longitude']].apply(pd.to_numeric)
monitors_data

Unnamed: 0,serial_number,label,location,latitude,longitude,last_calibrated
0,10.1.1.1,Noise 1,Drumcondra Library,53.369864,-6.258966,2018-08-29
1,01749,Noise 2,Bull Island,53.36866,-6.149316,2023-03-13
2,01508,Noise 3,Ballyfermot Civic Centre,53.343337,-6.362923,2022-02-08
3,10118,Noise 4,Ballymun,53.390401,-6.264755,2023-03-01
4,01548,Noise 5,DCC Rowing Club,53.346116,-6.321013,2023-02-21
5,10115,Noise 6,Walkinstown,53.319492,-6.321945,2023-02-28
6,10.1.1.7,Noise 7,Woodstock Gardens,53.323524,-6.247734,2021-01-20
7,01870,Noise 8,Navan Road,53.370758,-6.325578,2023-08-22
8,01575,Noise 9,Raheny,53.379996,-6.172829,2023-02-21
9,01737,Noise 10,Ringsend Sports Centre,53.340031,-6.220023,2023-03-07


In [None]:
# Keep only the monitors whose label does not mention 'Noise', together with
# the columns needed to locate them.
is_noise_monitor = monitors_data['label'].str.contains('Noise')
monitors_position = (
    monitors_data
    .loc[~is_noise_monitor, ['serial_number', 'latitude', 'longitude']]
    .reset_index(drop = True)
)

monitors_position

Unnamed: 0,serial_number,latitude,longitude
0,DCC-AQ1,53.344239,-6.271525
1,DCC-AQ2,53.368067,-6.22785
2,DCC-AQ3,53.364442,-6.348967
3,DCC-AQ4,53.390281,-6.305769
4,DCC-AQ5,53.340148,-6.35181
5,DCC-AQ6,53.336289,-6.309006
6,DCC-AQ7,53.385644,-6.369925
7,DCC-AQ8,53.285756,-6.131822
8,DCC-AQ9,53.280533,-6.356044
9,DCC-AQ10,53.345707,-6.295775


In [None]:
# Use the same key name ('monitor') as the pollution table so the two can be merged.
monitors_position = monitors_position.rename(columns={'serial_number': 'monitor'})
monitors_position

Unnamed: 0,monitor,latitude,longitude
0,DCC-AQ1,53.344239,-6.271525
1,DCC-AQ2,53.368067,-6.22785
2,DCC-AQ3,53.364442,-6.348967
3,DCC-AQ4,53.390281,-6.305769
4,DCC-AQ5,53.340148,-6.35181
5,DCC-AQ6,53.336289,-6.309006
6,DCC-AQ7,53.385644,-6.369925
7,DCC-AQ8,53.285756,-6.131822
8,DCC-AQ9,53.280533,-6.356044
9,DCC-AQ10,53.345707,-6.295775


## Merging the data and adding columns

In [None]:
# Attach each monitor's coordinates to its measurements. The left join keeps
# measurements even when a monitor has no position entry.
pollution_data = polution_data2.merge(monitors_position, how = 'left', on='monitor')
pollution_data

NameError: name 'polution_data2' is not defined

In [None]:
# Column dtypes and non-null counts of the merged table.
pollution_data.info()

NameError: name 'pollution_data' is not defined

In [None]:
# Per-substance thresholds, one row per substance:
# (substance, upper bound of 'good', upper bound of 'satisfactory', 'poor').
# NOTE(review): 'poor' repeats the 'satisfactory' value for every substance and
# is not read by classify_pollution - confirm whether it is needed.
_threshold_rows = [
    ('co', 9, 35, 35),
    ('no', 50, 100, 100),
    ('no2', 25, 100, 100),
    ('so2', 20, 75, 75),
    ('pm1', 5, 15, 15),
    ('pm10', 10, 25, 25),
    ('pm2_5', 15, 30, 30),
    ('pm4', 20, 50, 50),
    ('tsp', 50, 100, 100),
    ('o3', 50, 100, 100),
]
level = pd.DataFrame(_threshold_rows, columns=['Substance', 'good', 'satisfactory', 'poor'])
level

Unnamed: 0,Substance,good,satisfactory,poor
0,co,9,35,35
1,no,50,100,100
2,no2,25,100,100
3,so2,20,75,75
4,pm1,5,15,15
5,pm10,10,25,25
6,pm2_5,15,30,30
7,pm4,20,50,50
8,tsp,50,100,100
9,o3,50,100,100


In [None]:
level.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Substance     10 non-null     object
 1   good          10 non-null     int64 
 2   satisfactory  10 non-null     int64 
 3   poor          10 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 448.0+ bytes


In [None]:
#Bring the per-substance thresholds onto every measurement row so each row can be classified
pollution_data_all = pollution_data.merge(level, how = 'left', on='Substance')
def classify_pollution(row):
    """Return the pollution level (1, 2 or 3) for one measurement row.

    `row` must provide 'Value', 'good' and 'satisfactory'. The result is
    1 when Value <= good, 2 when Value <= satisfactory, and 3 otherwise
    (which includes rows whose thresholds are NaN, since both comparisons
    are then false).
    """
    value = row['Value']
    if value <= row['good']:
        return 1
    return 2 if value <= row['satisfactory'] else 3


# Classify every row, then discard the threshold columns that were only needed
# for the classification.
pollution_data_all = (
    pollution_data_all
    .assign(Pollution_Level=pollution_data_all.apply(classify_pollution, axis=1))
    .drop(columns = ['good', 'satisfactory', 'poor'])
)
pollution_data_all


Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level
0,2024-05-09,DCC-AQ1,0.405000,co,mean,53.344239,-6.271525,1
1,2024-05-10,DCC-AQ1,0.273333,co,mean,53.344239,-6.271525,1
2,2024-05-11,DCC-AQ1,0.278750,co,mean,53.344239,-6.271525,1
3,2024-05-12,DCC-AQ1,0.295000,co,mean,53.344239,-6.271525,1
4,2024-05-13,DCC-AQ1,0.234583,co,mean,53.344239,-6.271525,1
...,...,...,...,...,...,...,...,...
62773,2024-11-05,DCC-AQ69,-1.400000,o3,min,53.345053,-6.254344,1
62774,2024-11-06,DCC-AQ69,-1.780000,o3,min,53.345053,-6.254344,1
62775,2024-11-07,DCC-AQ69,5.630000,o3,min,53.345053,-6.254344,1
62776,2024-11-08,DCC-AQ69,1.580000,o3,min,53.345053,-6.254344,1


In [None]:
# How many level-3 rows each substance has.
pollution_data_all.loc[pollution_data_all['Pollution_Level'] == 3, 'Substance'].value_counts()

Unnamed: 0_level_0,count
Substance,Unnamed: 1_level_1
pm10,1453
pm1,409
pm2_5,95
no,84
no2,30
pm4,22
tsp,21
so2,6
o3,2
co,1


In [None]:
# Total number of rows per substance, for comparison with the level-3 counts.
pollution_data_all['Substance'].value_counts()

Unnamed: 0_level_0,count
Substance,Unnamed: 1_level_1
pm2_5,13635
pm10,13617
pm1,12111
pm4,6561
no2,5298
tsp,4389
no,2868
so2,2439
o3,987
co,873


In [None]:
# Checkpoint the classified table as CSV (without the index) in the working directory.
pollution_data_all.to_csv('pollution_data_all', index = False)

In [None]:
# Class balance of the target: number of rows per pollution level.
pollution_data_all['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
1,49203
2,11452
3,2123


## Adding temporal data


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [None]:
# Reload the checkpoint saved by `pollution_data_all.to_csv('pollution_data_all', ...)`.
# The relative path matches the one used for writing, so reading works from
# whatever working directory the file was written in (not only /content).
pollution = pd.read_csv('pollution_data_all')
pollution

Unnamed: 0.1,Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level
0,0,2024-05-09,1,0.405000,co,mean,53.344239,-6.271525,1
1,1,2024-05-10,1,0.273333,co,mean,53.344239,-6.271525,1
2,2,2024-05-11,1,0.278750,co,mean,53.344239,-6.271525,1
3,3,2024-05-12,1,0.295000,co,mean,53.344239,-6.271525,1
4,4,2024-05-13,1,0.234583,co,mean,53.344239,-6.271525,1
...,...,...,...,...,...,...,...,...,...
62773,62773,2024-11-05,14,-1.400000,o3,min,53.390433,-6.265270,1
62774,62774,2024-11-06,14,-1.780000,o3,min,53.390433,-6.265270,1
62775,62775,2024-11-07,14,5.630000,o3,min,53.390433,-6.265270,1
62776,62776,2024-11-08,14,1.580000,o3,min,53.390433,-6.265270,1


In [None]:
#Adding 'month' and 'day_of_week' columns derived from the parsed date
parsed_dates = pd.to_datetime(pollution['Date'])
pollution = pollution.assign(
    Date=parsed_dates,
    month=parsed_dates.dt.month,
    day_of_week=parsed_dates.dt.day_name(),
)
pollution

Unnamed: 0.1,Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level,month,day_of_week
0,0,2024-05-09,1,0.405000,co,mean,53.344239,-6.271525,1,5,Thursday
1,1,2024-05-10,1,0.273333,co,mean,53.344239,-6.271525,1,5,Friday
2,2,2024-05-11,1,0.278750,co,mean,53.344239,-6.271525,1,5,Saturday
3,3,2024-05-12,1,0.295000,co,mean,53.344239,-6.271525,1,5,Sunday
4,4,2024-05-13,1,0.234583,co,mean,53.344239,-6.271525,1,5,Monday
...,...,...,...,...,...,...,...,...,...,...,...
62773,62773,2024-11-05,14,-1.400000,o3,min,53.390433,-6.265270,1,11,Tuesday
62774,62774,2024-11-06,14,-1.780000,o3,min,53.390433,-6.265270,1,11,Wednesday
62775,62775,2024-11-07,14,5.630000,o3,min,53.390433,-6.265270,1,11,Thursday
62776,62776,2024-11-08,14,1.580000,o3,min,53.390433,-6.265270,1,11,Friday


In [None]:
# Encode weekdays as numbers: Monday=1 ... Sunday=7
day_mapping = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
# Take the weekday names from 'Date' rather than from the 'day_of_week' column:
# mapping a column that already holds numbers would turn every value into NaN,
# so deriving from 'Date' keeps this cell safe to re-run.
# Expects 'Date' to be a datetime column (parsed in the previous cell).
pollution['day_of_week'] = pollution['Date'].dt.day_name().map(day_mapping)
pollution

Unnamed: 0.1,Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level,month,day_of_week
0,0,2024-05-09,1,0.405000,co,mean,53.344239,-6.271525,1,5,4
1,1,2024-05-10,1,0.273333,co,mean,53.344239,-6.271525,1,5,5
2,2,2024-05-11,1,0.278750,co,mean,53.344239,-6.271525,1,5,6
3,3,2024-05-12,1,0.295000,co,mean,53.344239,-6.271525,1,5,7
4,4,2024-05-13,1,0.234583,co,mean,53.344239,-6.271525,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...
62773,62773,2024-11-05,14,-1.400000,o3,min,53.390433,-6.265270,1,11,2
62774,62774,2024-11-06,14,-1.780000,o3,min,53.390433,-6.265270,1,11,3
62775,62775,2024-11-07,14,5.630000,o3,min,53.390433,-6.265270,1,11,4
62776,62776,2024-11-08,14,1.580000,o3,min,53.390433,-6.265270,1,11,5


In [None]:
# Checkpoint the table with the added 'month' and 'day_of_week' columns.
pollution.to_csv('pollution_with_days', index = False)

In [None]:
#filtering the dataset to the 'max' metric and to months before November
# Read from the same relative path that `pollution.to_csv('pollution_with_days', ...)`
# writes to, so the cell does not depend on the working directory being /content.
pollution = pd.read_csv('pollution_with_days')
pollution1 = pollution[(pollution['month'] < 11) & (pollution['Metric'] == 'max')]
pollution1

Unnamed: 0.1,Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level,month,day_of_week
291,291,2024-05-09,1,0.47,co,max,53.344239,-6.271525,1,5,4
292,292,2024-05-10,1,0.35,co,max,53.344239,-6.271525,1,5,5
293,293,2024-05-11,1,0.34,co,max,53.344239,-6.271525,1,5,6
294,294,2024-05-12,1,0.38,co,max,53.344239,-6.271525,1,5,7
295,295,2024-05-13,1,0.30,co,max,53.344239,-6.271525,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...
62435,62435,2024-10-27,14,56.26,o3,max,53.390433,-6.265270,2,10,7
62436,62436,2024-10-28,14,45.48,o3,max,53.390433,-6.265270,1,10,1
62437,62437,2024-10-29,14,32.25,o3,max,53.390433,-6.265270,1,10,2
62438,62438,2024-10-30,14,23.42,o3,max,53.390433,-6.265270,1,10,3


In [None]:
#filtering the dataset to months before November (all metrics kept)
pollution2 = pollution[pollution['month'] < 11]
pollution2
# CSV export of this subset is disabled; enable it if a checkpoint file is needed.
#pollution2.to_csv('pollution2', index = False)

Unnamed: 0.1,Unnamed: 0,Date,monitor,Value,Substance,Metric,latitude,longitude,Pollution_Level,month,day_of_week
0,0,2024-05-09,1,0.405000,co,mean,53.344239,-6.271525,1,5,4
1,1,2024-05-10,1,0.273333,co,mean,53.344239,-6.271525,1,5,5
2,2,2024-05-11,1,0.278750,co,mean,53.344239,-6.271525,1,5,6
3,3,2024-05-12,1,0.295000,co,mean,53.344239,-6.271525,1,5,7
4,4,2024-05-13,1,0.234583,co,mean,53.344239,-6.271525,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...
62764,62764,2024-10-27,14,5.190000,o3,min,53.390433,-6.265270,1,10,7
62765,62765,2024-10-28,14,7.060000,o3,min,53.390433,-6.265270,1,10,1
62766,62766,2024-10-29,14,0.340000,o3,min,53.390433,-6.265270,1,10,2
62767,62767,2024-10-30,14,-0.920000,o3,min,53.390433,-6.265270,1,10,3


Using GridSearchCV or RandomizedSearchCV to tune hyperparameters for my classification model

## Adding substance columns

In [None]:
# One-hot encode the substance so each pollutant gets its own indicator column.
# The subset is rebuilt from `pollution` (months before November) instead of
# being read from '/content/pollution2': the only visible export of that file
# is commented out, so the read fails on a fresh run. Rebuilding also keeps
# this cell safe to re-run.
pollution2 = pd.get_dummies(
    pollution[pollution['month'] < 11],
    columns=['Substance'],
    drop_first=False,
)

Unnamed: 0,Date,monitor,Value,Metric,latitude,longitude,Pollution_Level,month,day_of_week,Substance_co,Substance_no,Substance_no2,Substance_o3,Substance_pm1,Substance_pm10,Substance_pm2_5,Substance_pm4,Substance_so2,Substance_tsp
0,2024-05-09,DCC-AQ1,0.405000,mean,53.344239,-6.271525,1,5,4,True,False,False,False,False,False,False,False,False,False
1,2024-05-10,DCC-AQ1,0.273333,mean,53.344239,-6.271525,1,5,5,True,False,False,False,False,False,False,False,False,False
2,2024-05-11,DCC-AQ1,0.278750,mean,53.344239,-6.271525,1,5,6,True,False,False,False,False,False,False,False,False,False
3,2024-05-12,DCC-AQ1,0.295000,mean,53.344239,-6.271525,1,5,7,True,False,False,False,False,False,False,False,False,False
4,2024-05-13,DCC-AQ1,0.234583,mean,53.344239,-6.271525,1,5,1,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59743,2024-10-27,DCC-AQ69,5.190000,min,53.345053,-6.254344,1,10,7,False,False,False,True,False,False,False,False,False,False
59744,2024-10-28,DCC-AQ69,7.060000,min,53.345053,-6.254344,1,10,1,False,False,False,True,False,False,False,False,False,False
59745,2024-10-29,DCC-AQ69,0.340000,min,53.345053,-6.254344,1,10,2,False,False,False,True,False,False,False,False,False,False
59746,2024-10-30,DCC-AQ69,-0.920000,min,53.345053,-6.254344,1,10,3,False,False,False,True,False,False,False,False,False,False


In [None]:
#making list of the substance indicator columns
# Select them by their 'Substance_' prefix rather than by position, so the list
# stays correct if the number of leading columns changes (e.g. an extra index
# column in the CSV).
substances = [col for col in pollution2.columns if col.startswith('Substance_')]
pollution2[substances] = pollution2[substances].astype(int)
pollution2

Unnamed: 0,Date,monitor,Value,Metric,latitude,longitude,Pollution_Level,month,day_of_week,Substance_co,Substance_no,Substance_no2,Substance_o3,Substance_pm1,Substance_pm10,Substance_pm2_5,Substance_pm4,Substance_so2,Substance_tsp
0,2024-05-09,DCC-AQ1,0.405000,mean,53.344239,-6.271525,1,5,4,1,0,0,0,0,0,0,0,0,0
1,2024-05-10,DCC-AQ1,0.273333,mean,53.344239,-6.271525,1,5,5,1,0,0,0,0,0,0,0,0,0
2,2024-05-11,DCC-AQ1,0.278750,mean,53.344239,-6.271525,1,5,6,1,0,0,0,0,0,0,0,0,0
3,2024-05-12,DCC-AQ1,0.295000,mean,53.344239,-6.271525,1,5,7,1,0,0,0,0,0,0,0,0,0
4,2024-05-13,DCC-AQ1,0.234583,mean,53.344239,-6.271525,1,5,1,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59743,2024-10-27,DCC-AQ69,5.190000,min,53.345053,-6.254344,1,10,7,0,0,0,1,0,0,0,0,0,0
59744,2024-10-28,DCC-AQ69,7.060000,min,53.345053,-6.254344,1,10,1,0,0,0,1,0,0,0,0,0,0
59745,2024-10-29,DCC-AQ69,0.340000,min,53.345053,-6.254344,1,10,2,0,0,0,1,0,0,0,0,0,0
59746,2024-10-30,DCC-AQ69,-0.920000,min,53.345053,-6.254344,1,10,3,0,0,0,1,0,0,0,0,0,0


In [None]:
#Model inputs: location, measurement and calendar fields, followed by the substance indicators
base_features = ['latitude', 'longitude', 'Value', 'month', 'day_of_week']
substances1 = [*base_features, *substances]
substances1

['latitude',
 'longitude',
 'Value',
 'month',
 'day_of_week',
 'Substance_co',
 'Substance_no',
 'Substance_no2',
 'Substance_o3',
 'Substance_pm1',
 'Substance_pm10',
 'Substance_pm2_5',
 'Substance_pm4',
 'Substance_so2',
 'Substance_tsp']

## The right encoding: 0, 1, 2

In [None]:
#Encoding the dependent variable as 0 for the cleanest level, 1 for acceptable and 2 for the poorest
# Read from the same relative path that `pollution.to_csv('pollution_with_days', ...)`
# writes to, so the cell does not depend on the working directory being /content.
pollution = pd.read_csv('pollution_with_days')
right_pollution = pollution.copy()
# Shift the labels 1/2/3 to 0/1/2 in a single mapping; all replacements are
# applied at once, so the result does not depend on the order of the entries.
right_pollution['Pollution_Level'] = right_pollution['Pollution_Level'].replace({1: 0, 2: 1, 3: 2})
right_pollution['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
2,39529
0,20309
1,2940


In [None]:
#making a list of substances
right_pollution = pd.get_dummies(right_pollution, columns=['Substance'], drop_first=False)
# Pick the indicator columns by their 'Substance_' prefix. A fixed positional
# slice silently drops or adds a column whenever the number of leading columns
# differs (for example, depending on whether the CSV carries an index column).
substances = [col for col in right_pollution.columns if col.startswith('Substance_')]
right_pollution[substances] = right_pollution[substances].astype(int)
right_pollution[substances]

Unnamed: 0,Substance_co,Substance_no,Substance_no2,Substance_o3,Substance_pm1,Substance_pm10,Substance_pm2,Substance_pm4,Substance_so2,Substance_tsp
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
62773,0,0,0,1,0,0,0,0,0,0
62774,0,0,0,1,0,0,0,0,0,0
62775,0,0,0,1,0,0,0,0,0,0
62776,0,0,0,1,0,0,0,0,0,0


In [None]:
# Column dtypes and non-null counts of the encoded table. (Indexing with ''
# raises KeyError because no column has an empty name.)
right_pollution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62778 entries, 0 to 62777
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       62778 non-null  int64  
 1   Date             62778 non-null  object 
 2   monitor          62778 non-null  int64  
 3   Value            62778 non-null  float64
 4   Metric           62778 non-null  object 
 5   latitude         62778 non-null  float64
 6   longitude        62778 non-null  float64
 7   Pollution_Level  62778 non-null  int64  
 8   month            62778 non-null  int64  
 9   day_of_week      62778 non-null  int64  
 10  Substance_co     62778 non-null  int64  
 11  Substance_no     62778 non-null  int64  
 12  Substance_no2    62778 non-null  int64  
 13  Substance_o3     62778 non-null  int64  
 14  Substance_pm1    62778 non-null  int64  
 15  Substance_pm10   62778 non-null  int64  
 16  Substance_pm2    62778 non-null  int64  
 17  Substance_pm

In [None]:
#Two modelling datasets, both limited to months before November:
# right_pollution1 keeps only the 'max' metric, right_pollution2 keeps every metric.
before_november = right_pollution['month'] < 11
is_max_metric = right_pollution['Metric'] == 'max'
right_pollution1 = right_pollution[before_november & is_max_metric]
right_pollution2 = right_pollution[before_november]

## With k-fold cross-validation

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest on the 'max'-metric dataset, tuned with a 10-fold grid search.
# NOTE(review): 'Pollution_Level' is computed from 'Value' and per-substance
# thresholds earlier in the notebook, and 'Value' is also a feature here -
# confirm that this is intended.
# NOTE(review): 'monitor' must be numeric for the forest; the preparation cells
# carry it as a label such as 'DCC-AQ1' - confirm where it is encoded.
feature_columns = ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']
X = right_pollution1[feature_columns]
y = right_pollution1['Pollution_Level']

# Hold out 20% of the rows for the final report.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[5, 10, 15],
)

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters found by GridSearchCV:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Score the best estimator (refit on the full training split) on the held-out rows.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best parameters found by GridSearchCV: {'max_depth': 10, 'n_estimators': 150}
Best cross-validation score: 0.7228548516439455
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.63      0.66      1018
           1       0.67      0.81      0.73       385
           2       0.78      0.78      0.78      1715

    accuracy                           0.74      3118
   macro avg       0.71      0.74      0.72      3118
weighted avg       0.74      0.74      0.73      3118



In [None]:
#just a regular cross-validation without the model parameters tuning for the first dataset
X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution1['Pollution_Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest so the fold scores are reproducible, in line with the seeded
# estimators used in the grid-search cells.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42
)

# 10-fold accuracy on the training split only; the test split stays untouched.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv = 10, scoring = 'accuracy')
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.73055333 0.72253408 0.71772253 0.71772253 0.70970329 0.74178027
 0.72734563 0.71371291 0.72734563 0.7329591 ]
Mean cross-validation score:  0.7241379310344828


In [None]:
#building a dataframe to see all the results together
result_dict = {}
result_dict['RF with 6 initial variables with the first dataset'] = {}
result_dict['RF with 6 initial variables with the first dataset']['Dataset'] = 'Dataset with max values'
result_dict['RF with 6 initial variables with the first dataset']['parameters'] = {'max_depth': 10, 'n_estimators': 150}
result_dict['RF with 6 initial variables with the first dataset']['Mean cross-validation score'] = np.mean(cv_scores)
result_dict


{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828}}

In [None]:
#just a regular cross-validation without the model parameters tuning for the second dataset
X = right_pollution2[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution2['Pollution_Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest so the fold scores are reproducible, in line with the seeded
# estimators used in the grid-search cells.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42
)

# 10-fold accuracy on the training split only; the test split stays untouched.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv = 10, scoring = 'accuracy')
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.73953975 0.72635983 0.73284519 0.73263598 0.72866109 0.73514644
 0.73682008 0.74079498 0.72400084 0.73760201]
Mean cross-validation score:  0.7334406193063973


In [None]:
# Record the score of the latest cross-validation run (all-metrics dataset).
result_dict['RF with 6 initial variables with the second dataset'] = {
    'Dataset': 'Dataset with all values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    'Mean cross-validation score': np.mean(cv_scores),
}
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973}}

In [None]:
#just a regular cross-validation for the initial variables plus the 'Substance_pm1' and 'Substance_pm10' indicators, without the model parameters tuning, for the first dataset
X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution1['Pollution_Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest so the fold scores are reproducible, in line with the seeded
# estimators used in the grid-search cells.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42
)

# 10-fold accuracy on the training split only; the test split stays untouched.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv = 10, scoring = 'accuracy')
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.97594226 0.97193264 0.97514034 0.97514034 0.96952686 0.97594226
 0.96391339 0.96952686 0.97113071 0.96872494]
Mean cross-validation score:  0.9716920609462709


In [None]:
# Record the score of the latest cross-validation run ('max'-metric dataset with substance indicators).
result_dict['RF with 6 initial variables and two most common substances with the first dataset'] = {
    'Dataset': 'Dataset with max values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    'Mean cross-validation score': np.mean(cv_scores),
}

In [None]:
#just a regular cross-validation for the initial variables plus the 'Substance_pm1' and 'Substance_pm10' indicators, without the model parameters tuning, for the second dataset
X = right_pollution2[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution2['Pollution_Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest so the fold scores are reproducible, in line with the seeded
# estimators used in the grid-search cells.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42
)

# 10-fold accuracy on the training split only; the test split stays untouched.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv = 10, scoring = 'accuracy')
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.84560669 0.84246862 0.8458159  0.84476987 0.83158996 0.841841
 0.83933054 0.84790795 0.83762293 0.83845993]
Mean cross-validation score:  0.8415413406456596


In [None]:
# Record the score of the latest cross-validation run (all-metrics dataset with substance indicators).
result_dict['RF with 6 initial variables and two most common substances with the second dataset'] = {
    'Dataset': 'Dataset with all values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    'Mean cross-validation score': np.mean(cv_scores),
}


### Gradient Boosting

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient boosting on the 'max'-metric dataset, tuned with a 10-fold grid search
# over the number of trees, the learning rate and the tree depth.
feature_columns = ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']
X = right_pollution1[feature_columns]
y = right_pollution1['Pollution_Level']

# Hold out 20% of the rows for the final report.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_model = GradientBoostingClassifier(random_state=42)
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

grid_search = GridSearchCV(
    estimator=gbc_model,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)
print("Best parameters found by GridSearchCV:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Score the best estimator (refit on the full training split) on the held-out rows.
best_gbc = grid_search.best_estimator_
y_pred = best_gbc.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Best parameters found by GridSearchCV: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best cross-validation score: 0.7340016038492381
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.66      0.67      1018
           1       0.70      0.79      0.74       385
           2       0.78      0.77      0.77      1715

    accuracy                           0.74      3118
   macro avg       0.72      0.74      0.73      3118
weighted avg       0.74      0.74      0.74      3118



In [None]:
# Plain 10-fold cross-validation (no tuning) of gradient boosting on the first dataset,
# using the hyperparameters that the grid search reported as best.
from sklearn.model_selection import cross_val_score
import numpy as np

X = right_pollution1.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution1.loc[:, 'Pollution_Level']

# 80/20 split; only the training part is cross-validated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gbc_model = GradientBoostingClassifier(
    learning_rate=0.1, max_depth=5, n_estimators=100, random_state=42
)

cv_scores = cross_val_score(estimator=gbc_model, X=X_train, y=y_train, scoring='accuracy', cv=10)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.7393745  0.7457899  0.71852446 0.73857257 0.7297514  0.73777065
 0.74178027 0.71772253 0.7297514  0.74097835]
Mean cross-validation score:  0.7340016038492381


In [None]:
# Record the mean CV accuracy of gradient boosting (six initial variables) on the
# first dataset, together with the settings used, then show the table so far.
result_dict['GB with 6 initial variables with the first dataset'] = {
    'Dataset': 'Dataset with max values',
    'parameters': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
    'Mean cross-validation score': np.mean(cv_scores),
}
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973},
 'RF with 6 initial variables and two most common substances with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.9716920609462709},
 'RF with 6 initial variables and two most common substances with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.8415413406456596},
 'GB with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'learning

In [None]:
# Plain 10-fold cross-validation (no tuning) of gradient boosting on the second dataset
# with the six initial variables.
X = right_pollution2.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution2.loc[:, 'Pollution_Level']

# 80/20 split; only the training part is cross-validated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gbc_model = GradientBoostingClassifier(
    learning_rate=0.1, max_depth=5, n_estimators=100, random_state=42
)

cv_scores = cross_val_score(estimator=gbc_model, X=X_train, y=y_train, scoring='accuracy', cv=10)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.74205021 0.73682008 0.73368201 0.74246862 0.73640167 0.73786611
 0.73661088 0.75       0.72588408 0.73174304]
Mean cross-validation score:  0.7373526700234025


In [None]:
# Record the mean CV accuracy of gradient boosting (six initial variables) on the
# second dataset, together with the settings used, then show the table so far.
result_dict['GB with 6 initial variables with the second dataset'] = {
    'Dataset': 'Dataset with all values',
    'parameters': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
    'Mean cross-validation score': np.mean(cv_scores),
}
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973},
 'RF with 6 initial variables and two most common substances with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.9716920609462709},
 'RF with 6 initial variables and two most common substances with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.8415413406456596},
 'GB with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'learning

In [None]:
# Plain 10-fold cross-validation (no tuning) of gradient boosting on the first dataset,
# using the six initial variables plus the two most common substances (pm1, pm10).
X = right_pollution1.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution1.loc[:, 'Pollution_Level']

# 80/20 split; only the training part is cross-validated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gbc_model = GradientBoostingClassifier(
    learning_rate=0.1, max_depth=5, n_estimators=100, random_state=42
)

cv_scores = cross_val_score(estimator=gbc_model, X=X_train, y=y_train, scoring='accuracy', cv=10)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.97594226 0.97594226 0.97433841 0.97514034 0.97754611 0.98396151
 0.96631917 0.97594226 0.97193264 0.96792302]
Mean cross-validation score:  0.9744987971130715


In [None]:
# Record the mean CV accuracy of gradient boosting (six initial variables plus the two
# most common substances) on the first dataset, then show the table so far.
result_dict['GB with 6 initial variables and two most common substances with the first dataset'] = {
    'Dataset': 'Dataset with max values',
    'parameters': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
    'Mean cross-validation score': np.mean(cv_scores),
}
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973},
 'RF with 6 initial variables and two most common substances with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.9716920609462709},
 'RF with 6 initial variables and two most common substances with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.8415413406456596},
 'GB with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'learning

In [None]:
# Plain 10-fold cross-validation (no tuning) of gradient boosting on the second dataset,
# using the six initial variables plus the two most common substances (pm1, pm10).
X = right_pollution2.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution2.loc[:, 'Pollution_Level']

# 80/20 split; only the training part is cross-validated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gbc_model = GradientBoostingClassifier(
    learning_rate=0.1, max_depth=5, n_estimators=100, random_state=42
)

cv_scores = cross_val_score(estimator=gbc_model, X=X_train, y=y_train, scoring='accuracy', cv=10)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Cross-validation scores:  [0.85083682 0.84937238 0.85       0.85104603 0.83849372 0.84958159
 0.84497908 0.85125523 0.83825068 0.84097091]
Mean cross-validation score:  0.8464786448032318


In [None]:
# Record the mean CV accuracy of gradient boosting (six initial variables plus the two
# most common substances) on the second dataset, then show the table so far.
result_dict['GB with 6 initial variables and two most common substances with the second dataset'] = {
    'Dataset': 'Dataset with all values',
    'parameters': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
    'Mean cross-validation score': np.mean(cv_scores),
}
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973},
 'RF with 6 initial variables and two most common substances with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.9716920609462709},
 'RF with 6 initial variables and two most common substances with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.8415413406456596},
 'GB with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'learning

###Results

In [None]:
# Display every cross-validation result collected so far (one entry per model / feature set / dataset).
result_dict

{'RF with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7241379310344828},
 'RF with 6 initial variables with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.7334406193063973},
 'RF with 6 initial variables and two most common substances with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.9716920609462709},
 'RF with 6 initial variables and two most common substances with the second dataset': {'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Mean cross-validation score': 0.8415413406456596},
 'GB with 6 initial variables with the first dataset': {'Dataset': 'Dataset with max values',
  'parameters': {'learning

In [None]:
# Turn the nested results dict into a table: one column per experiment,
# one row per recorded field ('Dataset', 'parameters', 'Mean cross-validation score').
result = pd.DataFrame(result_dict)
result

Unnamed: 0,RF with 6 initial variables with the first dataset,RF with 6 initial variables with the second dataset,RF with 6 initial variables and two most common substances with the first dataset,RF with 6 initial variables and two most common substances with the second dataset,GB with 6 initial variables with the first dataset,GB with 6 initial variables with the second dataset,GB with 6 initial variables and two most common substances with the first dataset,GB with 6 initial variables and two most common substances with the second dataset
Dataset,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values
parameters,"{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
Mean cross-validation score,0.724138,0.733441,0.971692,0.841541,0.734002,0.737353,0.974499,0.846479


In [None]:
# Reshape to one row per experiment and split the experiment label into the model
# abbreviation (text before the first space) and the remaining description.
results_cv = (
    result.T
    .reset_index()
    .assign(**{
        'Model': lambda frame: frame['index'].map(lambda label: label.partition(' ')[0]),
        'Dataset Description': lambda frame: frame['index'].map(lambda label: label.partition(' ')[2]),
    })
    .drop(columns='index')
)
results_cv

Unnamed: 0,Dataset,parameters,Mean cross-validation score,Model,Dataset Description
0,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.724138,RF,with 6 initial variables with the first dataset
1,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.733441,RF,with 6 initial variables with the second dataset
2,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.971692,RF,with 6 initial variables and two most common s...
3,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.841541,RF,with 6 initial variables and two most common s...
4,Dataset with max values,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.734002,GB,with 6 initial variables with the first dataset
5,Dataset with all values,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.737353,GB,with 6 initial variables with the second dataset
6,Dataset with max values,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.974499,GB,with 6 initial variables and two most common s...
7,Dataset with all values,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.846479,GB,with 6 initial variables and two most common s...


In [None]:
# CV accuracy per model for the experiments run on the "max values" dataset.
results_cv.loc[results_cv['Dataset'] == 'Dataset with max values', ['Model', 'Mean cross-validation score']]

Unnamed: 0,Model,Mean cross-validation score
0,RF,0.724138
2,RF,0.971692
4,GB,0.734002
6,GB,0.974499


In [None]:
# CV accuracy per model for the experiments run on the "all values" dataset.
results_cv.loc[results_cv['Dataset'] == 'Dataset with all values', ['Model', 'Mean cross-validation score']]

Unnamed: 0,Model,Mean cross-validation score
1,RF,0.733441
3,RF,0.841541
5,GB,0.737353
7,GB,0.846479


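Gradient boosting is slightly better than random forest in every configuration: its mean cross-validation accuracy is higher by roughly 0.003–0.010 on both datasets, with and without the two substance features.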

##Test models

###Random Forest

In [None]:
# Hold-out test of the random forest on the first dataset (right_pollution1)
# using the six initial features.
X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution1['Pollution_Level']

# Same 80/20 split (random_state=42) as used in the cross-validation cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,  # without a seed the forest gives different test scores on every run
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Text report for reading; dict form is kept in `class_report` for the results table.
print(classification_report(y_test, y_pred))
class_report = classification_report(y_test, y_pred, output_dict=True)

              precision    recall  f1-score   support

           0       0.70      0.62      0.66      1018
           1       0.68      0.81      0.74       385
           2       0.77      0.78      0.78      1715

    accuracy                           0.73      3118
   macro avg       0.71      0.74      0.72      3118
weighted avg       0.73      0.73      0.73      3118



In [None]:
# Display the dict form of the classification report produced by the previous model cell.
class_report

{'0': {'precision': 0.6287657920310982,
  'recall': 0.6355599214145383,
  'f1-score': 0.6321446018563752,
  'support': 1018.0},
 '1': {'precision': 0.6690997566909975,
  'recall': 0.7142857142857143,
  'f1-score': 0.6909547738693468,
  'support': 385.0},
 '2': {'precision': 0.7508939213349225,
  'recall': 0.7346938775510204,
  'f1-score': 0.7427055702917772,
  'support': 1715.0},
 'accuracy': 0.699807568954458,
 'macro avg': {'precision': 0.6829198233523394,
  'recall': 0.694846504417091,
  'f1-score': 0.6886016486724998,
  'support': 3118.0},
 'weighted avg': {'precision': 0.7009204803409506,
  'recall': 0.699807568954458,
  'f1-score': 0.7002183597433888,
  'support': 3118.0}}

In [None]:
# Start the table of hold-out test results and record the random forest evaluated on the
# first dataset with the six initial variables. Values are read from `class_report`
# (the output_dict=True classification report).
test_result_dict = {}
test_result_dict['RF with 6 initial variables with the first dataset'] = {
    'Weighted Precision': class_report['weighted avg']['precision'],
    'Weighted Recall': class_report['weighted avg']['recall'],
    'Weighted F1': class_report['weighted avg']['f1-score'],
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with max values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    # Per-class scores are stored as one "class0/class1/class2" string per metric.
    'Classes_Precision': '/'.join(str(class_report[label]['precision']) for label in ('0', '1', '2')),
    'Classes_Recall': '/'.join(str(class_report[label]['recall']) for label in ('0', '1', '2')),
    'Classes_F1': '/'.join(str(class_report[label]['f1-score']) for label in ('0', '1', '2')),
}

print(test_result_dict['RF with 6 initial variables with the first dataset'])



{'Weighted Precision': 0.7345999765558708, 'Weighted Recall': 0.7347658755612572, 'Weighted F1': 0.7332423010906775, 'Accuracy': 0.7347658755612572, 'Dataset': 'Dataset with max values', 'parameters': {'max_depth': 10, 'n_estimators': 150}, 'Classes_Precision': '0.6955093099671413/0.6760259179265659/0.7709529276693455', 'Classes_Recall': '0.6237721021611002/0.812987012987013/0.7830903790087463', 'Classes_F1': '0.6576903158984981/0.7382075471698113/0.7769742551345097'}


In [None]:
# Hold-out test of the random forest on the second dataset (right_pollution2)
# using the six initial features.
X = right_pollution2[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution2['Pollution_Level']

# Same 80/20 split (random_state=42) as used in the cross-validation cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,  # without a seed the forest gives different test scores on every run
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Text report for reading; dict form is kept in `class_report` for the results table.
print(classification_report(y_test, y_pred))
class_report = classification_report(y_test, y_pred, output_dict=True)

              precision    recall  f1-score   support

           0       0.79      0.31      0.44      3907
           1       0.71      0.65      0.68       564
           2       0.72      0.95      0.82      7479

    accuracy                           0.73     11950
   macro avg       0.74      0.64      0.65     11950
weighted avg       0.74      0.73      0.69     11950



In [None]:
# Display the dict form of the classification report produced by the previous model cell.
class_report

{'0': {'precision': 0.7789948453608248,
  'recall': 0.30944458663936525,
  'f1-score': 0.4429382670818831,
  'support': 3907.0},
 '1': {'precision': 0.73,
  'recall': 0.6471631205673759,
  'f1-score': 0.6860902255639098,
  'support': 564.0},
 '2': {'precision': 0.7184279652455041,
  'recall': 0.9507955609038642,
  'f1-score': 0.8184381653910341,
  'support': 7479.0},
 'accuracy': 0.7267782426778243,
 'macro avg': {'precision': 0.7424742702021097,
  'recall': 0.6358010893702017,
  'f1-score': 0.6491555526789424,
  'support': 11950.0},
 'weighted avg': {'precision': 0.7387762019159722,
  'recall': 0.7267782426778243,
  'f1-score': 0.6894237435704189,
  'support': 11950.0}}

In [None]:
# Record the hold-out test results of the random forest evaluated on the second dataset
# with the six initial variables. Values are read from `class_report`
# (the output_dict=True classification report).
test_result_dict['RF with 6 initial variables with the second dataset'] = {
    'Weighted Precision': class_report['weighted avg']['precision'],
    'Weighted Recall': class_report['weighted avg']['recall'],
    'Weighted F1': class_report['weighted avg']['f1-score'],
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with all values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    # Per-class scores are stored as one "class0/class1/class2" string per metric.
    'Classes_Precision': '/'.join(str(class_report[label]['precision']) for label in ('0', '1', '2')),
    'Classes_Recall': '/'.join(str(class_report[label]['recall']) for label in ('0', '1', '2')),
    'Classes_F1': '/'.join(str(class_report[label]['f1-score']) for label in ('0', '1', '2')),
}

print(test_result_dict['RF with 6 initial variables with the second dataset'])

{'Weighted Precision': 0.7410404031288794, 'Weighted Recall': 0.7270292887029288, 'Weighted F1': 0.6892300143415494, 'Accuracy': 0.7270292887029288, 'Dataset': 'Dataset with all values', 'parameters': {'max_depth': 10, 'n_estimators': 150}, 'Classes_Precision': '0.7887139107611548/0.7120622568093385/0.718321226795803', 'Classes_Recall': '0.30765293063731763/0.648936170212766/0.9519989303382805', 'Classes_F1': '0.4426440802798748/0.6790352504638218/0.8188143292507619'}


In [None]:
# Hold-out test of the random forest on the first dataset (right_pollution1) using the
# six initial features plus the two most common substances (pm1, pm10).

X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution1['Pollution_Level']

# Same 80/20 split (random_state=42) as used in the cross-validation cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test the same configuration that was cross-validated and that is recorded in the results
# tables for this experiment (max_depth=10, n_estimators=150), not sklearn's defaults.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Text report for reading; dict form is kept in `class_report` for the results table.
print(classification_report(y_test, y_pred))
class_report = classification_report(y_test, y_pred, output_dict=True)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1018
           1       0.88      0.87      0.88       385
           2       1.00      1.00      1.00      1715

    accuracy                           0.97      3118
   macro avg       0.94      0.94      0.94      3118
weighted avg       0.97      0.97      0.97      3118



In [None]:
# Record the hold-out test results of the random forest evaluated on the first dataset with
# the six initial variables plus the two most common substances. Values are read from
# `class_report` (the output_dict=True classification report).
test_result_dict['RF with 6 initial variables and two most common substances with the first dataset'] = {
    'Weighted Precision': class_report['weighted avg']['precision'],
    'Weighted Recall': class_report['weighted avg']['recall'],
    'Accuracy': class_report['accuracy'],
    'Weighted F1': class_report['weighted avg']['f1-score'],
    'Dataset': 'Dataset with max values',
    'parameters': {'max_depth': 10, 'n_estimators': 150},
    # Per-class scores are stored as one "class0/class1/class2" string per metric.
    'Classes_Precision': '/'.join(str(class_report[label]['precision']) for label in ('0', '1', '2')),
    'Classes_Recall': '/'.join(str(class_report[label]['recall']) for label in ('0', '1', '2')),
    'Classes_F1': '/'.join(str(class_report[label]['f1-score']) for label in ('0', '1', '2')),
}

print(test_result_dict['RF with 6 initial variables and two most common substances with the first dataset'])

{'Weighted Precision': 0.9692048688382753, 'Weighted Recall': 0.9692110327132777, 'Accuracy': 0.9692110327132777, 'Weighted F1': 0.9692049234394945, 'Dataset': 'Dataset with max values', 'parameters': {'max_depth': 10, 'n_estimators': 150}, 'Classes_Precision': '0.9540566959921799/0.8772845953002611/0.9988317757009346', 'Classes_Recall': '0.9587426326129665/0.8727272727272727/0.9970845481049563', 'Classes_F1': '0.9563939245467908/0.875/0.997957397140356'}


In [None]:
# Random forest on the second dataset (right_pollution2): the six base features
# plus the Substance_pm1 and Substance_pm10 columns. The target is Pollution_Level.
y = right_pollution2['Pollution_Level']
X = right_pollution2[[
    'latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor',
    'Substance_pm1', 'Substance_pm10',
]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Dict form of the report is read by the next cell; the text form is printed for reading.
class_report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.71      0.70      3907
           1       0.81      0.76      0.78       564
           2       0.85      0.85      0.85      7479

    accuracy                           0.80     11950
   macro avg       0.78      0.77      0.78     11950
weighted avg       0.80      0.80      0.80     11950



In [None]:
# Record the metrics of the random-forest run fitted in the previous cell
# (second dataset, six base features plus the pm1/pm10 substance columns).
# Reads `class_report` and `rf_model` from that cell and writes one entry
# into `test_result_dict`.
run_name = 'RF with 6 initial variables and two most common substances with the second dataset'
class_labels = ('0', '1', '2')
# Short metric name used in the result keys -> field name used by classification_report.
report_fields = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1-score'}

test_result_dict[run_name] = {}
run_metrics = test_result_dict[run_name]

# Stage the per-class scores under temporary "<Metric>_<label>" keys.
run_metrics.update({
    f'{metric}_{label}': class_report[label][field]
    for label in class_labels
    for metric, field in report_fields.items()
})
run_metrics.update({
    f'Weighted {metric}': class_report['weighted avg'][field]
    for metric, field in report_fields.items()
})
run_metrics.update({
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with all values',
    # Take the hyper-parameters from the fitted estimator itself. The values that
    # used to be hard-coded here (max_depth=10, n_estimators=150) did not match
    # the model that produced `class_report`, which was built with
    # n_estimators=100 and the default max_depth.
    'parameters': {'max_depth': rf_model.max_depth, 'n_estimators': rf_model.n_estimators},
})

# Collapse the three per-class values of each metric into one "a/b/c" string.
run_metrics.update({
    f'Classes_{metric}': '/'.join(str(run_metrics[f'{metric}_{label}']) for label in class_labels)
    for metric in report_fields
})

# Drop the staged per-class keys so only the summary fields remain.
keys_to_remove = [f'{metric}_{label}' for label in class_labels for metric in report_fields]
test_result_dict[run_name] = {
    key: value for key, value in run_metrics.items()
    if key not in keys_to_remove
}

print(test_result_dict[run_name])

{'Weighted Precision': 0.7987884740492898, 'Weighted Recall': 0.7976569037656904, 'Weighted F1': 0.7981457432345849, 'Accuracy': 0.7976569037656904, 'Dataset': 'Dataset with all values', 'parameters': {'max_depth': 10, 'n_estimators': 150}, 'Classes_Precision': '0.695478391206595/0.8060263653483992/0.8522114347357066', 'Classes_Recall': '0.7125671871000768/0.7588652482269503/0.8450327583901591', 'Classes_F1': '0.7039190897597978/0.7817351598173516/0.8486069150721719'}


#### Checking

In [None]:
# Check run: random forest on the first dataset (right_pollution1) using the six
# base features plus whatever `substances` holds (it is defined outside this cell).
# The target is Pollution_Level.
substances1 = (
    ['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']
    + substances
)
y = right_pollution1['Pollution_Level']
X = right_pollution1[substances1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1018
           1       0.99      0.99      0.99       385
           2       1.00      1.00      1.00      1715

    accuracy                           1.00      3118
   macro avg       1.00      1.00      1.00      3118
weighted avg       1.00      1.00      1.00      3118



### Gradient boosting

In [None]:
# Gradient boosting on the first dataset (right_pollution1) with the six base
# features. The target is Pollution_Level.
X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution1['Pollution_Level']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_model = GradientBoostingClassifier(
    n_estimators=100,   # number of boosting stages
    learning_rate=0.1,  # shrinkage applied to each stage's contribution
    max_depth=5,        # depth limit of each individual tree
    random_state=42)

gbc_model.fit(X_train, y_train)
y_pred = gbc_model.predict(X_test)

# Concatenate instead of passing two arguments to print(): the default separator
# would put a space in front of the report and push its header row out of line
# with the columns below it.
print("Classification Report:\n" + classification_report(y_test, y_pred))
# Dict form of the same report, read by the next cell when it records this run.
class_report = classification_report(y_test, y_pred, output_dict=True)

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.66      0.67      1018
           1       0.70      0.79      0.74       385
           2       0.78      0.77      0.77      1715

    accuracy                           0.74      3118
   macro avg       0.72      0.74      0.73      3118
weighted avg       0.74      0.74      0.74      3118



In [None]:
# Record the metrics of the gradient-boosting run from the previous cell
# (first dataset, six base features). Reads `class_report` and writes one
# entry into `test_result_dict`.
run_name = 'GB with 6 initial variables with the first dataset'
class_labels = ('0', '1', '2')
# Short metric name used in the result keys -> field name used by classification_report.
report_fields = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1-score'}

test_result_dict[run_name] = {}
run_metrics = test_result_dict[run_name]

# Stage the per-class scores under temporary "<Metric>_<label>" keys.
run_metrics.update({
    f'{metric}_{label}': class_report[label][field]
    for label in class_labels
    for metric, field in report_fields.items()
})
run_metrics.update({
    f'Weighted {metric}': class_report['weighted avg'][field]
    for metric, field in report_fields.items()
})
run_metrics.update({
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with max values',
    'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100},
})

# Collapse the three per-class values of each metric into one "a/b/c" string.
run_metrics.update({
    f'Classes_{metric}': '/'.join(str(run_metrics[f'{metric}_{label}']) for label in class_labels)
    for metric in report_fields
})

# Drop the staged per-class keys so only the summary fields remain.
keys_to_remove = [f'{metric}_{label}' for label in class_labels for metric in report_fields]
test_result_dict[run_name] = {
    key: value for key, value in run_metrics.items()
    if key not in keys_to_remove
}

print(test_result_dict[run_name])

{'Weighted Precision': 0.7362888643070007, 'Weighted Recall': 0.736048749198204, 'Weighted F1': 0.7357543938635792, 'Accuracy': 0.736048749198204, 'Dataset': 'Dataset with max values', 'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100}, 'Classes_Precision': '0.6849593495934959/0.6988505747126437/0.7751618599175986', 'Classes_Recall': '0.6620825147347741/0.7896103896103897/0.7679300291545189', 'Classes_F1': '0.6733266733266733/0.7414634146341463/0.7715289982425307'}


In [None]:
# Gradient boosting on the second dataset (right_pollution2) with the six base
# features. The target is Pollution_Level.
X = right_pollution2[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor']]
y = right_pollution2['Pollution_Level']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_model = GradientBoostingClassifier(
    n_estimators=100,   # number of boosting stages
    learning_rate=0.1,  # shrinkage applied to each stage's contribution
    max_depth=5,        # depth limit of each individual tree
    random_state=42)

gbc_model.fit(X_train, y_train)
y_pred = gbc_model.predict(X_test)

# Concatenate instead of passing two arguments to print(): the default separator
# would put a space in front of the report and push its header row out of line
# with the columns below it.
print("Classification Report:\n" + classification_report(y_test, y_pred))
# Dict form of the same report, read by the next cell when it records this run.
class_report = classification_report(y_test, y_pred, output_dict=True)

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.38      0.50      3907
           1       0.71      0.70      0.71       564
           2       0.73      0.92      0.82      7479

    accuracy                           0.73     11950
   macro avg       0.73      0.67      0.67     11950
weighted avg       0.73      0.73      0.71     11950



In [None]:
# Record the metrics of the gradient-boosting run from the previous cell
# (second dataset, six base features). Reads `class_report` and writes one
# entry into `test_result_dict`.
run_name = 'GB with 6 initial variables with the second dataset'
class_labels = ('0', '1', '2')
# Short metric name used in the result keys -> field name used by classification_report.
report_fields = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1-score'}

test_result_dict[run_name] = {}
run_metrics = test_result_dict[run_name]

# Stage the per-class scores under temporary "<Metric>_<label>" keys.
run_metrics.update({
    f'{metric}_{label}': class_report[label][field]
    for label in class_labels
    for metric, field in report_fields.items()
})
run_metrics.update({
    f'Weighted {metric}': class_report['weighted avg'][field]
    for metric, field in report_fields.items()
})
run_metrics.update({
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with all values',
    'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100},
})

# Collapse the three per-class values of each metric into one "a/b/c" string.
run_metrics.update({
    f'Classes_{metric}': '/'.join(str(run_metrics[f'{metric}_{label}']) for label in class_labels)
    for metric in report_fields
})

# Drop the staged per-class keys so only the summary fields remain.
keys_to_remove = [f'{metric}_{label}' for label in class_labels for metric in report_fields]
test_result_dict[run_name] = {
    key: value for key, value in run_metrics.items()
    if key not in keys_to_remove
}

print(test_result_dict[run_name])

{'Weighted Precision': 0.732854730868841, 'Weighted Recall': 0.7333054393305439, 'Weighted F1': 0.7083337324246852, 'Accuracy': 0.7333054393305439, 'Dataset': 'Dataset with all values', 'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100}, 'Classes_Precision': '0.7321164282190429/0.7109515260323159/0.7348921631432842', 'Classes_Recall': '0.37983107243409264/0.7021276595744681/0.9203102018986495', 'Classes_F1': '0.5001685203909673/0.7065120428189117/0.8172157910359157'}


In [None]:
# Gradient boosting on the first dataset (right_pollution1): the six base features
# plus the Substance_pm1 and Substance_pm10 columns. The target is Pollution_Level.
X = right_pollution1[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution1['Pollution_Level']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_model = GradientBoostingClassifier(
    n_estimators=100,   # number of boosting stages
    learning_rate=0.1,  # shrinkage applied to each stage's contribution
    max_depth=5,        # depth limit of each individual tree
    random_state=42)

gbc_model.fit(X_train, y_train)
y_pred = gbc_model.predict(X_test)

# Concatenate instead of passing two arguments to print(): the default separator
# would put a space in front of the report and push its header row out of line
# with the columns below it.
print("Classification Report:\n" + classification_report(y_test, y_pred))
# Dict form of the same report, read by the next cell when it records this run.
class_report = classification_report(y_test, y_pred, output_dict=True)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      1018
           1       0.89      0.90      0.90       385
           2       1.00      1.00      1.00      1715

    accuracy                           0.97      3118
   macro avg       0.95      0.95      0.95      3118
weighted avg       0.97      0.97      0.97      3118



In [None]:
# Record the metrics of the gradient-boosting run from the previous cell
# (first dataset, six base features plus the pm1/pm10 substance columns).
# Reads `class_report` and writes one entry into `test_result_dict`.
run_name = 'GB with 6 initial variables and two most common substances with the first dataset'
class_labels = ('0', '1', '2')
# Short metric name used in the result keys -> field name used by classification_report.
report_fields = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1-score'}

test_result_dict[run_name] = {}
run_metrics = test_result_dict[run_name]

# Stage the per-class scores under temporary "<Metric>_<label>" keys.
run_metrics.update({
    f'{metric}_{label}': class_report[label][field]
    for label in class_labels
    for metric, field in report_fields.items()
})
run_metrics.update({
    f'Weighted {metric}': class_report['weighted avg'][field]
    for metric, field in report_fields.items()
})
run_metrics.update({
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with max values',
    'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100},
})

# Collapse the three per-class values of each metric into one "a/b/c" string.
run_metrics.update({
    f'Classes_{metric}': '/'.join(str(run_metrics[f'{metric}_{label}']) for label in class_labels)
    for metric in report_fields
})

# Drop the staged per-class keys so only the summary fields remain.
keys_to_remove = [f'{metric}_{label}' for label in class_labels for metric in report_fields]
test_result_dict[run_name] = {
    key: value for key, value in run_metrics.items()
    if key not in keys_to_remove
}

print(test_result_dict[run_name])

{'Weighted Precision': 0.9741135425494344, 'Weighted Recall': 0.9740218088518281, 'Weighted F1': 0.9740657479181349, 'Accuracy': 0.9740218088518281, 'Dataset': 'Dataset with max values', 'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100}, 'Classes_Precision': '0.9636184857423795/0.8917525773195877/0.9988324576765908', 'Classes_Recall': '0.962671905697446/0.8987012987012987/0.997667638483965', 'Classes_F1': '0.9631449631449631/0.8952134540750324/0.9982497082847142'}


In [None]:
# Gradient boosting on the second dataset (right_pollution2): the six base features
# plus the Substance_pm1 and Substance_pm10 columns. The target is Pollution_Level.
X = right_pollution2[['latitude', 'longitude', 'Value', 'month', 'day_of_week', 'monitor', 'Substance_pm1', 'Substance_pm10']]
y = right_pollution2['Pollution_Level']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbc_model = GradientBoostingClassifier(
    n_estimators=100,   # number of boosting stages
    learning_rate=0.1,  # shrinkage applied to each stage's contribution
    max_depth=5,        # depth limit of each individual tree
    random_state=42)

gbc_model.fit(X_train, y_train)
y_pred = gbc_model.predict(X_test)

# Concatenate instead of passing two arguments to print(): the default separator
# would put a space in front of the report and push its header row out of line
# with the columns below it.
print("Classification Report:\n" + classification_report(y_test, y_pred))
# Dict form of the same report, read by the next cell when it records this run.
class_report = classification_report(y_test, y_pred, output_dict=True)

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.83      0.78      3907
           1       0.82      0.81      0.82       564
           2       0.91      0.84      0.88      7479

    accuracy                           0.84     11950
   macro avg       0.82      0.83      0.82     11950
weighted avg       0.85      0.84      0.84     11950



In [None]:
# Record the metrics of the gradient-boosting run from the previous cell
# (second dataset, six base features plus the pm1/pm10 substance columns).
# Reads `class_report` and writes one entry into `test_result_dict`.
run_name = 'GB with 6 initial variables and two most common substances with the second dataset'
class_labels = ('0', '1', '2')
# Short metric name used in the result keys -> field name used by classification_report.
report_fields = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1-score'}

test_result_dict[run_name] = {}
run_metrics = test_result_dict[run_name]

# Stage the per-class scores under temporary "<Metric>_<label>" keys.
run_metrics.update({
    f'{metric}_{label}': class_report[label][field]
    for label in class_labels
    for metric, field in report_fields.items()
})
run_metrics.update({
    f'Weighted {metric}': class_report['weighted avg'][field]
    for metric, field in report_fields.items()
})
run_metrics.update({
    'Accuracy': class_report['accuracy'],
    'Dataset': 'Dataset with all values',
    'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100},
})

# Collapse the three per-class values of each metric into one "a/b/c" string.
run_metrics.update({
    f'Classes_{metric}': '/'.join(str(run_metrics[f'{metric}_{label}']) for label in class_labels)
    for metric in report_fields
})

# Drop the staged per-class keys so only the summary fields remain.
keys_to_remove = [f'{metric}_{label}' for label in class_labels for metric in report_fields]
test_result_dict[run_name] = {
    key: value for key, value in run_metrics.items()
    if key not in keys_to_remove
}

print(test_result_dict[run_name])

{'Weighted Precision': 0.8475758390191959, 'Weighted Recall': 0.8385774058577405, 'Weighted F1': 0.8410076090872144, 'Accuracy': 0.8385774058577405, 'Dataset': 'Dataset with all values', 'parameters': {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100}, 'Classes_Precision': '0.7256913470115968/0.822262118491921/0.9131567520625271', 'Classes_Recall': '0.8328640900947019/0.8120567375886525/0.8435619735258725', 'Classes_F1': '0.7755928971517102/0.8171275646743978/0.8769808173477899'}


### Results

In [None]:
# Show the accumulated test metrics; one entry per run description.
test_result_dict

{'RF with 6 initial variables with the first dataset': {'Weighted Precision': 0.7345999765558708,
  'Weighted Recall': 0.7347658755612572,
  'Weighted F1': 0.7332423010906775,
  'Accuracy': 0.7347658755612572,
  'Dataset': 'Dataset with max values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Classes_Precision': '0.6955093099671413/0.6760259179265659/0.7709529276693455',
  'Classes_Recall': '0.6237721021611002/0.812987012987013/0.7830903790087463',
  'Classes_F1': '0.6576903158984981/0.7382075471698113/0.7769742551345097'},
 'RF with 6 initial variables with the second dataset': {'Weighted Precision': 0.7410404031288794,
  'Weighted Recall': 0.7270292887029288,
  'Weighted F1': 0.6892300143415494,
  'Accuracy': 0.7270292887029288,
  'Dataset': 'Dataset with all values',
  'parameters': {'max_depth': 10, 'n_estimators': 150},
  'Classes_Precision': '0.7887139107611548/0.7120622568093385/0.718321226795803',
  'Classes_Recall': '0.30765293063731763/0.648936170212766/0.95199

In [None]:
# Turn the nested results dict into a table: each run description becomes a
# column and each metric name becomes a row ('columns' is the default orient).
test_result = pd.DataFrame.from_dict(test_result_dict, orient='columns')
test_result

Unnamed: 0,RF with 6 initial variables with the first dataset,RF with 6 initial variables with the second dataset,RF with 6 initial variables and two most common substances with the first dataset,RF with 6 initial variables and two most common substances with the second dataset,GB with 6 initial variables with the first dataset,GB with 6 initial variables with the second dataset,GB with 6 initial variables and two most common substances with the first dataset,GB with 6 initial variables and two most common substances with the second dataset
Weighted Precision,0.7346,0.74104,0.969205,0.798788,0.736289,0.732855,0.974114,0.847576
Weighted Recall,0.734766,0.727029,0.969211,0.797657,0.736049,0.733305,0.974022,0.838577
Weighted F1,0.733242,0.68923,0.969205,0.798146,0.735754,0.708334,0.974066,0.841008
Accuracy,0.734766,0.727029,0.969211,0.797657,0.736049,0.733305,0.974022,0.838577
Dataset,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values,Dataset with max values,Dataset with all values
parameters,"{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 10, 'n_estimators': 150}","{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...","{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...","{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...","{'max_depth': 5, 'learning_rate': 0.1, 'n_esti..."
Classes_Precision,0.6955093099671413/0.6760259179265659/0.770952...,0.7887139107611548/0.7120622568093385/0.718321...,0.9540566959921799/0.8772845953002611/0.998831...,0.695478391206595/0.8060263653483992/0.8522114...,0.6849593495934959/0.6988505747126437/0.775161...,0.7321164282190429/0.7109515260323159/0.734892...,0.9636184857423795/0.8917525773195877/0.998832...,0.7256913470115968/0.822262118491921/0.9131567...
Classes_Recall,0.6237721021611002/0.812987012987013/0.7830903...,0.30765293063731763/0.648936170212766/0.951998...,0.9587426326129665/0.8727272727272727/0.997084...,0.7125671871000768/0.7588652482269503/0.845032...,0.6620825147347741/0.7896103896103897/0.767930...,0.37983107243409264/0.7021276595744681/0.92031...,0.962671905697446/0.8987012987012987/0.9976676...,0.8328640900947019/0.8120567375886525/0.843561...
Classes_F1,0.6576903158984981/0.7382075471698113/0.776974...,0.4426440802798748/0.6790352504638218/0.818814...,0.9563939245467908/0.875/0.997957397140356,0.7039190897597978/0.7817351598173516/0.848606...,0.6733266733266733/0.7414634146341463/0.771528...,0.5001685203909673/0.7065120428189117/0.817215...,0.9631449631449631/0.8952134540750324/0.998249...,0.7755928971517102/0.8171275646743978/0.876980...


In [None]:
# One row per run: transpose the metrics table, then split the run description
# (held in the former index) into the model abbreviation (first word) and the
# remaining dataset/feature description.
results_test = (
    test_result.T
    .reset_index()
    .assign(**{
        'Model': lambda frame: frame['index'].map(lambda label: label.partition(' ')[0]),
        'Dataset Description': lambda frame: frame['index'].map(lambda label: label.partition(' ')[2]),
    })
    .drop(columns='index')
)
results_test

Unnamed: 0,Weighted Precision,Weighted Recall,Weighted F1,Accuracy,Dataset,parameters,Classes_Precision,Classes_Recall,Classes_F1,Model,Dataset Description
0,0.7346,0.734766,0.733242,0.734766,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.6955093099671413/0.6760259179265659/0.770952...,0.6237721021611002/0.812987012987013/0.7830903...,0.6576903158984981/0.7382075471698113/0.776974...,RF,with 6 initial variables with the first dataset
1,0.74104,0.727029,0.68923,0.727029,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.7887139107611548/0.7120622568093385/0.718321...,0.30765293063731763/0.648936170212766/0.951998...,0.4426440802798748/0.6790352504638218/0.818814...,RF,with 6 initial variables with the second dataset
2,0.969205,0.969211,0.969205,0.969211,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.9540566959921799/0.8772845953002611/0.998831...,0.9587426326129665/0.8727272727272727/0.997084...,0.9563939245467908/0.875/0.997957397140356,RF,with 6 initial variables and two most common s...
3,0.798788,0.797657,0.798146,0.797657,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.695478391206595/0.8060263653483992/0.8522114...,0.7125671871000768/0.7588652482269503/0.845032...,0.7039190897597978/0.7817351598173516/0.848606...,RF,with 6 initial variables and two most common s...
4,0.736289,0.736049,0.735754,0.736049,Dataset with max values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.6849593495934959/0.6988505747126437/0.775161...,0.6620825147347741/0.7896103896103897/0.767930...,0.6733266733266733/0.7414634146341463/0.771528...,GB,with 6 initial variables with the first dataset
5,0.732855,0.733305,0.708334,0.733305,Dataset with all values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.7321164282190429/0.7109515260323159/0.734892...,0.37983107243409264/0.7021276595744681/0.92031...,0.5001685203909673/0.7065120428189117/0.817215...,GB,with 6 initial variables with the second dataset
6,0.974114,0.974022,0.974066,0.974022,Dataset with max values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.9636184857423795/0.8917525773195877/0.998832...,0.962671905697446/0.8987012987012987/0.9976676...,0.9631449631449631/0.8952134540750324/0.998249...,GB,with 6 initial variables and two most common s...
7,0.847576,0.838577,0.841008,0.838577,Dataset with all values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.7256913470115968/0.822262118491921/0.9131567...,0.8328640900947019/0.8120567375886525/0.843561...,0.7755928971517102/0.8171275646743978/0.876980...,GB,with 6 initial variables and two most common s...


In [None]:
def scale_values(value):
    """Reformat a '/'-separated string of numbers to three decimal places each.

    Args:
        value: String such as '0.6955/0.6760/0.7709'; every '/'-separated
            piece must be parseable by ``float``.

    Returns:
        The same pieces, each formatted with exactly three decimals and
        joined again with '/'.
    """
    return '/'.join(format(float(piece), '.3f') for piece in value.split('/'))

# Shorten the per-class metric strings to three decimals so the table stays readable.
for class_metric_col in ('Classes_Precision', 'Classes_Recall', 'Classes_F1'):
    results_test[class_metric_col] = results_test[class_metric_col].map(scale_values)
results_test

Unnamed: 0,Weighted Precision,Weighted Recall,Weighted F1,Accuracy,Dataset,parameters,Classes_Precision,Classes_Recall,Classes_F1,Model,Dataset Description
0,0.7346,0.734766,0.733242,0.734766,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.696/0.676/0.771,0.624/0.813/0.783,0.658/0.738/0.777,RF,with 6 initial variables with the first dataset
1,0.74104,0.727029,0.68923,0.727029,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.789/0.712/0.718,0.308/0.649/0.952,0.443/0.679/0.819,RF,with 6 initial variables with the second dataset
2,0.969205,0.969211,0.969205,0.969211,Dataset with max values,"{'max_depth': 10, 'n_estimators': 150}",0.954/0.877/0.999,0.959/0.873/0.997,0.956/0.875/0.998,RF,with 6 initial variables and two most common s...
3,0.798788,0.797657,0.798146,0.797657,Dataset with all values,"{'max_depth': 10, 'n_estimators': 150}",0.695/0.806/0.852,0.713/0.759/0.845,0.704/0.782/0.849,RF,with 6 initial variables and two most common s...
4,0.736289,0.736049,0.735754,0.736049,Dataset with max values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.685/0.699/0.775,0.662/0.790/0.768,0.673/0.741/0.772,GB,with 6 initial variables with the first dataset
5,0.732855,0.733305,0.708334,0.733305,Dataset with all values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.732/0.711/0.735,0.380/0.702/0.920,0.500/0.707/0.817,GB,with 6 initial variables with the second dataset
6,0.974114,0.974022,0.974066,0.974022,Dataset with max values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.964/0.892/0.999,0.963/0.899/0.998,0.963/0.895/0.998,GB,with 6 initial variables and two most common s...
7,0.847576,0.838577,0.841008,0.838577,Dataset with all values,"{'max_depth': 5, 'learning_rate': 0.1, 'n_esti...",0.726/0.822/0.913,0.833/0.812/0.844,0.776/0.817/0.877,GB,with 6 initial variables and two most common s...


In [None]:
# Headline metrics for the runs trained on the max-values dataset only.
# NOTE(review): 'Model' alone does not distinguish the 6-variable runs from the
# runs with the two extra substance features; consider showing
# 'Dataset Description' as well.
results_test.loc[
    results_test['Dataset'].eq('Dataset with max values'),
    ['Model', 'Accuracy', 'Weighted F1', 'Weighted Precision', 'Weighted Recall', 'Classes_F1'],
]

Unnamed: 0,Model,Accuracy,Weighted F1,Weighted Precision,Weighted Recall,Classes_F1
0,RF,0.734766,0.733242,0.7346,0.734766,0.658/0.738/0.777
2,RF,0.969211,0.969205,0.969205,0.969211,0.956/0.875/0.998
4,GB,0.736049,0.735754,0.736289,0.736049,0.673/0.741/0.772
6,GB,0.974022,0.974066,0.974114,0.974022,0.963/0.895/0.998


## Other non-important notes

In [None]:
# Scratch summary of the cross-validation score for each model/dataset run.
results_table = pd.DataFrame(columns=['Model Type', 'Dataset Used', 'Mean Cross-Validation Score'])
results_table['Model Type']

# Each run is described by (run description, dataset label, hyper-parameters).
# NOTE(review): every entry stores np.mean(cv_scores) of whatever `cv_scores`
# holds when this cell runs, so all eight runs receive the same value. Confirm
# whether per-run scores were meant to be captured right after each
# cross-validation instead.
result_dict = {
    run_name: {
        'Dataset': dataset_label,
        'parameters': hyperparams,
        'Mean cross-validation score': np.mean(cv_scores),
    }
    for run_name, dataset_label, hyperparams in [
        (
            'RF with 6 initial variables with the first dataset',
            'Dataset with max values',
            {'max_depth': 10, 'n_estimators': 150},
        ),
        (
            'RF with 6 initial variables with the second dataset',
            'Dataset with all values',
            {'max_depth': 10, 'n_estimators': 150},
        ),
        (
            'GB with 6 initial variables with the first dataset',
            'Dataset with max values',
            {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
        ),
        (
            'GB with 6 initial variables with the second dataset',
            'Dataset with all values',
            {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
        ),
        # for the best models:
        (
            'RF with 6 initial variables and two most common substances with the first dataset',
            'Dataset with max values',
            {'max_depth': 10, 'n_estimators': 150},
        ),
        (
            'RF with 6 initial variables and two most common substances with the second dataset',
            'Dataset with all values',
            {'max_depth': 10, 'n_estimators': 150},
        ),
        (
            'GB with 6 initial variables and two most common substances with the first dataset',
            'Dataset with max values',
            {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
        ),
        (
            'GB with 6 initial variables and two most common substances with the second dataset',
            'Dataset with all values',
            {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
        ),
    ]
}

# Runs become columns; 'Dataset', 'parameters' and the CV score become rows.
result = pd.DataFrame.from_dict(result_dict)

## Only two classes

In [None]:
# Load the pollution table and derive a binary target on a copy:
# original level 1 -> 0, original levels 2 and 3 -> 1.
pollution = pd.read_csv('/content/pollution_with_days')
pollution_new = pollution.assign(
    Pollution_Level=pollution['Pollution_Level'].replace({1: 0, 2: 1, 3: 1})
)
pollution_new['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
1,42469
0,20309


In [None]:
# Class counts of the original (unmerged) Pollution_Level labels, for comparison.
pollution['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
3,39529
1,20309
2,2940


In [None]:
# Two modelling subsets of the binary-label frame, both limited to month < 11:
#   pollution_new1 - only rows whose Metric is 'max'
#   pollution_new2 - all metrics
# NOTE(review): month < 11 presumably excludes an incomplete November - confirm.
pollution_new1 = pollution_new.loc[
    lambda frame: frame['month'].lt(11) & frame['Metric'].eq('max')
]
pollution_new2 = pollution_new.loc[lambda frame: frame['month'].lt(11)]

In [None]:
# Class balance of the binary target in the 'max'-metric subset.
pollution_new1['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
1,10255
0,5333


In [None]:
# Class balance of the binary target in the all-metrics subset.
pollution_new2['Pollution_Level'].value_counts()

Unnamed: 0_level_0,count
Pollution_Level,Unnamed: 1_level_1
1,40375
0,19373


Random Forest

In [None]:
#pollution1

# Binary random-forest baseline on pollution_new1 (the 'max'-metric subset).
# Candidate extra features not used yet: 'Substance', 'Metric', 'monitor'.
y = pollution_new1['Pollution_Level']
X = pollution_new1.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week']]

# 80/20 hold-out split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.61      0.62      1018
           1       0.82      0.83      0.82      2100

    accuracy                           0.76      3118
   macro avg       0.73      0.72      0.72      3118
weighted avg       0.76      0.76      0.76      3118



In [None]:
#pollution2

# Binary random-forest baseline on pollution_new2 (the all-metrics subset).
# Candidate extra features not used yet: 'Substance', 'Metric', 'monitor'.
y = pollution_new2['Pollution_Level']
X = pollution_new2.loc[:, ['latitude', 'longitude', 'Value', 'month', 'day_of_week']]

# 80/20 hold-out split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.47      0.49      3907
           1       0.75      0.79      0.77      8043

    accuracy                           0.68     11950
   macro avg       0.64      0.63      0.63     11950
weighted avg       0.68      0.68      0.68     11950

