In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
path="/content/gdrive/My Drive/Colab Notebooks/Big_Data_AQI_Estimation-master/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Install neccessary libraries and models

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import os
import pickle
import json
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import joblib
np.random.seed(24)
import xgboost

import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVR

In [3]:
cd "gdrive/My Drive/Colab Notebooks/Big_Data_AQI_Estimation-master"

/content/gdrive/My Drive/Colab Notebooks/Big_Data_AQI_Estimation-master


In [4]:
!pip install -r requirements.txt



# Basic Functions

In [5]:
class calculateAQI(object):
    '''
    calculate AQI with Taiwan standard 
    
    
    '''


    def __init__(self,air_data):
        
        """
        air_data:{'Ox_8':.0,'Ox':.0,'PM10':.0,'PM25':.0,'CO':.0,'SO2':.0,'SO2_24':.0,'NO2':.0}
        """
        self.MAX_AQI=0
        self.air_data=air_data
        self.calculatetAQI()
            
 
    
    def calculatetAQI(self):
        '''
        Constructor
        '''
        self.AQI = []
        myAQI={'Ox_8':.0,'Ox':.0,'PM10':.0,'PM25':.0,'CO':.0,'SO2':.0,'SO2_24':.0,'NO2':.0}
        self.rank_rule=50
        
        for air in self.air_data.keys():
            myAQI[air]=self.air_data[air]
        
       
        AQI_Ox_8=0
        if myAQI['Ox_8']<=0.054:
            AQI_Ox_8 =50/(0.054)*(myAQI['Ox_8'])
        elif myAQI['Ox_8']<=0.070:
            AQI_Ox_8=49/(0.015)*(myAQI['Ox_8']-0.055)+51
        elif myAQI['Ox_8']<=0.085:
            AQI_Ox_8=49/(0.085-0.071)*(myAQI['Ox_8']-0.071)+101
        elif myAQI['Ox_8']<=0.105:
            AQI_Ox_8=49/(0.105-0.086)*(myAQI['Ox_8']-0.086)+151
        elif myAQI['Ox_8']<=0.200:
            AQI_Ox_8=49/(0.200-0.106)*(myAQI['Ox_8']-0.106)+201
        self.AQI.append(AQI_Ox_8)
        
        AQI_Ox=0
        if myAQI['Ox']<=0.125:
            AQI_Ox =0
        elif myAQI['Ox']<=0.164:
            AQI_Ox=49/(0.164-0.125)*(myAQI['Ox']-0.125)+101
        elif myAQI['Ox']<=0.204:
            AQI_Ox=49/(0.204-0.165)*(myAQI['Ox']-0.165)+151
        elif myAQI['Ox']<=0.404:
            AQI_Ox=49/(0.404-0.205)*(myAQI['Ox']-0.205)+201
        elif myAQI['Ox']<=0.504:
            AQI_Ox=49/(0.504-0.405)*(myAQI['Ox']-0.405)+301
        elif myAQI['Ox']<=0.504:
            AQI_Ox=49/(0.604-0.505)*(myAQI['Ox']-0.505)+401
        self.AQI.append(AQI_Ox)
        
        
        AQI_PM10=0
        if myAQI['PM10']<=54:
            AQI_PM10 =50/(54)*(myAQI['PM10'])
        elif myAQI['PM10']<=154:
            AQI_PM10=49/(99)*(myAQI['PM10']-55)+51
        elif myAQI['PM10']<=254:
            AQI_PM10=49/(99)*(myAQI['PM10']-155)+101
        elif myAQI['PM10']<=354:
            AQI_PM10=49/(99)*(myAQI['PM10']-255)+151
        elif myAQI['PM10']<=424:
            AQI_PM10=49/(69)*(myAQI['PM10']-355)+201
        elif myAQI['PM10']<=504:
            AQI_PM10=49/(89)*(myAQI['PM10']-425)+301
        elif myAQI['PM10']<=604:
            AQI_PM10=49/(99)*(myAQI['PM10']-505)+401
        self.AQI.append(AQI_PM10)
        
        
        AQI_PM25=0
       
        if myAQI['PM25']<=15.4:
            AQI_PM25 =50/(15.4)*(myAQI['PM25'])
        elif myAQI['PM25']<=35.4:
            AQI_PM25=49/(35.4-15.5)*(myAQI['PM25']-15.5)+51
        elif myAQI['PM25']<=54.4:
            AQI_PM25=49/(54.4-35.5)*(myAQI['PM25']-40.5)+101
        elif myAQI['PM25']<=150.4:
            AQI_PM25=49/(150.4-54.5)*(myAQI['PM25']-65.5)+151
        elif myAQI['PM25']<=250.4:
            AQI_PM25=49/(250.4-150.5)*(myAQI['PM25']-150.5)+201
        elif myAQI['PM25']<=350.4:
            AQI_PM25=49/(350.4-250.5)*(myAQI['PM25']-250.5)+301
        elif myAQI['PM25']<=500.4:
            AQI_PM25=49/(500.4-350.5)*(myAQI['PM25']-350.5)+401
        self.AQI.append(AQI_PM25)
        
        AQI_CO=0
        if myAQI['CO']<=4.4:
            AQI_CO =50/(4.4)*(myAQI['CO'])
        elif myAQI['CO']<=9.4:
            AQI_CO=49/(4.9)*(myAQI['CO']-4.5)+51
        elif myAQI['CO']<=12.4:
            AQI_CO=49/(2.9)*(myAQI['CO']-9.5)+101
        elif myAQI['CO']<=15.4:
            AQI_CO=49/(2.9)*(myAQI['CO']-12.5)+151
        elif myAQI['CO']<=30.4:
            AQI_CO=49/(14.9)*(myAQI['CO']-15.5)+201
        elif myAQI['CO']<=40.4:
            AQI_CO=49/(9.9)*(myAQI['CO']-30.5)+301
        elif myAQI['CO']<=50.4:
            AQI_CO=49/(9.9)*(myAQI['CO']-40.5)+401
        self.AQI.append(AQI_CO)
        
        
        AQI_SO2=0
        if myAQI['SO2']<=0.035:
            AQI_SO2 =50/(0.035)*(myAQI['SO2'])
        elif myAQI['SO2']<=0.075:
            AQI_SO2=49/(0.075-0.036)*(myAQI['SO2']-0.036)+51
        elif myAQI['SO2']<=0.185:
            AQI_SO2=49/(0.185-0.076)*(myAQI['SO2']-0.076)+101
       
        self.AQI.append(AQI_SO2)   
        
        
        AQI_SO2_24=0
        if myAQI['SO2_24']<0.186:
            AQI_SO2_24=0
        elif myAQI['SO2_24']<=0.304 and myAQI['SO2_24']>=0.186:
            AQI_SO2_24=49/(0.304-0.186)*(myAQI['SO2_24']-0.186)+151
        elif myAQI['SO2_24']<=0.604:
            AQI_SO2_24=49/(0.604-0.305)*(myAQI['SO2_24']-0.305)+201
        elif myAQI['SO2_24']<=0.804:
            AQI_SO2_24=49/(0.804-0.605)*(myAQI['SO2_24']-0.605)+301
        elif myAQI['SO2_24']<=1.004:
            AQI_SO2_24=49/(1.004-0.805)*(myAQI['SO2_24']-0.805)+401
            
        self.AQI.append(AQI_SO2_24)      
            
        AQI_NO2=0
        if myAQI['NO2']<=0.053:
            AQI_NO2 =50/(0.053)*(myAQI['NO2'])     
        elif myAQI['NO2']<=0.100:
            AQI_NO2=49/(0.100-0.054)*(myAQI['NO2']-0.054)+51
        elif myAQI['NO2']<=0.360:
            AQI_NO2=49/(0.360-0.101)*(myAQI['NO2']-0.360)+101
        elif myAQI['NO2']<=0.649:
            AQI_NO2=49/(0.649-0.361)*(myAQI['NO2']-0.361)+151
        elif myAQI['NO2']<=1.249:
            AQI_NO2=49/(1.249-0.650)*(myAQI['NO2']-0.650)+201
        elif myAQI['NO2']<=1.649:
            AQI_NO2=49/(1.649-1.250)*(myAQI['NO2']-1.250)+301
        elif myAQI['NO2']<=2.049:
            AQI_NO2=49/(2.049-1.650)*(myAQI['NO2']-1.650)+401
            
        self.AQI.append(AQI_NO2)
        self.MAX_AQI=max(self.AQI)
    
    def getAQI(self):    
        '''
        Return: float, the value of Taiwan AQI
        
        '''     
        return  self.MAX_AQI
    
    
    
    
    def getAQIRank(self):
        '''
        Return: int, the rank of Taiwan AQI
        
        '''
        
        Rank = 0
        if self.MAX_AQI/self.rank_rule<4.0:
            Rank = int(self.MAX_AQI/50)
        elif self.MAX_AQI/self.rank_rule<=6.0:
            Rank = int(4)
        else :
            Rank = int(5)
        
        return Rank

In [6]:
def calc_AQI_from_df(input_df):
    # Calculate AQI and AQI rank for input dataframe
    AQI_list = []
    AQI_rank = []
    pollutants = ['0x_8', '0x', 'PM10', 'PM25', 'CO', 'SO2', 'SO2_24', 'NO2']
    
    for row in input_df.to_numpy():
        myAQI = { key:value for key, value in zip(pollutants, row)}
        AQI_val = calculateAQI(myAQI).getAQI()
        AQI_rank_val = calculateAQI(myAQI).getAQIRank()
        AQI_list.append(AQI_val)
        AQI_rank.append(AQI_rank_val)
    
    AQI_list_np = np.array(AQI_list)
    AQI_rank_np = np.array(AQI_rank)
    
    # Return lists of AQIs and AQI_ranks
    return AQI_list_np, AQI_rank_np

def create_empty_pollutants_columns(preds, pollutants):
    preds_df = pd.DataFrame(data=preds, columns=pollutants)
    # Add empty values for other pollutants
    # empty_pollutant = ['o3_8', 'o3', 'pm10', 'co', 'so2', 'so2_24', 'no2']
    empty_pollutant = ['o3_8', 'so2_24']
    for col in empty_pollutant:
        preds_df[col] = 0
    preds_df= preds_df.reindex(columns=['o3_8', 'o3', 'pm10', 'pm25', 'co', 'so2', 'so2_24', 'no2'])
    return preds_df

In [7]:
def create_empty_pollutants_columns_(preds, empty_pols, pollutants):
    preds_df = pd.DataFrame(data=preds, columns=pollutants)
    # Add empty values for other pollutants
    empty_pollutant = empty_pols
    # empty_pollutant = ['o3_8', 'so2_24']
    for col in empty_pollutant:
        preds_df[col] = 0
    preds_df= preds_df.reindex(columns=['o3_8', 'o3', 'pm10', 'pm25', 'co', 'so2', 'so2_24', 'no2'])
    return preds_df

In [8]:
def convert_AQI_value_to_AQI_Rank(aqi_value):
    rank_rule = 50
    Rank = 0
    if (aqi_value / rank_rule) < 4.0:
        Rank = int(aqi_value / 50)
    elif (aqi_value/ rank_rule) <= 6.0:
        Rank = int(4)
    else:
        Rank = int(5)
    return Rank

def resample_data(df, time_window):
    print(f"[*] Resampling data based on {time_window} time window")
    df_remove_dup = df.drop_duplicates('time', keep='first', inplace=False, ignore_index=True)
    resampled_data = pd.DataFrame()
    resampler = df_remove_dup.set_index('time').resample(time_window)
    resampled_groups = resampler.groups
    for key in resampled_groups:
        indices = resampler._get_index(key)
        # If there are values in the group bin
        if len(indices):
            # Pick a random value from the group
            indx_choice = [np.random.choice(indices)]
            # Get the picked value from df
            value_df = df_remove_dup.take(indx_choice)
            resampled_data = pd.concat([resampled_data, value_df])      
    print("Done")
    return resampled_data

In [9]:
def convert_AQI_value_to_AQI_Rank(aqi_value):
    rank_rule = 50
    Rank = 0
    if (aqi_value / rank_rule) < 4.0:
        Rank = int(aqi_value / 50)
    elif (aqi_value/ rank_rule) <= 6.0:
        Rank = int(4)
    else:
        Rank = int(5)
    return Rank

def convert_aqi_array_to_aqi_rank_df(aqi_array):
    aqi_rank = []
    for i in range(len(aqi_array)):
        aqi_rank.append(convert_AQI_value_to_AQI_Rank(aqi_array[i]))

    return pd.DataFrame(data = aqi_rank, columns = ['aqi_rank'])

# F1 - score and Accuracy

**1. Sensor Features**

In [10]:
import joblib
#Sensor_features
test = pd.read_csv(path + 'Data Processed/MediaEval2019 Processed/Sensor Features/Random split/test_data_sensor_standardized.csv')
label = pd.read_csv(path + 'Data Processed/MediaEval2019 Processed/Sensor Features/Random split/test_label_sensor.csv')

#1. Catboost
catboostAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor Features/Catboost Random Split Best Model.pkl')
aqi_pred = catboostAQI.predict(test)

catboostPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor Features/Catboost Random Split Best Model.pkl')
pm25_pred = catboostPM25.predict(test)

sensor_catboost_df = pd.DataFrame({'pm25': pm25_pred})

#sensor_catboost_df.head()

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_catboost_df = create_empty_pollutants_columns_(sensor_catboost_df, empty_cols, pollutants)
#sensor_catboost_df.head()
#df.head()
sensor_catboost_df['aqi'], sensor_catboost_df['aqi_rank'] = calc_AQI_from_df(sensor_catboost_df)
#sensor_catboost_df.head()

print("[*] Catboost's Classification score for AQI rank with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_catboost_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_catboost_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_catboost_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] Catboost's Classification score for AQI rank (from AQI value) with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#2. LightGBM
lightgbmAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor Features/LightGBM Random Split Best Model.pkl')
aqi_pred = lightgbmAQI.predict(test)

lightgbmPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor Features/LightGBM Random Split Best Model.pkl')
pm25_pred = lightgbmPM25.predict(test)

sensor_lightgbm_df = pd.DataFrame({'pm25': pm25_pred})

#sensor_lightgbm_df.head()

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_lightgbm_df = create_empty_pollutants_columns_(sensor_lightgbm_df, empty_cols, pollutants)
#sensor_lightgbm_df.head()
#df.head()
sensor_lightgbm_df['aqi'], sensor_lightgbm_df['aqi_rank'] = calc_AQI_from_df(sensor_lightgbm_df)
#sensor_lightgbm_df.head()

print("[*] LightGBM's Classification score for AQI rank with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_lightgbm_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_lightgbm_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_lightgbm_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] LightGBM's Classification score for AQI rank (from AQI value) with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#3. Random Forest
randomforestAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor Features/Random Forest Random Split Best Model.pkl')
aqi_pred = randomforestAQI.predict(test)

randomforestPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor Features/Random Forest Random Split Best Model.pkl')
pm25_pred = randomforestPM25.predict(test)


sensor_randomforest_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_randomforest_df = create_empty_pollutants_columns_(sensor_randomforest_df, empty_cols, pollutants)
#sensor_randomforest_df.head()
sensor_randomforest_df['aqi'], sensor_randomforest_df['aqi_rank'] = calc_AQI_from_df(sensor_randomforest_df)
#sensor_randomforest_df.head()

print("[*] Random Forest's Classification score for AQI rank with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_randomforest_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_randomforest_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_randomforest_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] Random Forest's Classification score for AQI rank (from AQI value) with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#4. SVM
svmAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor Features/SVM Random Split Best Model.pkl')
aqi_pred = svmAQI.predict(test)

svmPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor Features/SVM Random Split Best Model.pkl')
pm25_pred = svmPM25.predict(test)

sensor_svm_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_svm_df = create_empty_pollutants_columns_(sensor_svm_df, empty_cols, pollutants)
#sensor_svm_df.head()
sensor_svm_df['aqi'], sensor_svm_df['aqi_rank'] = calc_AQI_from_df(sensor_svm_df)
#sensor_svm_df.head()

print("[*] SVM's Classification score for AQI rank with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_svm_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_svm_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_svm_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] SVM's Classification score for AQI rank (from AQI value) with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#5. XgBoost
xgboostAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor Features/XGBoost Random Split Best Model.pkl')
aqi_pred = xgboostAQI.predict(test)

xgboostPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor Features/XGBoost Random Split Best Model.pkl')
pm25_pred = xgboostPM25.predict(test)

sensor_xgboost_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_xgboost_df = create_empty_pollutants_columns_(sensor_xgboost_df, empty_cols, pollutants)
#sensor_xgboost_df.head()
sensor_xgboost_df['aqi'], sensor_xgboost_df['aqi_rank'] = calc_AQI_from_df(sensor_xgboost_df)
#sensor_xgboost_df.head()

print("[*] XgBoost's Classification score for AQI rank with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_xgboost_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_xgboost_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_xgboost_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] XgBoost's Classification score for AQI rank (from AQI value) with sensor features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#From predicted AQI values to AQI ranks
#print(type(label['aqi_rank'].values))

[*] Catboost's Classification score for AQI rank with sensor features
Accuracy Score: 88.34%
F1 score: 88.33%
Confusion Matrix:
[[3416  284    0    0    0    0]
 [ 186  700   15    0    0    0]
 [   2   44   19    0    0    0]
 [   5    3    6    0    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0    0    0]]


[*] Catboost's Classification score for AQI rank (from AQI value) with sensor features
Accuracy Score: 88.72%
F1 score: 88.56%
Confusion Matrix:
[[3477  223    0    0    0    0]
 [ 222  652   27    0    0    0]
 [   2   39   24    0    0    0]
 [   5    3    6    0    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0    0    0]]




[*] LightGBM's Classification score for AQI rank with sensor features
Accuracy Score: 88.78%
F1 score: 88.69%
Confusion Matrix:
[[3446  254    0    0    0    0]
 [ 196  690   15    0    0    0]
 [   2   43   20    0    0    0]
 [   5    4    5    0    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0    0    

**2. Sensor+PW Features**

In [11]:
import joblib
#Sensor_features
test = pd.read_csv(path + 'Data Processed/MediaEval2019 Processed/Sensor+PW Features/Random split/test_data_sensor_pw_standardized.csv')
label = pd.read_csv(path + 'Data Processed/MediaEval2019 Processed/Sensor+PW Features/Random split/test_label_sensor_pw.csv')

#1. Catboost
catboostAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor+PW Features/Catboost Random Split Best Model.pkl')
aqi_pred = catboostAQI.predict(test)

catboostPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor+PW Features/Catboost Random Split Best Model.pkl')
pm25_pred = catboostPM25.predict(test)

sensor_pw_catboost_df = pd.DataFrame({'pm25': pm25_pred})

#sensor_pw_catboost_df.head()

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_pw_catboost_df = create_empty_pollutants_columns_(sensor_pw_catboost_df, empty_cols, pollutants)
#sensor_pw_catboost_df.head()
#df.head()
sensor_pw_catboost_df['aqi'], sensor_pw_catboost_df['aqi_rank'] = calc_AQI_from_df(sensor_pw_catboost_df)
#sensor_pw_catboost_df.head()

print("[*] Catboost's Classification score for AQI rank with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_pw_catboost_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_pw_catboost_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_pw_catboost_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] Catboost's Classification score for AQI rank (from AQI value) with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#2. LightGBM
lightgbmAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor+PW Features/LightGBM Random Split Best Model.pkl')
aqi_pred = lightgbmAQI.predict(test)

lightgbmPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor+PW Features/LightGBM Random Split Best Model.pkl')
pm25_pred = lightgbmPM25.predict(test)

sensor_pw_lightgbm_df = pd.DataFrame({'pm25': pm25_pred})

#sensor_pw_lightgbm_df.head()

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_pw_lightgbm_df = create_empty_pollutants_columns_(sensor_pw_lightgbm_df, empty_cols, pollutants)
#sensor_pw_lightgbm_df.head()
#df.head()
sensor_pw_lightgbm_df['aqi'], sensor_pw_lightgbm_df['aqi_rank'] = calc_AQI_from_df(sensor_pw_lightgbm_df)
#sensor_pw_lightgbm_df.head()

print("[*] LightGBM's Classification score for AQI rank with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_pw_lightgbm_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_pw_lightgbm_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_pw_lightgbm_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] LightGBM's Classification score for AQI rank (from AQI value) with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#3. Random Forest
randomforestAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor+PW Features/Random Forest Random Split Best Model.pkl')
aqi_pred = randomforestAQI.predict(test)

randomforestPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor+PW Features/Random Forest Random Split Best Model.pkl')
pm25_pred = randomforestPM25.predict(test)


sensor_pw_randomforest_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_pw_randomforest_df = create_empty_pollutants_columns_(sensor_pw_randomforest_df, empty_cols, pollutants)
#sensor_pw_randomforest_df.head()
sensor_pw_randomforest_df['aqi'], sensor_pw_randomforest_df['aqi_rank'] = calc_AQI_from_df(sensor_pw_randomforest_df)
#sensor_pw_randomforest_df.head()

print("[*] Random Forest's Classification score for AQI rank with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_pw_randomforest_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_pw_randomforest_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_pw_randomforest_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] Random Forest's Classification score for AQI rank (from AQI value) with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#4. SVM
svmAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor+PW Features/SVM Random Split Best Model.pkl')
aqi_pred = svmAQI.predict(test)

svmPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor+PW Features/SVM Random Split Best Model.pkl')
pm25_pred = svmPM25.predict(test)

sensor_pw_svm_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_pw_svm_df = create_empty_pollutants_columns_(sensor_pw_svm_df, empty_cols, pollutants)
#sensor_pw_svm_df.head()
sensor_pw_svm_df['aqi'], sensor_pw_svm_df['aqi_rank'] = calc_AQI_from_df(sensor_pw_svm_df)
#sensor_pw_svm_df.head()

print("[*] SVM's Classification score for AQI rank with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_pw_svm_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_pw_svm_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_pw_svm_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] SVM's Classification score for AQI rank (from AQI value) with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#5. XgBoost
xgboostAQI = joblib.load(path + 'Saved models/Randomized Search/AQI Regression/Sensor+PW Features/XGBoost Random Split Best Model.pkl')
aqi_pred = xgboostAQI.predict(test)

xgboostPM25 = joblib.load(path + 'Saved models/Randomized Search/PM25 Regression/Sensor+PW Features/XGBoost Random Split Best Model.pkl')
pm25_pred = xgboostPM25.predict(test)

sensor_pw_xgboost_df = pd.DataFrame({'pm25': pm25_pred})

empty_cols =  ["co", "no2", "o3", "o3_8", "so2", "so2_24", "pm10"]
pollutants =  ["pm25"]
sensor_pw_xgboost_df = create_empty_pollutants_columns_(sensor_pw_xgboost_df, empty_cols, pollutants)
#sensor_pw_xgboost_df.head()
sensor_pw_xgboost_df['aqi'], sensor_pw_xgboost_df['aqi_rank'] = calc_AQI_from_df(sensor_pw_xgboost_df)
#sensor_pw_xgboost_df.head()

print("[*] XgBoost's Classification score for AQI rank with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], sensor_pw_xgboost_df['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], sensor_pw_xgboost_df['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], sensor_pw_xgboost_df['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
#From predicted AQI values to AQI ranks

rank_pred = convert_aqi_array_to_aqi_rank_df(aqi_pred)

print("[*] XgBoost's Classification score for AQI rank (from AQI value) with Sensor+PW features")
range_ = list(range(0, 6))
accuracy = accuracy_score(label['aqi_rank'], rank_pred['aqi_rank']) * 100
f1 = f1_score(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_, average='weighted', zero_division=0) * 100
print("Accuracy Score: {:.2f}%".format(accuracy))
print("F1 score: {:.2f}%".format(f1))
conf_matrix = confusion_matrix(label['aqi_rank'], rank_pred['aqi_rank'], labels=range_)
print("Confusion Matrix:")
print(conf_matrix)

print("\n")
print("====================================================================================")
print("\n")

#From predicted AQI values to AQI ranks
#print(type(label['aqi_rank'].values))

[*] Catboost's Classification score for AQI rank with Sensor+PW features
Accuracy Score: 89.17%
F1 score: 89.29%
Confusion Matrix:
[[3401  299    0    0    0    0]
 [ 138  751   12    0    0    0]
 [   2   42   21    0    0    0]
 [   5    1    7    1    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0    0    0]]


[*] Catboost's Classification score for AQI rank (from AQI value) with Sensor+PW features
Accuracy Score: 89.94%
F1 score: 89.91%
Confusion Matrix:
[[3467  233    0    0    0    0]
 [ 169  713   19    0    0    0]
 [   2   33   30    0    0    0]
 [   5    1    8    0    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0    0    0]]




[*] LightGBM's Classification score for AQI rank with Sensor+PW features
Accuracy Score: 89.64%
F1 score: 89.68%
Confusion Matrix:
[[3436  264    0    0    0    0]
 [ 152  734   15    0    0    0]
 [   1   38   26    0    0    0]
 [   5    1    8    0    0    0]
 [   0    1    0    0    0    0]
 [   0    0    0    0