***This notebook was runned on Google Colab***

#        Layer.ai Air Quality Prediction Challenge
Can you use Sentinel 5P data to predict air quality in Kampala for AirQo?

**Description**


AirQo’s air quality sensing network has more than 120 low-cost devices deployed across Uganda, which regulalry sense and report on air quality using the PM2.5 measure.

In this challenge we explore using satellite radar data from Sentinel 5P to predict air quality in regions in Kampala. The use of satellite data could expand air quality predictions to areas without air quality sensor devices.

The objective of this challenge is to predict air quality readings from AirQo’s sensors using Sentinel 5P data.

# Team ; **AI-SQUAD**

***position : 8th Place*** 

Team Members

Emmanuel Ebiendele (Ebiendele)

Adetoro Michael oluwaferanmi (Mike_ade)

Blessing Irenosen (D-PROF)

Eromosele Precious Ebiendele (preciousdata)

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1-cp37-none-manylinux1_x86_64.whl (76.8 MB)
[K     |████████████████████████████████| 76.8 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import datetime

import warnings
warnings.simplefilter(action='ignore')

In [3]:
#Label Encoder
def label_enc(train_df, test_df, features):
    lbl_enc = LabelEncoder()
    full_data = pd.concat([train_df[features], test_df[features]],axis=0)
    for col in (features):
        print(col)
        lbl_enc.fit(full_data[col].values)
        train_df[col] = lbl_enc.transform(train_df[col])
        test_df[col] = lbl_enc.transform(test_df[col])
    return train_df, test_df

In [4]:
#@markdown <br><center><img src='https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Google_Drive_logo.png/600px-Google_Drive_logo.png' height="150" alt="Gdrive-logo"/></center>
#@markdown <center><h2>Mount GDrive to /content/drive</h3></center><br>
MODE = "MOUNT" #@param ["MOUNT", "UNMOUNT"]
#Mount your Gdrive! 
from google.colab import drive
drive.mount._DEBUG = False
if MODE == "MOUNT":
  drive.mount('/content/drive', force_remount=True)
elif MODE == "UNMOUNT":
  try:
    drive.flush_and_unmount()
  except ValueError:
    pass
  get_ipython().system_raw("rm -rf /root/.config/Google/DriveFS")

Mounted at /content/drive


In [5]:
# load the data
path = '/content/drive/MyDrive/layer_data'
train_df = pd.read_csv(f'{path}/train.csv')
test_df = pd.read_csv(f'{path}/test.csv')
sub = pd.read_csv(f'{path}/SampleSubmission.csv')

In [6]:
# test_df.columns

In [7]:
col1 = ['ID','date']
col2 = ['device']
col3 = ['humidity', 'temp_mean']
location = ['site_latitude', 'site_longitude']
chem = [
        'SulphurDioxide_SO2_column_number_density',
       'SulphurDioxide_SO2_column_number_density_amf',
        'SulphurDioxide_SO2_column_number_density_15km',
       'CarbonMonoxide_CO_column_number_density',
        'NitrogenDioxide_NO2_column_number_density',
        'UvAerosolIndex_absorbing_aerosol_index',
        'Ozone_O3_column_number_density',
        'Cloud_cloud_fraction', 
        'Cloud_cloud_top_pressure'
]
target = ['pm2_5']
feat = col1 + col2 + col3 + location + chem
train_df = train_df[feat + target]
test_df = test_df[feat]

In [8]:
train_df.shape, test_df.shape

((9923, 17), (4254, 16))

# Add The Time Features

In [9]:
# train_df = train_df[train_df.pm2_5 < 220]

In [10]:
train_df = train_df.sort_values(['date', 'device']).reset_index(drop=True) 
test_df = test_df.sort_values(['date', 'device']).reset_index(drop=True)

for dataset in (train_df,test_df):
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['Day'] = dataset.date.dt.day
    dataset['Month'] = dataset.date.dt.month
    dataset['Year'] = dataset.date.dt.year
    dataset['DayOfWeek'] = dataset.date.dt.dayofweek
    dataset['DayOfYear'] = dataset.date.dt.dayofyear
    dataset['Week'] = dataset.date.dt.weekofyear
    dataset.set_index('date', inplace=True)

In [11]:
ID = test_df['ID']
test_df.drop('ID',inplace=True,axis=1)
train_df.drop('ID',inplace=True,axis=1)

# Exploratory data analysis

In [12]:
test_df.shape, train_df.shape

((4254, 20), (9923, 21))

In [13]:
num_col = train_df.select_dtypes(exclude='O').columns.difference(['Month', 'pm2_5', 'site_latitude', 'site_longitude', 'humidity', 'temp_mean', 'Day', 'DayOfWeek', 'DayOfYear', 'Year', 'Week'])
train_df.temp_mean = train_df.temp_mean.fillna(train_df.temp_mean.median())
for data in (train_df, test_df):
    for feat in num_col:
        data[feat] = data[feat].bfill()

In [14]:
def LAG(data,LagFeature,shift=1,NewFeatures=[]) :
    data[NewFeatures[0]]   = data[LagFeature]  - data[LagFeature].shift(shift)
    data[NewFeatures[1]]   = data[LagFeature].shift(shift)

num_feats = train_df.columns
num_feats = num_feats.drop(['DayOfWeek','Month','Day','pm2_5','temp_mean','humidity','site_longitude', 'site_latitude','device', 'Year', 'DayOfYear', 'Week'])

for feature in num_feats:
    LAG(train_df,LagFeature=f'{feature}',shift=1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])
    LAG(test_df,LagFeature=f'{feature}',shift=1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])

In [15]:
for dataset in (train_df,test_df):
    dataset['Year_Month'] = dataset['Year'].astype(str) + '-' + dataset['Month'].astype(str)
    dataset['Year_Week'] = dataset['Year'].astype(str) + '-' + dataset['Week'].astype(str)
    dataset['Month_Day'] = dataset['Month'].astype(str) + '-' + dataset['Day'].astype(str)
    dataset['lat_lon'] = dataset['site_latitude'].astype(str) + '_' + dataset['site_longitude'].astype(str)
    
feats = ['Year_Month','Year_Week','Month_Day', 'device',
         'lat_lon'
        ]
train_df,test_df = label_enc(train_df,test_df,feats)

Year_Month
Year_Week
Month_Day
device
lat_lon


## - Aggregations Features

In [16]:
DevicePM2_5Mean = dict(train_df.groupby('device')['pm2_5'].mean())
DevicePM2_5Std = dict(train_df.groupby('device')['pm2_5'].std())
DevicePM2_5Min = dict(train_df.groupby('device')['pm2_5'].min())
DevicePM2_5Max = dict(train_df.groupby('device')['pm2_5'].max())

for dataset in (train_df,test_df):
    dataset['DevicePM2_5Mean'] = dataset['device'].map(DevicePM2_5Mean)
    dataset['DevicePM2_5Std'] = dataset['device'].map(DevicePM2_5Std)
    dataset['DevicePM2_5Min'] = dataset['device'].map(DevicePM2_5Min)
    dataset['DevicePM2_5Max'] = dataset['device'].map(DevicePM2_5Max)

In [17]:
data = pd.concat([train_df, test_df], axis = 0)
def Agg(Features):
    for dataset in (train_df,test_df):
        for Feature in Features:
            dataset[f'{Feature}_PerMonth'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].mean()))
            dataset[f'{Feature}_PerWeek'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].mean()))
            dataset[f'{Feature}_PerDay'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].mean()))

            dataset[f'{Feature}_Month_std'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].std()))
            dataset[f'{Feature}_Week_std'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].std()))
            dataset[f'{Feature}_Day_std'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].std()))

            dataset[f'{Feature}_Month_min'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].min()))
            dataset[f'{Feature}_Week_min'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].min()))
            dataset[f'{Feature}_Day_min'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].min()))

            dataset[f'{Feature}_Month_max'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].max()))
            dataset[f'{Feature}_Week_max'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].max()))
            dataset[f'{Feature}_Day_max'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].max()))
        
Agg(['temp_mean', 'humidity'])

In [18]:
train_df.shape, test_df.shape

((9923, 71), (4254, 70))

In [19]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [21]:
train_df.drop(['Year_Month','Year_Week','Month_Day'],inplace=True,axis=1)
test_df.drop(['Year_Month','Year_Week','Month_Day'],inplace=True,axis=1)

train_df.drop(['site_longitude', 'site_latitude'],inplace=True,axis=1)
test_df.drop(['site_longitude', 'site_latitude'],inplace=True,axis=1)

In [22]:
#Averaging the predictions of the same model with different seeds to get more consistent results
X = train_df.drop('pm2_5', axis = 1)
y = train_df.pm2_5

In [23]:
X.shape, y.shape, test_df.shape

((9923, 65), (9923,), (4254, 65))

In [24]:
# X_train, y_train = X[:9000], y[:9000]
# X_test, y_test = X[9000:], y[9000:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, shuffle = True, random_state = 42)

# Modeling

In [25]:
cb = CatBoostRegressor(n_estimators = 10303, learning_rate = 0.028926897706232692, depth = 8, verbose = 0, random_state = 42)
model = TransformedTargetRegressor(cb, func = np.log1p, inverse_func = np.expm1)
model.fit(X_train, y_train)
pred = model.predict(X_test)
mae(y_test, pred)

6.255595074998711

In [26]:
pred_1 = model.predict(test_df)
# pred_2 = LogCB.predict(test_df)
submission = pd.DataFrame({"Id": ID ,"pm2_5": pred_1})
submission.to_csv('/content/drive/MyDrive/layer_data/laisq.csv', index = False)

# **PRIVATE LEADERBOARD SCORE: 12.82981740718469**