In [6]:
#import neccessary library 
import numpy as np
import pandas as pd 
import requests 

## **Collect Data**

In [14]:
data = pd.read_csv('historical_air_quality_2021_en.csv')
data.head()

Unnamed: 0,Station ID,AQI index,Location,Station name,Url,Dominent pollutant,CO,Dew,Humidity,NO2,...,Pressure,PM10,PM2.5,SO2,Temperature,Wind,Data Time S,Data Time Tz,Status,Alert level
0,8767.0,102.0,"10.782978,106.700711","Ho Chi Minh City US Consulate, Vietnam (Lãnh s...",https://aqicn.org/city/vietnam/ho-chi-minh-cit...,pm25,-,,83.0,-,...,1009.0,,102.00,-,27.0,3.6,2021-01-21 19:00:00,+07:00,#NAME?,#NAME?
1,8688.0,221.0,"21.0811211,105.8180306","United Nations International School of Hanoi, ...",https://aqicn.org/city/vietnam/hanoi/unis,aqi,-,,77.0,-,...,1015.0,,-,-,18.0,1.5,2021-01-21 20:00:00,+07:00,#NAME?,#NAME?
2,8641.0,281.0,"21.0215063,105.8188748","Hanoi US Embassy, Vietnam (Đại sứ quán Mỹ, Hà ...",https://aqicn.org/city/vietnam/hanoi/us-embassy,pm25,0.30,,77.0,8.70,...,1015.0,,281.00,2.10,18.0,1.5,2021-01-21 20:00:00,+07:00,#NAME?,#NAME?
3,13012.0,36.0,"13.998599,107.996482","Gia Lai/phường Thống Nhất - Pleiku, Vietnam",https://aqicn.org/city/vietnam/gia-lai/phuong-...,pm25,3.00,,60.2,6.00,...,923.7,29.0,36.00,1.00,24.3,1.0,2021-01-21 20:00:00,+07:00,#NAME?,#NAME?
4,12488.0,68.0,"16.46226,107.596351","Thừa Thiên Huế/83 Hùng Vương, Vietnam",https://aqicn.org/city/vietnam/thua-thien-hue/...,pm25,2.00,,88.0,-,...,1015.0,52.0,68.00,-,21.0,1.0,2021-01-21 19:00:00,+07:00,#NAME?,#NAME?


## **Data Explore**

**The number of rows and columns**

In [18]:
rows, cols = data.shape
rows, cols

(3415, 21)

In [19]:
data.columns

Index(['Station ID', 'AQI index', 'Location', 'Station name', 'Url',
       'Dominent pollutant', 'CO', 'Dew', 'Humidity', 'NO2', 'O3', 'Pressure',
       'PM10', 'PM2.5', 'SO2', 'Temperature', 'Wind', 'Data Time S',
       'Data Time Tz', 'Status', 'Alert level'],
      dtype='object')

**This `Dataset on air quality in Vietnam in 2021` has 3415 rows and 21 columns**


|Column|Meanings
| :------ | ---
| Station ID  | Index
| AQI index  | Air quality index
| Location | Location
| Station name | Name of the station
| Url | Link to Real time AQI
| Dominent pollutant |Dominant Species
| CO | CO Concentration
| Dew | Fog Index
| Humidity | Humidity Index
| NO2 | NO2 Concentration
| O3 | O3 Concentration
| Pressure | Pressure Index
| PM10 | Particulate matter of 10 micrometers or less in diameter
| PM2.5 | Particulate matter of 2.5 micrometers or less in diameter
| SO2 | SO2 Concentration
| Temperature | Temperature Index
| Wind | Wind speed
| Data Time S | Update time
| Data Time Tz | Time zone
| Status | Alert Status
| Alert level | Alert level

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3415 entries, 0 to 3414
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Station ID          2622 non-null   float64
 1   AQI index           2618 non-null   object 
 2   Location            2622 non-null   object 
 3   Station name        2622 non-null   object 
 4   Url                 2622 non-null   object 
 5   Dominent pollutant  2114 non-null   object 
 6   CO                  2615 non-null   object 
 7   Dew                 2263 non-null   float64
 8   Humidity            2608 non-null   float64
 9   NO2                 2615 non-null   object 
 10  O3                  2608 non-null   object 
 11  Pressure            2608 non-null   object 
 12  PM10                2583 non-null   object 
 13  PM2.5               2614 non-null   object 
 14  SO2                 2614 non-null   object 
 15  Temperature         2615 non-null   float64
 16  Wind  

## **Preprocessing**

**Convert object to numeric datatypes**

In [24]:
columns = ['AQI index','Dominent pollutant', 'CO', 'NO2', 'O3', 'Pressure',\
       'PM10', 'PM2.5', 'SO2']

In [25]:
data[columns] = data[columns].astype(str).apply(lambda x: pd.to_numeric(x.str.replace('\D','',regex=True)))

In [26]:
data.describe()

Unnamed: 0,Station ID,AQI index,Dominent pollutant,CO,Dew,Humidity,NO2,O3,Pressure,PM10,PM2.5,SO2,Temperature,Wind
count,2622.0,2101.0,2008.0,2155.0,2263.0,2608.0,1992.0,1227.0,2608.0,1888.0,2184.0,1261.0,2615.0,2615.0
mean,11079.826087,6663.874346,22.885956,961.350348,22.24821,75.790529,1350.085341,1022.290139,100350.479294,4540.338983,6324.061355,1467.581285,26.227036,2.922141
std,3548.080424,4701.377106,5.220637,1377.657636,4.256875,16.46953,1223.11405,1108.902307,1908.443486,3516.254299,4571.408248,4084.941581,4.733196,3.886933
min,1584.0,100.0,10.0,10.0,5.5,13.0,100.0,90.0,91690.0,50.0,50.0,10.0,0.5,0.2
25%,8767.0,3200.0,25.0,200.0,20.75,62.4,500.0,200.0,100200.0,1975.0,2975.0,200.0,23.0,2.0
50%,13012.0,5800.0,25.0,500.0,22.9,78.0,1000.0,650.0,100900.0,4200.0,5400.0,500.0,26.0,2.8
75%,13019.0,9100.0,25.0,1000.0,25.0,88.5,1900.0,1400.0,101200.0,6300.0,9100.0,1200.0,29.5,3.6
max,13252.0,34000.0,25.0,9800.0,29.0,100.0,8000.0,9300.0,102450.0,34000.0,31500.0,50000.0,39.0,150.0
