Xử lý dữ liệu không khí năm 2021

In [79]:
import pandas as pd
import numpy as np
import math
import datetime as dt

In [80]:
df_2021 = pd.read_csv(r"datasets\historical_air_quality_2021_en.csv")
df_2021.head()


Unnamed: 0,Station ID,AQI index,Location,Station name,Url,Dominent pollutant,CO,Dew,Humidity,NO2,...,Pressure,PM10,PM2.5,SO2,Temperature,Wind,Data Time S,Data Time Tz,Status,Alert level
0,8767,102,"10.782978,106.700711","Ho Chi Minh City US Consulate, Vietnam (Lãnh s...",https://aqicn.org/city/vietnam/ho-chi-minh-cit...,pm25,-,,83.0,-,...,1009.0,,102,-,27.0,3.6,1/21/2021 19:00,+07:00,#NAME?,#NAME?
1,8688,221,"21.0811211,105.8180306","United Nations International School of Hanoi, ...",https://aqicn.org/city/vietnam/hanoi/unis,aqi,-,,77.0,-,...,1015.0,,-,-,18.0,1.5,1/21/2021 20:00,+07:00,#NAME?,#NAME?
2,8641,281,"21.0215063,105.8188748","Hanoi US Embassy, Vietnam (Đại sứ quán Mỹ, Hà ...",https://aqicn.org/city/vietnam/hanoi/us-embassy,pm25,0.3,,77.0,8.7,...,1015.0,,281,2.1,18.0,1.5,1/21/2021 20:00,+07:00,#NAME?,#NAME?
3,13012,36,"13.998599,107.996482","Gia Lai/phường Thống Nhất - Pleiku, Vietnam",https://aqicn.org/city/vietnam/gia-lai/phuong-...,pm25,3,,60.2,6,...,923.7,29.0,36,1,24.3,1.0,1/21/2021 20:00,+07:00,#NAME?,#NAME?
4,12488,68,"16.46226,107.596351","Thừa Thiên Huế/83 Hùng Vương, Vietnam",https://aqicn.org/city/vietnam/thua-thien-hue/...,pm25,2,,88.0,-,...,1015.0,52.0,68,-,21.0,1.0,1/21/2021 19:00,+07:00,#NAME?,#NAME?


Dữ liệu có bao nhiêu dòng, cột:

In [81]:
num_rows, num_cols = df_2021.shape
num_rows, num_cols

(2622, 21)

Ý nghĩa các cột không cần thiết:

Station ID: ID của trạm quan trắc không khí.
Url: Đường dẫn đến trang web liên quan đến dữ liệu không khí của trạm.
Status: Trạng thái dữ liệu.
Alert level: Cấp độ cảnh báo.
Data Time Tz: Múi giờ của dữ liệu.

In [82]:
col_drop = ['Station ID', 'Url', 'Status', 'Alert level', 'Data Time Tz']
df_2021 = df_2021.drop(columns=col_drop)

In [83]:
df_2021 = df_2021.replace('-', np.nan)
df_2021.isna().sum()


AQI index              521
Location                 0
Station name             0
Dominent pollutant     508
CO                     467
Dew                    359
Humidity                14
NO2                    630
O3                    1395
Pressure                14
PM10                   734
PM2.5                  438
SO2                   1361
Temperature              7
Wind                     7
Data Time S              0
dtype: int64

Dữ liệu bị thiếu rất nhiều, và có nhiều cột phải xử lí kiểu dữ liệu

Tiền xử lí dữ liệu df_2021

In [84]:
def convert_to_datetime(date_str):
    res = pd.to_datetime(date_str)
    if res.tzinfo is not None:
        res = res.tz_localize(None)
    return res
df_2021['Data Time S'] = df_2021['Data Time S'].apply(convert_to_datetime).dt.strftime('%Y-%m-%d')

Đưa các cột numerical về đúng kiểu dữ liệu

In [85]:
df_2021['Pressure'] = df_2021['Pressure'].str.replace(',', '')

numerical_labels = ['AQI index', 'CO', 'Dew', 'Humidity', 'NO2', 'O3', 'Pressure', 'PM10', 'PM2.5', 'SO2', 'Temperature', 'Wind']
df_2021[numerical_labels] = df_2021[numerical_labels].astype('float64')


Gom nhóm các tỉnh thành

In [86]:
state_labels = ["Hà Nội", "Bắc Ninh", "Quảng Ninh", "Cao Bằng", "Gia Lai",
                "Lào Cai", "Nha Trang", "Hồ Chí Minh", "Đà Nẵng", "Huế", "Hạ Long", "Hải Phòng"]

def classify_region(stasion_name):
    if (not isinstance(stasion_name, str)):
        return stasion_name
    
    for state in state_labels:
        if state in stasion_name:
            return state
    print(stasion_name)

df_2021['Station name'] = df_2021['Station name'].apply(classify_region)

Fill các cột dữ liệu bị khuyết = trung vị của khu vực đó

In [87]:
def fill_missing_value(x):
    same_station = df_2021[df_2021['Station name'] == x['Station name']]
    for col in numerical_labels:
        if col == ['AQI index']:
            continue
        if np.isnan(x[col]):
            if same_station[col].isna().all():
                x[col] = df_2021[col].median()
            else:
                x[col] = same_station[col].median()
    return x

df_2021 = df_2021.apply(fill_missing_value, axis=1)

Điền giá trị cho các dòng không có AQI index, Dominent pollutant theo công thức tính AQI của Cục Bảo vệ Môi trường Hoa Kỳ (EPA) https://www.airnow.gov/sites/default/files/2020-05/aqi-technical-assistance-document-sept2018.pdf

In [88]:
breakpoints = {
    'O3':  [(0, 54), (55, 70), (71, 85), (86, 105), (106, 200)],
    'PM2.5':  [(0.0, 12.0), (12.1, 35.4), (35.5, 55.4), (55.5, 150.4), (150.5, 250.4), (250.5, 350.4), (350.5, 500.4)],
    'PM10': [(0, 54), (55, 154), (155, 254), (255, 354), (355, 424), (425, 504), (505, 604)],
    'CO': [(0.0, 4.4), (4.5, 9.4), (9.5, 12.4), (12.5, 15.4), (15.5, 30.4), (30.5, 40.4), (40.5, 50.4)],
    'SO2': [(0, 35), (36, 75), (76, 185), (186, 304)],
    'NO2': [(0, 53), (54, 100), (101, 360), (361, 649), (650, 1249), (1250, 1649), (1650, 2049)]
}
aqi_levels = [(0, 50), (51, 100), (101, 150),
              (151, 200), (201, 300), (301, 400), (401, 500)]
# AQI calculation functions dựa trên nồng độ thực tế, khoảng nồng độ cho từng cấp breakpoint và mức AQI tương ứng
def aqi_formula(concentration, breakpoint, aqi_level):
    aqi = ((concentration - breakpoint[0]) / (breakpoint[1] - breakpoint[0])) * (aqi_level[1] - aqi_level[0]) + aqi_level[0]
    return round(aqi)
# Tính toán AQI cho từng chất ô nhiễm dựa trên nồng độ
# Nếu giá trị nồng độ bị khuyết (nan) thì trả về np.nan.
# Nếu không thì lấy các breakpoint của chất ô nhiễm đó.
# Xác định xem nồng độ thuộc khoảng breakpoint nào, rồi dùng aqi_formula để tính ra chỉ số AQI.
# Nếu vượt ngoài breakpoint lớn nhất thì vẫn tính với nhóm lớn nhất.
# Trả về chỉ số AQI đã làm tròn.
def calculate_individual_aqi(pollutant_name, concentration):
    if math.isnan(concentration):
        return np.nan
    bps = breakpoints[pollutant_name]

    for i in range(len(bps)):
        if bps[i][0] <= concentration <= bps[i][1]:
            aqi = aqi_formula(concentration, bps[i], aqi_levels[i])
            return round(aqi)
    last_level = len(bps) - 1
    return round(aqi_formula(concentration, bps[last_level], aqi_levels[last_level]))
# Tính toán chỉ số AQI tổng hợp từ các chất ô nhiễm
# Sử dụng hàm calculate_individual_aqi để tính chỉ số AQI cho từng
def calculate_aqi(pollutant_concentrations):

    # Calculate AQI for each pollutant
    AQI_indexes = [calculate_individual_aqi(
        pollutant, pollutant_concentrations[pollutant]) for pollutant in breakpoints.keys()]

    # Return maximum AQI value
    if np.isnan(AQI_indexes).all():
        return np.nan
    else:
        return max(AQI_indexes, key=lambda index: 0 if np.isnan(index) else index)
    #  Hàm xác định thành phần ô nhiễm chiếm ưu thế
def specify_dominant_pollutant(pollutant_concentrations):
    pollutant_aqi_dict = {pollutant: calculate_individual_aqi(
        pollutant, pollutant_concentrations[pollutant]) for pollutant in breakpoints.keys()}

    return max(breakpoints.keys(), key=lambda pollutant: 0 if np.isnan(pollutant_aqi_dict[pollutant]) else pollutant_aqi_dict[pollutant])


aqi_na_rows = df_2021['AQI index'].isna()
df_2021.loc[aqi_na_rows, 'AQI index'] = df_2021[aqi_na_rows].apply(calculate_aqi, axis=1)

dominant_pollu_trans_dict = {'pm25': 'PM2.5', 'aqi': 'aqi', 'pm10': 'PM10'}
dominant_na_rows = df_2021['Dominent pollutant'].isna()
df_2021.loc[~dominant_na_rows, 'Dominent pollutant'] = df_2021[~dominant_na_rows].apply(lambda x:
                                                                              dominant_pollu_trans_dict[x['Dominent pollutant']], axis=1)
df_2021.loc[dominant_na_rows, 'Dominent pollutant'] = df_2021[dominant_na_rows].apply(
    specify_dominant_pollutant, axis=1)
    

Xây dựng lại cột Status theo tiêu chuẩn
|AQI|Status|
|--|:------:|
|0-50|Good|
|51-100|Moderate|
|101-150|Unhealthy for sensitive groups|
|151-200|Unhealthy|
|201-300|Very unhealthy|
|301+|Hazardous|


In [89]:
def status(x):
    if 0 <= x <= 50:
        return 'Good'
    elif 51 <= x <= 100:
        return 'Moderate'
    elif 101 <= x <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif 151 <= x <= 200:
        return 'Unhealthy'
    elif 201 <= x <= 300:
        return 'Very Unhealthy'
    elif 301 <= x <= 500:
        return 'Hazardous'
df_2021['Status'] = df_2021['AQI index'].apply(status)

In [90]:
df_2021 = df_2021.dropna()
df_2021 = df_2021.drop_duplicates()

In [91]:
location = df_2021['Location'].str.split(",")
df_2021['Latitude'] = location.apply(lambda x: float(x[0]))
df_2021['Longitude'] = location.apply(lambda x: float(x[1]))
df_2021.drop(columns=['Location'], inplace=True)

In [92]:
# df_2021.to_csv(r'datasets\processed_data_2021.csv', index=False)

Xử lý dữ liệu năm 2020

In [93]:
df_2020 = pd.read_csv(r"E:\BTLPTDLL-2025\datasets\aqi_airqualitydata_2020_en.csv")
df_2020.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,17/07/2020,VN,Ho Chi Minh City,temperature,24,26.0,34.0,29.5,59.13
1,21/10/2020,VN,Ho Chi Minh City,temperature,24,26.0,31.0,27.0,26.23
2,7/11/2020,VN,Ho Chi Minh City,temperature,24,25.0,32.5,28.0,69.2
3,22/11/2020,VN,Ho Chi Minh City,temperature,24,24.0,33.5,26.0,102.39
4,10/1/2020,VN,Ho Chi Minh City,temperature,24,23.5,34.0,28.5,125.65


In [94]:
num_rows, num_cols = df_2020.shape
num_rows, num_cols

(16227, 9)

In [95]:
col_drop = ['Country']
df_2020 = df_2020.drop(columns=col_drop)

In [96]:
df_2020 = df_2020.replace('-', np.nan)
df_2020.isna().sum()
df_2020 = df_2020.drop_duplicates()


Tiền xử lý dữ liệu df_2020

In [97]:
df_2020['Date'] = pd.to_datetime(df_2020['Date']).dt.date 

  df_2020['Date'] = pd.to_datetime(df_2020['Date']).dt.date


In [98]:
df_2020['City'] = df_2020['City'].replace({
    'Ha Noi': 'Hà Nội',
    'Ho Chi Minh City': 'Hồ Chí Minh',
    'Hue': 'Huế',
    'Ha Long': 'Hạ Long',
    'Hai Phong': 'Hải Phòng'
})
df_2020 = df_2020.rename(columns={'City': 'Station name'})


In [99]:
df_2020['Specie'] = df_2020['Specie'].replace({
    'wind speed': 'Wind',
    'wind-gust': 'Wind gust',
})
require_cols = ['temperature', 'humidity', 'dew', 'Wind', 'pressure', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi index']
filtered_data = df_2020[df_2020['Specie'].isin(require_cols)]
filtered_data = filtered_data[['Date', 'Station name', 'Specie', 'median']]

pivoted_data = filtered_data.pivot_table(index=['Date', 'Station name'], columns='Specie', values='median', aggfunc='first')
pivoted_data.reset_index(inplace=True)
pivoted_data.to_csv(r"E:\BTLPTDLL-2025\datasets\aqi_aqidata_2020.csv", index=False)

In [100]:
df = pd.read_csv(r"E:\BTLPTDLL-2025\datasets\aqi_aqidata_2020.csv")
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df.sort_index(inplace=True)
df.head()

Unnamed: 0_level_0,Station name,Wind,co,dew,humidity,no2,o3,pm10,pm25,pressure,so2,temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-12-30,Huế,,11.0,,,8.0,11.0,39.0,67.0,,6.0,
2019-12-30,Hà Nội,,0.4,,,14.2,4.0,70.0,154.0,,4.0,
2019-12-30,Hạ Long,,,,,29.0,7.0,63.0,124.0,,,
2019-12-30,Hồ Chí Minh,,,,,,,,89.0,,,
2019-12-31,Huế,,9.0,,,6.0,10.0,10.0,17.0,,5.0,


In [101]:
df = df.replace('-', np.nan)
df.isna().sum()

Station name       0
Wind            1501
co               799
dew               77
humidity          76
no2              671
o3               955
pm10             423
pm25              45
pressure          76
so2             1113
temperature       76
dtype: int64

Sau khi biến đổi bằng pivot_table, ta thấy dữ liệu vẫn còn bị khuyết rất nhiều
Tiến hành xử lí fill dữ liệu bằng median của các cột

In [102]:
df['Station name'] = df['Station name'].apply(classify_region)

In [103]:
df.rename(columns={'co':'CO',
      'no2':'NO2', 'o3':'O3', 'pm10':'PM10',
       'pm25':'PM2.5','so2':'SO2', 'pressure':'Pressure', 'temperature':'Temperature',
       'humidity':'Humidity', 'dew':'Dew',
       }, inplace=True)

In [104]:

numerical_labels = ['CO','NO2','O3', 
                    'PM10','PM2.5','SO2', 'Wind', 'Pressure', 'Temperature', 'Humidity', 'Dew']
df[numerical_labels] = df[numerical_labels].astype('float64')

In [105]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    # Replace missing values with the median of each column in: 'Wind', 'Humidity' and 2 other columns
    df = df.fillna({'Wind': df['Wind'].median(), 'Humidity': df['Humidity'].median(), 'Pressure': df['Pressure'].median(), 'Temperature': df['Temperature'].median()})
    return df

df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0_level_0,Station name,Wind,CO,Dew,Humidity,NO2,O3,PM10,PM2.5,Pressure,SO2,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-12-30,Huế,2.0,11.0,,83.0,8.0,11.0,39.0,67.0,1007.0,6.0,26.0
2019-12-30,Hà Nội,2.0,0.4,,83.0,14.2,4.0,70.0,154.0,1007.0,4.0,26.0
2019-12-30,Hạ Long,2.0,,,83.0,29.0,7.0,63.0,124.0,1007.0,,26.0
2019-12-30,Hồ Chí Minh,2.0,,,83.0,,,,89.0,1007.0,,26.0
2019-12-31,Huế,2.0,9.0,,83.0,6.0,10.0,10.0,17.0,1007.0,5.0,26.0


In [106]:
df[numerical_labels].resample('ME').median()

Unnamed: 0_level_0,CO,NO2,O3,PM10,PM2.5,SO2,Wind,Pressure,Temperature,Humidity,Dew
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-31,4.7,13.3,6.5,63.5,98.5,4.5,,,,,
2020-01-31,5.5,14.0,5.0,31.0,63.0,5.0,2.0,1013.0,22.0,83.0,19.2
2020-02-29,4.0,14.0,5.0,32.0,64.0,4.0,2.0,1012.75,22.0,83.6,18.1
2020-03-31,5.0,13.0,6.0,23.0,46.0,5.0,2.0,1009.0,24.0,86.75,21.5
2020-04-30,4.0,13.0,6.0,18.0,42.0,6.0,,1010.0,24.5,83.0,21.5
2020-05-31,4.5,12.0,9.0,22.0,40.5,6.0,,1006.0,28.5,79.0,25.0
2020-06-30,2.0,8.0,11.5,17.0,21.0,7.1,,1003.0,29.2,80.1,25.0
2020-07-31,3.0,7.7,11.0,18.0,22.5,9.55,,1004.0,29.2,80.1,25.0
2020-08-31,4.0,8.0,9.0,14.0,19.0,10.8,,1003.0,28.0,85.5,25.0
2020-09-30,4.0,7.0,11.5,12.0,26.0,11.0,,1005.25,27.5,86.7,25.0


In [107]:
monthly_mean = df[numerical_labels].resample('ME').median()
monthly_mean = monthly_mean.fillna(method='ffill')
monthly_mean.head()

  monthly_mean = monthly_mean.fillna(method='ffill')


Unnamed: 0_level_0,CO,NO2,O3,PM10,PM2.5,SO2,Wind,Pressure,Temperature,Humidity,Dew
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-31,4.7,13.3,6.5,63.5,98.5,4.5,,,,,
2020-01-31,5.5,14.0,5.0,31.0,63.0,5.0,2.0,1013.0,22.0,83.0,19.2
2020-02-29,4.0,14.0,5.0,32.0,64.0,4.0,2.0,1012.75,22.0,83.6,18.1
2020-03-31,5.0,13.0,6.0,23.0,46.0,5.0,2.0,1009.0,24.0,86.75,21.5
2020-04-30,4.0,13.0,6.0,18.0,42.0,6.0,2.0,1010.0,24.5,83.0,21.5


In [108]:
df.isna().sum()

Station name       0
Wind            1501
CO               799
Dew               77
Humidity          76
NO2              671
O3               955
PM10             423
PM2.5             45
Pressure          76
SO2             1113
Temperature       76
dtype: int64

In [109]:
for index, row in df.iterrows():
    year = index.year
    month = index.month
    last_day = pd.Timestamp(year, month, 1) + pd.offsets.MonthEnd(0)
    if last_day in monthly_mean.index:
        median_row = monthly_mean.loc[last_day]
        for col in numerical_labels:
            if np.isnan(row[col]):
                df.at[index, col] = median_row[col]
df.isna().sum()

Station name    0
Wind            8
CO              0
Dew             8
Humidity        8
NO2             0
O3              0
PM10            0
PM2.5           0
Pressure        8
SO2             0
Temperature     8
dtype: int64

In [110]:
df

Unnamed: 0_level_0,Station name,Wind,CO,Dew,Humidity,NO2,O3,PM10,PM2.5,Pressure,SO2,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-12-30,Huế,,4.7,,,13.3,6.50,63.5,67.0,,4.5,
2019-12-30,Hà Nội,,4.7,,,13.3,6.50,63.5,154.0,,4.5,
2019-12-30,Hạ Long,,4.7,,,13.3,6.50,63.5,124.0,,4.5,
2019-12-30,Hồ Chí Minh,,4.7,,,13.3,6.50,63.5,89.0,,4.5,
2019-12-31,Huế,,4.7,,,13.3,6.50,63.5,17.0,,4.5,
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-20,Huế,2.0,3.0,16.0,93.0,11.5,11.25,32.0,40.0,1017.0,4.8,17.0
2021-01-20,Hà Nội,2.0,3.0,10.0,71.5,11.5,11.25,32.0,142.0,1017.5,4.8,13.0
2021-01-20,Hạ Long,2.0,3.0,12.8,92.4,11.5,11.25,32.0,100.0,1004.2,4.8,13.6
2021-01-20,Hải Phòng,2.0,3.0,12.8,92.4,11.5,11.25,32.0,31.0,1004.2,4.8,13.6


In [111]:
df['AQI index'] = np.full(df.shape[0], np.nan)

In [None]:
aqi_na_rows = df['AQI index'].isna()
df.loc[aqi_na_rows, 'AQI index'] = df[aqi_na_rows].apply(calculate_aqi, axis=1)
df['AQI index']
dominant_pollu_trans_dict = {'pm25': 'PM2.5', 'aqi': 'aqi', 'pm10': 'PM10'}

df['Dominent pollutant'] = np.full(df.shape[0], np.nan).astype('object')
dominant_na_rows = df['Dominent pollutant'].isna()
df.loc[dominant_na_rows, 'Dominent pollutant'] = df[dominant_na_rows].apply(
    specify_dominant_pollutant, axis=1)


In [119]:
df['Status'] =  np.full(df.shape[0], np.nan).astype('object')
df['Status'] = df['AQI index'].apply(status)

In [120]:
df.to_csv(r"E:\BTLPTDLL-2025\datasets\processed_data_2020.csv", index=True)

Kết hợp 2 dataset lại bằng pd.concat

In [115]:
df_processed_1 = pd.read_csv(r"E:\BTLPTDLL-2025\datasets\processed_data_2020.csv")
df_processed_2 = pd.read_csv(r"E:\BTLPTDLL-2025\datasets\processed_data_2021.csv")
df_processed_2.rename(columns={'Data Time S': 'Date'}, inplace=True)
df_processed_2.drop(columns=['Latitude', 'Longitude'], inplace=True)
df_concat = pd.concat([df_processed_1, df_processed_2], ignore_index=True)
df_concat

Unnamed: 0,Date,Station name,Wind,CO,Dew,Humidity,NO2,O3,PM10,PM2.5,Pressure,SO2,Temperature,AQI index,Dominent pollutant,Status
0,2019-12-30,Huế,,4.7,,,13.3,6.5,63.5,67.0,,4.5,,157.0,PM2.5,
1,2019-12-30,Hà Nội,,4.7,,,13.3,6.5,63.5,154.0,,4.5,,204.0,PM2.5,
2,2019-12-30,Hạ Long,,4.7,,,13.3,6.5,63.5,124.0,,4.5,,186.0,PM2.5,
3,2019-12-30,Hồ Chí Minh,,4.7,,,13.3,6.5,63.5,89.0,,4.5,,168.0,PM2.5,
4,2019-12-31,Huế,,4.7,,,13.3,6.5,63.5,17.0,,4.5,,61.0,PM2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3944,2021-11-29,Hà Nội,1.0,11.0,11.5,46.5,40.0,2.0,126.0,168.0,1020.5,2.0,23.5,168.0,PM2.5,Unhealthy
3945,2021-11-29,Hà Nội,1.2,7.0,17.0,85.0,30.0,2.0,68.0,130.0,1021.0,2.0,19.5,65.0,PM2.5,Moderate
3946,2021-11-29,Hà Nội,1.0,20.0,11.0,46.0,37.0,2.0,108.0,162.0,1021.0,2.0,23.0,162.0,PM2.5,Unhealthy
3947,2021-11-29,Hà Nội,1.0,7.0,11.0,46.0,18.0,2.0,76.0,149.0,1021.0,2.0,23.0,149.0,PM2.5,Unhealthy for Sensitive Groups


In [116]:
# df_concat.drop_duplicates(inplace=True)