In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import datetime
import pickle

import ndpretty

import data_prep

In [2]:
# Apply ndpretty's default setup for this notebook.
# NOTE(review): presumably this registers ndpretty's rich display for numpy
# arrays — confirm against the ndpretty documentation.
ndpretty.default()

In [3]:
# Load the Beijing data set through the project-local `data_prep` module.
# Later cells read `data.metadata` and `data.measurements` from the result.
data = data_prep.load_beijing_data()

HBox(children=(FloatProgress(value=0.0, max=87.0), HTML(value='')))


Loaded air quality data from 87 devices. No weather data for ['fangshan', 'miyun', 'yanqing', 'badaling', 'miyunshuiku', 'yufa', 'liulihe', '1042A', '1043A', '1044A', '1045A', '1046A', '1047A', '1048A', '1049A', '1050A', '1067A', '1068A', '1069A', '1070A', '1074A', '1075A', '1076A']


# PM2.5 GNN

In [4]:
# Directory holding the input files of the PM2.5-GNN reference implementation.
data_path = "Previous work/PM2.5-GNN/data/"

# Numeric inputs.
# KnowAir.npy: time × city × measure (17 features, 1 col for PM25)
know_air = np.load(data_path + "KnowAir.npy")
altitude = np.load(data_path + "altitude.npy")

# city.txt is a space-separated table without a header row.
cities = pd.read_csv(data_path + "city.txt", sep=' ', header=None)

# NOTE: unpickling can execute arbitrary code — only load this file from a
# trusted source.
with open(data_path + "city_dict.pkl", 'rb') as city_dict_file:
    city_dict = pickle.load(city_dict_file)

There aren't any NaN values in the data for PM2.5 GNN:

In [5]:
# True as soon as a single entry of the KnowAir array is NaN.
np.any(np.isnan(know_air))

False

For one city, multiple sensors seem to be grouped together.

In [6]:
# Show the entry stored for Beijing in the pickled city dictionary.
city_dict['Beijing']

[['1001A', 116.366, 39.8673, 'Wanshouxigong'],
 ['1002A', 116.17, 40.2865, 'Dingling'],
 ['1003A', 116.434, 39.9522, 'Dongsi'],
 ['1004A', 116.434, 39.8745, 'Tiantan'],
 ['1005A', 116.473, 39.9716, 'Nongzhanguan'],
 ['1006A', 116.361, 39.9425, 'Guanyuan'],
 ['1007A', 116.315, 39.9934, 'Haidianquwanliu'],
 ['1008A', 116.72, 40.1438, 'Shunyixincheng'],
 ['1009A', 116.64399999999999, 40.3937, 'Huairouzhen'],
 ['1010A', 116.23, 40.1952, 'Changpingzhen'],
 ['1011A', 116.40700000000001, 40.0031, 'Aotizhongxin'],
 ['1012A', 116.225, 39.9279, 'Gucheng']]

The centroid of all sensor coordinates seems to be used as the city's location in the computation. However, the PM2.5 measurements within Beijing vary a lot. As we are focusing on Beijing, we probably shouldn't average over the whole city.

### Generate my own input data

#### `city.txt`

In [7]:
# Columns written to the city file: device identifier and its coordinates.
city_columns = ['device_id', 'lon', 'lat']
cities_export = data.metadata[city_columns]

In [8]:
# Write the city table as a space-separated file without a header row; the
# frame's index becomes the first column.
# Let pandas open the target itself: rendering the CSV to a string first and
# writing it through a text-mode handle translates the line endings a second
# time on Windows ("\r\r\n") and uses the locale's encoding instead of UTF-8.
cities_export.to_csv(data_path + 'city_import.txt', sep=' ', header=False)

#### `KnowAir.npy`

This file seems to have the shape `time × city × measure`, where `measure` contains 17 weather features and 1 column for PM2.5.

It includes no timestamps, just the data. Along the `time` dimension, it contains one entry every three hours.

In [9]:
# First and last time stamp that the rows of KnowAir.npy are checked against.
data_start = pd.to_datetime("2015-01-01 00:00")
data_end = pd.to_datetime("2018-12-31 21:00")

covered_span = data_end - data_start
# 1460 days and 21 hours between first and last entry, plus 3 hours so that
# the last entry itself is counted as well.
covered_hours = 1460 * 24 + 21 + 3

# number of rows, covered time span, covered hours, expected 3-hourly steps
know_air.shape[0], pd.Timedelta(covered_span), covered_hours, covered_hours / 3

(11688, Timedelta('1460 days 21:00:00'), 35064, 11688.0)

In [10]:
# Devices to export, grouped by the city they belong to.
rel_devices = [
    'qianmen', 'changping', 'daxing', 'shunyi',  # Beijing
    '1056A',  # Baoding
    # '1069A' (Langfang) is left out: the loader reports no weather data for it
    '1061A',  # Zhangjiakou
    '1037A',  # Tangshan
]

relevant = data.measurements
relevant = relevant[relevant['device_id'].isin(rel_devices)]

# Collect the exported columns under the feature names used for the export.
# The index is reset so that the column labels used below are unambiguous;
# it is replaced by the time stamps at the end.
export_df = pd.DataFrame({
    'device_id': relevant['device_id'],
    'time': relevant['time'],
    '2m_temperature': relevant['temperature'],
    # NOTE(review): assumes `humidity` is a 0–1 fraction turned into percent — confirm
    'relative_humidity+950': relevant['humidity'] * 100,
    'total_precipitation': relevant['precipIntensity'],
    'wind_speed': relevant['windSpeed'],
    'wind_direction': relevant['windBearing'],
    'pm25': relevant['pm25'],
}).reset_index(drop=True)

# drop all times at which at least one sensor has a NaN ...
missing_time = export_df.loc[export_df.isnull().any(axis=1), 'time'].unique()
export_df = export_df[~export_df['time'].isin(missing_time)]

# ... or no row at all: the per-device arrays are stacked positionally later
# on, so every remaining time step has to exist for every remaining device.
devices_per_time = export_df.groupby('time')['device_id'].nunique()
complete_time = devices_per_time[devices_per_time == export_df['device_id'].nunique()].index
export_df = export_df[export_df['time'].isin(complete_time)]

assert len(export_df) > 0, "No entries to export"
assert not export_df.duplicated(['device_id', 'time']).any(), \
    "Multiple entries for the same device and time"

# Same chronological row order for every device, so that the n-th row of each
# device refers to the same time step.
export_df = export_df.sort_values(['device_id', 'time'])
export_df.index = export_df['time']

len(export_df)

116186

In [11]:
# transform from data frame to shape of KnowAir.npy
#
# Pick the exported columns by name instead of slicing off "column 0": whether
# `groupby(...).apply` hands the grouping column to the function depends on
# the pandas version, so a positional slice may silently drop the wrong column.
measure_cols = [col for col in export_df.columns if col != 'device_id']
y = [
    device_rows[measure_cols].to_numpy()
    for _, device_rows in export_df.groupby('device_id')
]

# Stacking only makes sense if every device contributes the same time steps.
assert len({len(device_values) for device_values in y}) == 1, \
    "Devices have differing numbers of time steps"

# time × city × measure
# NOTE(review): the `time` column is still part of the measure axis, which
# makes this an object array — drop it before saving if the target file is
# meant to hold numeric data only.
z = np.stack(y, axis=1)
z

(16598×7×7) object ndarray


interactive(children=(Text(value='[:100, :, 0]', description='Slice:', placeholder='e.g. [:100, :, 0]'), Outpu…

