In [1]:
import requests
import pandas as pd
import numpy as np
import pickle

In [2]:
def save_pkl(file_dir, data):
    """Serialize `data` to the file at path `file_dir` using pickle protocol 4.

    Parameters
    ----------
    file_dir : str or path-like
        Destination file path (despite the name, this is a file, not a directory).
    data : object
        Any picklable Python object.
    """
    # The context manager closes the handle even if pickle.dump raises
    # (e.g. on an unpicklable object), so no file descriptor is leaked.
    with open(file_dir, "wb") as f:
        pickle.dump(data, f, protocol=4)
    
def read_pkl(file_dir):
    """Load and return the object pickled in the file at path `file_dir`.

    Parameters
    ----------
    file_dir : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only use this on files from a trusted source.
    # The context manager guarantees the handle is closed after loading
    # (the previous version never closed it).
    with open(file_dir, "rb") as f:
        return pickle.load(f)

In [3]:
# Load the station data. The code below treats it as a mapping from station name
# to a table whose columns include the pollutant and feature columns.
# NOTE: this unpickles the file, so only load a station_final.pkl you trust.
station_group = read_pkl("London/station_final.pkl")

In [4]:
# Build the flattened Y: for every time step, the pollutant values over the following 48 hours.
# This is the first step of converting station_group (station_final.pkl) into X / Y arrays (.npy).
def flatten_Y(features, ratio = 1, stations = list(station_group.keys()), start = 0):
    """Build per-row feature vectors and their 48-step-ahead pollutant targets.

    For every selected station, rows ``start .. length-1`` are used, where
    ``length`` is ``int(9000 * ratio)`` when ``start == 0`` and ``10806``
    otherwise. For each such row ``i`` the targets are the pollutant values at
    rows ``i+1 .. i+48``.

    Parameters
    ----------
    features : list of str
        Column names, in output order, that make up each X row.
    ratio : float
        Fraction of the first 9000 rows to use; only applied when ``start == 0``.
    stations : container of str
        Station names to include. The default is evaluated once, when this
        function is defined, from the global ``station_group``.
    start : int
        First row index to use; any non-zero value switches the end index to 10806.

    Returns
    -------
    X : ndarray, shape (n_samples, len(features))
    Y_PM25, Y_PM10, Y_NO2 : ndarray, shape (n_samples, 48)
        Rows are ordered station by station, in ``station_group`` order.

    Raises
    ------
    ValueError
        If the row range is empty, no station is selected, or a station has too
        few rows to provide a full 48-step window for every selected row.
    """
    horizon = 48  # number of future time steps predicted per row

    if(start == 0):
        length = int(9000 * ratio)
    else:
        length = 10806

    # An empty or negative range previously surfaced as an opaque np.vstack error.
    if start < 0 or length <= start:
        raise ValueError(
            "empty row range: start=%d, end=%d (check `ratio` and `start`)" % (start, length)
        )

    Y_PM25_list = []
    Y_PM10_list = []
    Y_NO2_list = []
    X_list = []

    for name in station_group:
        if name not in stations:
            continue

        frame = station_group[name]
        PM25_list = frame['PM25_Concentration'].values
        PM10_list = frame['PM10_Concentration'].values
        NO2_list = frame['NO2_Concentration'].values

        # Every row i < length needs rows i+1 .. i+horizon to exist; otherwise the
        # target windows are ragged and X / Y can no longer be aligned.
        if len(PM25_list) < length + horizon:
            raise ValueError(
                "station %r has %d rows but %d are required (end index %d + %d-step horizon)"
                % (name, len(PM25_list), length + horizon, length, horizon)
            )

        X_list.append(frame[features].values[start: length])  # features matrix
        for i in range(start, length):
            Y_PM25_list.append(PM25_list[i + 1 : i + 1 + horizon])
            Y_PM10_list.append(PM10_list[i + 1 : i + 1 + horizon])
            Y_NO2_list.append(NO2_list[i + 1 : i + 1 + horizon])

    if not X_list:
        raise ValueError("none of the requested stations are present in station_group")

    Y_PM25 = np.vstack(Y_PM25_list)
    Y_PM10 = np.vstack(Y_PM10_list)
    Y_NO2 = np.vstack(Y_NO2_list)
    X = np.vstack(X_list)
    return X, Y_PM25, Y_PM10, Y_NO2

# One-hot encode the forecast step (which of the next 48 hours a row refers to) and append it to the features.
# This is the second step of converting station_group (station_final.pkl) into X / Y arrays (.npy).
def onehot_48_labels(X, Y_PM25, Y_PM10, Y_NO2):
    """Expand each sample into one row per forecast step, tagged with a one-hot step code.

    Every row of ``X`` is repeated once per column ``j`` of ``Y_PM25`` and
    extended with a 48-wide one-hot vector whose ``j``-th entry is 1. The target
    matrices are flattened row by row so that output row ``i * horizon + j``
    pairs sample ``i`` with forecast step ``j``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
    Y_PM25, Y_PM10, Y_NO2 : ndarray, shape (n_samples, horizon), horizon <= 48

    Returns
    -------
    X_out : ndarray, shape (n_samples * horizon, n_features + 48)
    Y_PM25_out, Y_PM10_out, Y_NO2_out : ndarray, shape (n_samples * horizon,)

    Raises
    ------
    IndexError
        If ``horizon`` exceeds 48 or a target matrix is smaller than
        ``(n_samples, horizon)``.
    """
    onehot_width = 48  # fixed width of the step encoding appended to X
    n_samples, n_features = X.shape[0], X.shape[1]
    horizon = Y_PM25.shape[1]

    if horizon > onehot_width:
        raise IndexError(
            "Y_PM25 has %d columns but the one-hot encoding only has %d slots"
            % (horizon, onehot_width)
        )
    for label, Y in (("Y_PM25", Y_PM25), ("Y_PM10", Y_PM10), ("Y_NO2", Y_NO2)):
        if Y.shape[0] < n_samples or Y.shape[1] < horizon:
            raise IndexError(
                "%s has shape %r, expected at least (%d, %d)"
                % (label, Y.shape, n_samples, horizon)
            )

    # Fill a preallocated (sample, step, column) cube by broadcasting instead of
    # building one small array per output row in a Python loop.
    out_dtype = np.result_type(X.dtype, np.float64)
    cube = np.empty((n_samples, horizon, n_features + onehot_width), dtype=out_dtype)
    cube[:, :, :n_features] = X[:, np.newaxis, :]
    cube[:, :, n_features:] = np.eye(onehot_width)[:horizon]
    X_out = cube.reshape(n_samples * horizon, n_features + onehot_width)

    def _flatten(Y):
        # Row-major flattening matches the (sample, step) order of X_out;
        # .flatten() copies, so the result never aliases the caller's array.
        return Y[:n_samples, :horizon].flatten()

    return X_out, _flatten(Y_PM25), _flatten(Y_PM10), _flatten(Y_NO2)

In [5]:
def generate_train_data (features, k=1, name = 's', start = 0):
    """Build the X / Y arrays for a fixed set of stations and save them as .npy files.

    Runs flatten_Y followed by onehot_48_labels, prints the resulting shapes,
    and writes four files under ``London/``: ``X_<name>.npy``,
    ``Y_NO2_<name>.npy``, ``Y_PM10_<name>.npy`` and ``Y_PM25_<name>.npy``.

    Parameters
    ----------
    features : list of str
        Feature column names, forwarded to flatten_Y.
    k : float
        Forwarded to flatten_Y as ``ratio``.
    name : str
        Suffix used in the output file names.
    start : int
        Forwarded to flatten_Y as ``start``.
    """
    station_ids = ['BL0','CD1','CD9','GN0','GN3','GR4','GR9','HV1','KF1','LW2','MY7','ST5','TH4']

    # Step 1: per-row features and 48-step targets; step 2: one row per forecast step.
    flat = flatten_Y(features, ratio = k, stations = station_ids, start = start)
    X, Y_PM25, Y_PM10, Y_NO2 = onehot_48_labels(*flat)
    print('Generated data shape: ',X.shape, Y_PM25.shape,Y_PM10.shape,Y_NO2.shape)

    # Save all of the X / Y arrays as .npy files, in the same order as before.
    targets = (
        ("London/X_", X),
        ("London/Y_NO2_", Y_NO2),
        ("London/Y_PM10_", Y_PM10),
        ("London/Y_PM25_", Y_PM25),
    )
    for prefix, array in targets:
        np.save(prefix+name+".npy", array)
    print('Finish creating and saving the '+name+' version of X, Y_NO2, Y_PM10, and Y_PM25 data')

In [6]:
# To leave features out of the generated X / Y files (.npy), delete the corresponding names from the list below.
# Order matters: [a, b] differs from [b, a], so this list also controls the column order of the output X.

features = ['PM25_Concentration','PM10_Concentration','NO2_Concentration', # feature 0-2
            'temperature','pressure','humidity','wind_speed', 'wind_direction', # feature 3-7
            'te_0','te_1','te_2','te_3','te_4','te_5','te_6','te_7','te_8','te_9','te_10','te_11','te_12','te_13','te_14','te_15', # feature 8-23
            'pr_0','pr_1','pr_2','pr_3','pr_4','pr_5','pr_6','pr_7','pr_8','pr_9','pr_10','pr_11','pr_12','pr_13','pr_14','pr_15', # feature 24-39
            'hu_0','hu_1','hu_2','hu_3','hu_4','hu_5','hu_6','hu_7','hu_8','hu_9','hu_10','hu_11','hu_12','hu_13','hu_14','hu_15', # feature 40-55
            'wd_0','wd_1','wd_2','wd_3','wd_4','wd_5','wd_6','wd_7','wd_8','wd_9','wd_10','wd_11','wd_12','wd_13','wd_14','wd_15', # feature 56-71
            'holiday','time_month','time_week','time_day','time_hour'] # feature 72-76

In [None]:
# Generate the testing dataset of X / Y data (.npy).
# A non-zero start makes flatten_Y use rows 9000..10805 of every station (its end index is fixed at 10806),
# whereas the training runs below (k <= 1) only use rows before 9000.
# NOTE(review): the 48-hour target windows of the last training rows still reach into rows 9000+.
# The generated file names are X_test.npy, Y_NO2_test.npy, etc.
generate_train_data(features, name = 'test', start = 9000)

In [7]:
# Generate a small-sized version of the X / Y data (.npy).
# k = 0.01 keeps the first int(9000 * 0.01) = 90 rows of each station.
# The generated file names are X_s.npy, Y_NO2_s.npy, etc.
# NOTE(review): the recorded output below (68016 rows) exceeds 13 stations x 90 rows x 48 steps = 56160,
# so it was presumably produced by an earlier version of the code; re-run to refresh it.
generate_train_data(features, k = 0.01, name = 's')

Generated data shape:  (68016, 125) (68016,) (68016,) (68016,)
Finish creating and saving the s version of X, Y_NO2, Y_PM10, and Y_PM25 data


In [8]:
# You can also generate a small-sized version of the X / Y data (.npy) with some features removed.

In [9]:
# Generate a medium-sized version of the X / Y data (.npy).
# k = 0.1 keeps the first int(9000 * 0.1) = 900 rows of each station.
# The generated file names are X_m.npy, Y_NO2_m.npy, etc.
# NOTE(review): the recorded output below (674544 rows) exceeds 13 stations x 900 rows x 48 steps = 561600,
# so it was presumably produced by an earlier version of the code; re-run to refresh it.
generate_train_data(features, k = 0.1, name = 'm')

Generated data shape:  (674544, 125) (674544,) (674544,) (674544,)
Finish creating and saving the m version of X, Y_NO2, Y_PM10, and Y_PM25 data


In [10]:
# You can also generate a medium-sized version of the X / Y data (.npy) with some features removed.

In [11]:
# Generate the full-sized version of the X / Y data (.npy).
# k = 1 keeps the first 9000 rows of each station.
# The generated file names are X_all.npy, Y_NO2_all.npy, etc.
# NOTE(review): the recorded output below (6742944 rows) exceeds 13 stations x 9000 rows x 48 steps = 5616000,
# so it was presumably produced by an earlier version of the code; re-run to refresh it.
generate_train_data(features, k = 1, name = 'all')

Generated data shape:  (6742944, 125) (6742944,) (6742944,) (6742944,)
Finish creating and saving the all version of X, Y_NO2, Y_PM10, and Y_PM25 data


In [None]:
# To generate X / Y files of a different size, change the ratio argument `k`.
# For example, calling generate_train_data with k = 0.05 produces X / Y files built from 5% of the training rows.