In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the raw training table. The CSV is read with Big5 encoding
# (its headers contain Traditional Chinese text such as 測項).
data_init = pd.read_csv('data/train.csv', encoding='big5')

In [2]:
# Inspect the RAINFALL rows: collect every distinct value they contain.
# Each day occupies 18 consecutive rows; RAINFALL is the row at offset 10
# within a day, and the readings start at column 3.
days = len(data_init) // 18

# Strided selection: rows 10, 28, 46, ... limited to the complete days.
rainfall_rows = data_init.iloc[10 : days * 18 : 18, 3:]
RAINFALL_set = set(rainfall_rows.to_numpy().ravel())
print(RAINFALL_set)

{'1.8', '56', '8.4', '3.4', '1.2', '7.8', '3.2', '27', '7.6', '4', '2.6', 'NR', '4.6', '17', '7.4', '9.2', '0.8', '38', '7', '23', '20', '2.8', '0', '1.4', '3', '5', '6.4', '1.6', '0.2', '0.6', '0.4', '7.2', '10', '12', '1', '2.2', '3.6', '18', '14', '6.8', '3.8', '13', '2', '4.8', '2.4', '4.2', '5.4', '9.8', '74', '21', '15', '19', '66', '11', '8.2', '8.6'}


In [3]:
# Keep only the reading columns (column 3 onward) and replace the 'NR'
# marker with 0, then expose the result as a plain NumPy array.
data = data_init.iloc[:, 3:].replace('NR', 0)
raw_data = data.to_numpy()
raw_data

array([['14', '14', '14', ..., '15', '15', '15'],
       ['1.8', '1.8', '1.8', ..., '1.8', '1.8', '1.8'],
       ['0.51', '0.41', '0.39', ..., '0.35', '0.36', '0.32'],
       ...,
       ['36', '55', '72', ..., '118', '100', '105'],
       ['1.9', '2.4', '1.9', ..., '1.5', '2', '2'],
       ['0.7', '0.8', '1.8', ..., '1.6', '1.8', '2']], dtype=object)

In [4]:
# Show the measurement name (column 2) for each of the first day's 18 rows.
data_init.iloc[:18, 2]

0       AMB_TEMP
1            CH4
2             CO
3           NMHC
4             NO
5            NO2
6            NOx
7             O3
8           PM10
9          PM2.5
10      RAINFALL
11            RH
12           SO2
13           THC
14         WD_HR
15    WIND_DIREC
16    WIND_SPEED
17         WS_HR
Name: 測項, dtype: object

In [5]:
# Regroup the data by month: 12 arrays, each of shape 18 x (20 * 24),
# i.e. 18 measurements by 20 days of 24 hourly readings laid side by side.
month_data = []
rows_per_month = 18 * 20
for month in range(12):
    first_row = rows_per_month * month
    month_block = raw_data[first_row : first_row + rows_per_month, :]
    # (day, measurement, hour) -> (measurement, day, hour)
    # -> (measurement, day * 24 + hour), converted to float64.
    by_measurement = month_block.reshape(20, 18, 24).transpose(1, 0, 2)
    sample = by_measurement.reshape(18, 480).astype(float)
    month_data.append(sample)
print(month_data[0].shape)

(18, 480)


In [6]:
# Slide a 10-hour window over each month: the first 9 hours of all 18
# measurements form one row of x, and the row-9 (PM2.5) value of the 10th
# hour is the matching y. Each month yields 471 windows of 18 * 9 features.
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    readings = month_data[month]
    for start in range(471):
        # Window covers columns [start, target); target is the hour to predict.
        target = start + 9
        sample_idx = month * 471 + start
        x[sample_idx, :] = readings[:, start:target].ravel()
        y[sample_idx, 0] = readings[9, target]

print(x)
print(y)

[[14.  14.  14.  ...  2.   2.   0.5]
 [14.  14.  13.  ...  2.   0.5  0.3]
 [14.  13.  12.  ...  0.5  0.3  0.8]
 ...
 [17.  18.  19.  ...  1.1  1.4  1.3]
 [18.  19.  18.  ...  1.4  1.3  1.6]
 [19.  18.  17.  ...  1.3  1.6  1.8]]
[[30.]
 [41.]
 [44.]
 ...
 [17.]
 [24.]
 [29.]]
