In [1]:
import pandas as pd
import requests
from config import data as userdata
import os
import time
import numpy as np

# 使用 API 取得實時資料

In [2]:
def getRealTimeData(city='bj', datatype='airquality', day='2018-04-01'):
    """Download one day of readings from biendata into a local CSV file.

    Files are cached under ``./realTimeData``: when the target file already
    exists its path is returned without contacting the server.

    Parameters
    ----------
    city : str
        City code used in the URL and in the file name (this notebook uses
        'bj' and 'ld').
    datatype : str
        Dataset segment of the URL, e.g. 'airquality'.
    day : str
        Date formatted as 'YYYY-MM-DD'; hours 0 through 23 of that day are
        requested.

    Returns
    -------
    str
        Path of the CSV file, or '' when the server did not return usable
        data (the response body is printed in that case).
    """
    datadir = './realTimeData'
    # makedirs(exist_ok=True) avoids the check-then-create race of isdir()+mkdir().
    os.makedirs(datadir, exist_ok=True)

    filename = datadir + "/" + city + '_' + datatype + '_' + day + '.csv'
    if os.path.exists(filename):
        return filename

    # NOTE(review): the last URL segment looks like a personal API token that is
    # hard-coded here -- consider loading it from configuration instead.
    url = 'https://biendata.com/competition/' + datatype + '/' + city + '/' + day + '-0/' + day + '-23/2k0d1d8'
    # Without a timeout a stalled connection would block the notebook forever.
    responses = requests.get(url, timeout=30)
    body = responses.text

    # A body starting with '{' is treated as an error report rather than CSV.
    # HTTP error statuses and empty bodies are rejected as well so that they
    # are never cached on disk and mistaken for real data on the next run.
    if not responses.ok or not body.strip() or body.lstrip().startswith('{'):
        print('error:\n', body)
        return ''

    # Explicit encoding so the cached file reads back identically on any platform.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(body)
    return filename

In [3]:
# Fetch (or reuse the cached copy of) the 2018-04-13 air-quality CSV for the
# default city ('bj') and preview its first rows.
name = getRealTimeData(day='2018-04-13')
print(name)
# getRealTimeData returns '' on failure, in which case read_csv below will raise.
bjdata = pd.read_csv(name)
bjdata.head(5)

./realTimeData/bj_airquality_2018-04-13.csv


Unnamed: 0,id,station_id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
0,2957584,dongsi_aq,2018-04-13 00:00:00,44.0,120.0,74.0,1.7,20.0,32.0
1,2957585,tiantan_aq,2018-04-13 00:00:00,47.0,107.0,76.0,1.8,13.0,16.0
2,2957586,guanyuan_aq,2018-04-13 00:00:00,41.0,110.0,73.0,1.6,27.0,26.0
3,2957587,wanshouxigong_aq,2018-04-13 00:00:00,38.0,128.0,69.0,1.8,13.0,21.0
4,2957588,aotizhongxin_aq,2018-04-13 00:00:00,41.0,120.0,78.0,1.7,25.0,24.0


# 輸出答案

要預測未來兩天(共 48 小時)的 PM2.5、PM10 與 O3 濃度。

> 我想 4/11 送答案就是預測 4/12 ~ 4/13 的空氣品質

總共有 48 個點的數值需要預測

每個點要有 48 個小時的資料

站點包含 北京與倫敦 的點

id 格式:`站點名#小時`(例如 `dongsi_aq#0`)

In [6]:
# Load the submission template and list the distinct stations it covers.
sample = pd.read_csv('sample_submission.csv')
print(sample.shape)

sample_id = sample['test_id']
# Each test_id has the form "<station>#<hour>": keep the station part and
# de-duplicate it in order of first appearance. Doing this with pandas avoids
# rebinding the builtin `id` as a loop variable and the quadratic
# `x not in list` scan.
stations = sample_id.str.split('#').str[0].unique().tolist()
print(len(stations), ' stations')

sample.head()

(2304, 4)
48  stations


Unnamed: 0,test_id,PM2.5,PM10,O3
0,dongsi_aq#0,0,0,0
1,dongsi_aq#1,0,0,0
2,dongsi_aq#2,0,0,0
3,dongsi_aq#3,0,0,0
4,dongsi_aq#4,0,0,0


In [8]:
# Count the distinct stations reported by the real-time API for each city on
# 2018-04-13. One loop replaces two copy-pasted stanzas; after the loop
# `realdata` / `realdata_stations` hold the values of the last city ('ld').
for city in ('bj', 'ld'):
    realdata = pd.read_csv(getRealTimeData(city=city, day='2018-04-13'))['station_id']
    # unique() keeps first-appearance order, like the manual de-duplication did.
    realdata_stations = realdata.unique().tolist()
    print(len(realdata_stations), ' ' + city + ' stations')

35  bj stations
19  ld stations


In [10]:
def injectAnswer(sample, fromDate, answer):
    """Copy observed concentrations from ``answer`` into ``sample`` in place.

    Every row of ``answer`` is mapped to the id ``"<station_id>#<hours>"``,
    where ``hours`` is the number of whole hours between ``fromDate`` and the
    row's ``time``. Rows of ``sample`` whose ``test_id`` equals that id get
    their 'PM2.5', 'PM10' and 'O3' cells overwritten (missing readings are
    copied as NaN). Ids that do not occur in ``sample`` are ignored; when
    ``answer`` contains the same id more than once, the last row wins.

    Parameters
    ----------
    sample : pandas.DataFrame
        Frame with columns 'test_id', 'PM2.5', 'PM10', 'O3'; modified in place.
    fromDate : str
        Start of the forecast window, formatted '%Y-%m-%d %H:%M:%S'.
    answer : pandas.DataFrame
        Frame with columns 'station_id', 'time', 'PM25_Concentration',
        'PM10_Concentration', 'O3_Concentration'.

    Returns
    -------
    None
    """
    formatDate = "%Y-%m-%d %H:%M:%S"
    # Naive pandas timestamps are subtracted on the calendar, so the hour offset
    # does not depend on the machine's time zone or DST rules (time.mktime
    # interprets its argument as local time and could be off by one hour).
    start = pd.to_datetime(fromDate, format=formatDate)
    if answer.empty:
        return

    stamps = pd.to_datetime(answer['time'], format=formatDate)
    # astype(int) truncates toward zero, like int() on the per-row quotient.
    hours = ((stamps - start).dt.total_seconds() / (60 * 60)).astype(int)
    ids = answer['station_id'] + '#' + hours.astype(str)

    # sample column -> answer column
    columns = {
        'PM2.5': 'PM25_Concentration',
        'PM10': 'PM10_Concentration',
        'O3': 'O3_Concentration',
    }

    # One lookup table keyed by id replaces three full scans of `sample` per row.
    lookup = answer[list(columns.values())].copy()
    lookup.index = ids.values
    lookup = lookup[~lookup.index.duplicated(keep='last')]

    hit = sample['test_id'].isin(lookup.index)
    if not hit.any():
        return

    # Rows of `matched` are in the same order as the selected rows of `sample`.
    matched = lookup.reindex(sample.loc[hit, 'test_id'].values)
    for target, source in columns.items():
        sample.loc[hit, target] = matched[source].values

def generateAnswer(fromDate, method):
    """Build a submission from observed data and write it to disk.

    The template 'sample_submission.csv' is loaded, its three pollutant
    columns are blanked, and the real-time readings for the day of
    ``fromDate`` and the following day are injected for cities 'bj' and 'ld'.
    Cells that are still missing are then filled according to ``method``.
    The result is written to './answer.csv' and to
    './answer/answer<MMDD><method>.csv' (MMDD is today's local date).

    Parameters
    ----------
    fromDate : str
        Start of the 48-hour window, formatted '%Y-%m-%d %H:%M:%S'.
    method : str
        'mean' fills gaps with the column mean; 'pad' forward-fills them.

    Returns
    -------
    tuple
        ``(answer, filename)``: the filled DataFrame and the dated file path.

    Raises
    ------
    ValueError
        If ``method`` is neither 'mean' nor 'pad'.
    """
    # Fail before any download instead of hitting an unbound `answer` later.
    if method not in ('mean', 'pad'):
        raise ValueError("method must be 'mean' or 'pad', got %r" % (method,))

    sample = pd.read_csv('sample_submission.csv')
    # Replace the columns outright: writing NaN through .loc into the integer
    # template columns is an incompatible-dtype assignment in recent pandas.
    for column in ('PM2.5', 'PM10', 'O3'):
        sample[column] = np.nan

    formatDate = "%Y-%m-%d %H:%M:%S"
    start = pd.to_datetime(fromDate, format=formatDate)
    # Calendar arithmetic: adding 24*60*60 seconds to a local timestamp can land
    # on the wrong date when a DST change falls inside the window.
    days = [(start + pd.Timedelta(days=offset)).strftime("%Y-%m-%d") for offset in (0, 1)]

    for city in ('bj', 'ld'):
        for day in days:
            realdata = pd.read_csv(getRealTimeData(city=city, day=day))
            injectAnswer(sample, fromDate, realdata)

    if method == 'mean':
        # numeric_only skips the string 'test_id' column, which would otherwise
        # make mean() raise on current pandas.
        answer = sample.fillna(sample.mean(numeric_only=True))
    else:
        # Forward fill; equivalent to the deprecated fillna(method='pad').
        answer = sample.ffill()

    todayDate = time.strftime("%m%d", time.localtime(time.time()))

    answer.to_csv('./answer.csv', index=False, header=True)

    datadir = './answer'
    os.makedirs(datadir, exist_ok=True)

    filename = datadir + '/answer' + todayDate + method + '.csv'
    answer.to_csv(filename, index=False, header=True)

    return answer, filename

In [12]:
# Start of the 48-hour window, in the '%Y-%m-%d %H:%M:%S' format that
# generateAnswer expects.
fromDate = '2018-04-15 00:00:00'

# Build the submission, filling cells without observed data with the column mean.
answer,filename = generateAnswer(fromDate, 'mean')

answer.head()

Unnamed: 0,test_id,PM2.5,PM10,O3
0,dongsi_aq#0,23.0,104.0,32.0
1,dongsi_aq#1,17.0,86.0,55.0
2,dongsi_aq#2,16.0,57.0,72.0
3,dongsi_aq#3,10.0,35.0,87.0
4,dongsi_aq#4,11.0,24.0,87.0


# 思考策略

先考慮如何建立模型

輸入是歷史資料, 輸出則是兩天的未來預測

接著考慮如何把模型參數訓練出來

把一年的歷史資料 分成訓練與測試



# SMAPE

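計分方式:SMAPE(Symmetric Mean Absolute Percentage Error),其中 $A_t$ 為實際值、$F_t$ 為預測值:

$$\mathrm{SMAPE} = \frac{1}{n}\sum_{t=1}^{n}\frac{|F_t - A_t|}{(|A_t| + |F_t|)/2}$$

當 $A_t$ 與 $F_t$ 皆為 0 時,該項以 0 計。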

In [59]:
def SMAPE(actual=(), forecast=()):
    """Symmetric mean absolute percentage error of ``forecast`` against ``actual``.

    Each pair contributes ``|f - a| / ((|f| + |a|) / 2)``; a pair in which both
    values are 0 contributes 0. The result is the mean of the contributions.

    Parameters
    ----------
    actual, forecast : sized iterables of numbers
        Observed and predicted values, paired by position.

    Returns
    -------
    number
        The score. 2 (the largest value the formula can produce) is returned
        when the two inputs differ in length, and 0 when both are empty.
    """
    if len(actual) != len(forecast):
        return 2
    # Nothing to score: avoid dividing by zero below.
    if len(actual) == 0:
        return 0

    total = 0
    # zip pairs values by position, so label-indexed containers such as a
    # pandas Series work as well as plain lists.
    for observed, predicted in zip(actual, forecast):
        scale = (abs(predicted) + abs(observed)) / 2
        if scale != 0:
            total += abs(predicted - observed) / scale

    return total / len(actual)

In [61]:
# Sanity check: the per-pair terms are 0, 2/3 and 1, so the mean is 5/9.
SMAPE([1,2,3],[1,1,1])

0.5555555555555555