In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
import xgboost as xgb

In [2]:
data = pd.read_csv("新竹_2020.csv", encoding = 'gb18030')
data = data.iloc[1:,:]

#擷取月份出來
all_time_month = []
for i in range(data.shape[0]): 
    str1 = data.iloc[i,1]
    time_month = int(str1.split("/")[1])
    all_time_month.append(time_month)
    
#合併到原資料
all_time_month = pd.DataFrame(all_time_month,columns = ["month"])
all_time_month.index = all_time_month.index + 1
data_month = pd.concat([data, all_time_month], axis = 1)

In [3]:
#擷取10、11、12月份資料
data_extract = pd.DataFrame()
for i in (10,11,12):
    for j in (np.where(data_month["month"] == i)):
        data_extract = pd.concat([data_extract, data_month.iloc[j][:]])
data_extract_month = data_extract["month"]
data_extract = data_extract.iloc[:, 2:data_extract.shape[1]-1]

In [4]:
#補遺失值 將字串空格消除
for i in range(data_extract.shape[1]):
    for j in range(data_extract.shape[0]):
        data_extract.iloc[j,i] = data_extract.iloc[j,i].split(" ")[0]

In [5]:
#1代表是遺失值
def check_miss(row, col):
    flag = 0
    miss = ["#", "*", "x", "A", " "]
    for i in range(len(miss)):
        if data_extract.iloc[row,col] == miss[i] :
            flag = 1
            break
    return flag

In [6]:
#全部都是遺失值的都變0
for i in range(data_extract.shape[0]):
    total = 0
    for j in range(1,data_extract.shape[1]):
        if (check_miss(i,j) == 1):
            total += 1
    if total == data_extract.shape[1] - 1:
        for j in range(1,data_extract.shape[1]):
            data_extract.iloc[i,j] = 0

In [7]:
for i in range(data_extract.shape[0]):
    for j in range(1,data_extract.shape[1]):
        if check_miss(i,j) == 1:
            if j == 1 : #第一行 從第二行開始看如果後面還是遺失值就一直往後
                count = 2
                while(count < data_extract.shape[1] and check_miss(i,count) == 1):
                    count += 1
                data_extract.iloc[i,j] = data_extract.iloc[i, count]
            elif j == data_extract.shape[1] - 1: #代表最後一行
                count = data_extract.shape[1] - 2
                while(count > 1 and check_miss(i,count) == 1):
                    count -= 1
                data_extract.iloc[i,j] = data_extract.iloc[i, count]
            else: #代表中間的行
                front = j - 1
                back = j + 1
                while(check_miss(i, front) == 1):
                    front -= 1
                while(back < data_extract.shape[1] and check_miss(i, back) == 1): #要有保護機制不能超過最後一行
                    back += 1
                if (back == data_extract.shape[1]):
                    data_extract.iloc[i,j] = data_extract.iloc[i,front]
                else :
                    data_extract.iloc[i,j] = (float(data_extract.iloc[i,front]) + float(data_extract.iloc[i,back])) / 2.0
                    

In [8]:
#把全部值都轉成數字
data_final = pd.concat([data_extract, data_extract_month], axis = 1)
for i in range(data_extract.shape[0]):
    for j in range(1,data_extract.shape[1]):
        data_final.iloc[i,j] = float(data_final.iloc[i,j])
        

In [9]:
#切資料
month12 = np.where(data_final['month'] == 12)[0][0] #12月的初始位置
data_train = data_final.iloc[:month12, :data_final.shape[1] - 1]
data_test = data_final.iloc[month12:, :data_final.shape[1] - 1]

In [10]:
#train data的轉置
data_train_trans = data_train.iloc[0:18,:]
data_train_trans = data_train_trans.reset_index().drop(['index'],axis = 1)
times = int(data_train.shape[0] / 18) - 1

for i in range(times):
    start = 18
    data_temp = data_train.iloc[start:start + 18, 1:]
    data_temp = data_temp.reset_index().drop(['index'],axis = 1)
    data_train_trans = pd.concat([data_train_trans, data_temp], axis = 1)
    start += 18
data_train_trans = data_train_trans.iloc[:,1:]

In [11]:
#test data的轉置
data_test_trans = data_test.iloc[0:18,:]
data_test_trans = data_test_trans.reset_index().drop(['index'],axis = 1)
times = int(data_test.shape[0] / 18) - 1

for i in range(times):
    start = 18
    data_temp = data_test.iloc[start:start + 18, 1:]
    data_temp = data_temp.reset_index().drop(['index'],axis = 1)
    data_test_trans = pd.concat([data_test_trans, data_temp], axis = 1)
    start += 18
data_test_trans = data_test_trans.iloc[:,1:]

In [12]:
#第一種資料及 6小時
first_x = []
first_y = []
first_test_x = []
first_test_y = []

for i in range(1458):
    first_x.append(data_train_trans.iloc[:,i:i+6])
    first_y.append(data_train_trans.iloc[:,i+6])
    
for i in range(738):
    first_test_x.append(data_test_trans.iloc[:,i:i+6])
    first_test_y.append(data_test_trans.iloc[:,i+6])

#第二種資料及 11小時
second_x = []
second_y = []
second_test_x = []
second_test_y = []

for i in range(1453):
    second_x.append(data_train_trans.iloc[:,i:i+6])
    second_y.append(data_train_trans.iloc[:,i+11])
    
for i in range(733):
    second_test_x.append(data_test_trans.iloc[:,i:i+6])
    second_test_y.append(data_test_trans.iloc[:,i+11])

In [13]:
#預測未來6小時 只有pm2.5
pm25_train_x_6 = []
for i in range(len(first_x)):
    pm25_train_x_6.append(first_x[i].iloc[9])
    
pm25_train_y_6 = []
for i in range(len(first_y)):
    pm25_train_y_6.append(first_y[i].iloc[9])
    
pm25_test_x_6 = []
for i in range(len(first_test_x)):
    pm25_test_x_6.append(first_test_x[i].iloc[9])
    
pm25_test_y_6 = []
for i in range(len(first_test_y)):
    pm25_test_y_6.append(first_test_y[i].iloc[9])
    
#----------------------------------------------------  
#預測未來11小時 只有pm2.5

pm25_train_x_11 = []
for i in range(len(second_x)):
    pm25_train_x_11.append(second_x[i].iloc[9])
    
pm25_train_y_11 = []
for i in range(len(second_y)):
    pm25_train_y_11.append(second_y[i].iloc[9])
    
pm25_test_x_11 = []
for i in range(len(second_test_x)):
    pm25_test_x_11.append(second_test_x[i].iloc[9])
    
pm25_test_y_11 = []
for i in range(len(second_test_y)):
    pm25_test_y_11.append(second_test_y[i].iloc[9])
    
#---------------------------------------------------- 
#預測未來6小時 18種屬性

all_train_x_6 = []
for i in range(len(first_x)):
    for j in range(18):
        all_train_x_6.append(first_x[i].iloc[j])
        
all_train_y_6 = []
for i in range(len(first_y)):
    for j in range(18):
        all_train_y_6.append(first_y[i].iloc[j])
    
all_test_x_6 = []
for i in range(len(first_test_x)):
    for j in range(18):
        all_test_x_6.append(first_test_x[i].iloc[j])
    
all_test_y_6 = []
for i in range(len(first_test_y)):
    for j in range(18):
        all_test_y_6.append(first_test_y[i].iloc[j])
        
#----------------------------------------------------   
#預測未來11小時 18種屬性

all_train_x_11 = []
for i in range(len(second_x)):
    for j in range(18):
        all_train_x_11.append(second_x[i].iloc[j])
        
all_train_y_11 = []
for i in range(len(second_y)):
    for j in range(18):
        all_train_y_11.append(second_y[i].iloc[j])
    
all_test_x_11 = []
for i in range(len(second_test_x)):
    for j in range(18):
        all_test_x_11.append(second_test_x[i].iloc[j])
    
all_test_y_11 = []
for i in range(len(second_test_y)):
    for j in range(18):
        all_test_y_11.append(second_test_y[i].iloc[j])


In [14]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(pm25_train_x_6, pm25_train_y_6)
reg_pm25_predicted = reg.predict(pm25_test_x_6)
reg_pm25_mae = np.mean(abs(reg_pm25_predicted - pm25_test_y_6))
print("predict the after sixth hours pm2.5 MAE (LinearRegression) : ",reg_pm25_mae)

predict the after sixth hours pm2.5 MAE (LinearRegression) :  3.7218509428225457


In [15]:
reg = LinearRegression().fit(pm25_train_x_11, pm25_train_y_11)
reg_pm25_predicted = reg.predict(pm25_test_x_11)
reg_pm25_mae = np.mean(abs(reg_pm25_predicted - pm25_test_y_11))
print("predict the after eleven hours pm2.5 MAE (LinearRegression) : ",reg_pm25_mae)

predict the after eleven hours pm2.5 MAE (LinearRegression) :  13.028377521578781


In [16]:
reg = LinearRegression().fit(all_train_x_6, all_train_y_6)
reg_all_predicted = reg.predict(all_test_x_6)
reg_all_mae = np.mean(abs(reg_all_predicted - all_test_y_6))
print("predict the after sixth hours all attributes MAE (LinearRegression) : ",reg_all_mae)

predict the after sixth hours all attributes MAE (LinearRegression) :  1.7032120247145188


In [17]:
reg = LinearRegression().fit(all_train_x_11, all_train_y_11)
reg_all_predicted = reg.predict(all_test_x_11)
reg_all_mae = np.mean(abs(reg_all_predicted - all_test_y_11))
print("predict the after eleven hours all attributes MAE (LinearRegression) : ",reg_all_mae)

predict the after eleven hours all attributes MAE (LinearRegression) :  3.142246215361692


In [18]:
from xgboost import XGBClassifier
xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
xgboostModel.fit(pm25_train_x_6, pm25_train_y_6)
xgb_pm25_predicted = xgboostModel.predict(pm25_test_x_6)
xgb_pm25_mae = np.mean(abs(xgb_pm25_predicted - pm25_test_y_6))
print("predict the after sixth hours pm2.5 MAE (XGboost): ",xgb_pm25_mae)



predict the after sixth hours pm2.5 MAE (XGboost):  3.484417344173442


In [19]:
xgboostModel.fit(pm25_train_x_11, pm25_train_y_11)
xgb_pm25_predicted = xgboostModel.predict(pm25_test_x_11)
xgb_pm25_mae = np.mean(abs(xgb_pm25_predicted - pm25_test_y_11))
print("predict the after eleven hours pm2.5 MAE (XGboost): ",xgb_pm25_mae)

predict the after eleven hours pm2.5 MAE (XGboost):  7.014324693042292


In [20]:
xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
xgboostModel.fit(all_train_x_6, all_train_y_6)
xgb_all_predicted = xgboostModel.predict(all_test_x_6)
xgb_all_mae = np.mean(abs(xgb_all_predicted - all_test_y_6))
print("predict the after sixth hours all attributes MAE (XGboost): ",xgb_all_mae)

predict the after sixth hours all attributes MAE (XGboost):  10.130837991192413


In [21]:
xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
xgboostModel.fit(all_train_x_11, all_train_y_11)
xgb_all_predicted = xgboostModel.predict(all_test_x_11)
xgb_all_mae = np.mean(abs(xgb_all_predicted - all_test_y_11))
print("predict the after eleven hours all attributes MAE (XGboost): ",xgb_all_mae)

predict the after eleven hours all attributes MAE (XGboost):  15.570065039032892
