In [260]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
import math
import datetime
from sklearn.preprocessing import LabelEncoder

In [325]:
train_data = pd.read_csv('保定2016年.csv')
test_data = pd.read_csv('石家庄20160701-20170701.csv')

In [326]:
train_data.head()

Unnamed: 0,日期,AQI,质量等级,PM2.5,PM10,SO2,CO,NO2,O3_8h,IPRC
0,2016/1/1,293,重度污染,243,324,122,6.1,149,12,2.088378
1,2016/1/2,430,严重污染,395,517,138,7.5,180,18,3.316942
2,2016/1/3,332,严重污染,282,405,72,6.3,130,10,2.516425
3,2016/1/4,204,重度污染,154,237,73,3.5,72,34,1.505693
4,2016/1/5,169,中度污染,128,186,99,3.2,66,39,1.210233


In [327]:
quality_le = LabelEncoder()
quality_le.fit(train_data['质量等级'].values)
train_data['质量等级'] = quality_le.transform(train_data['质量等级'].values) 
quality_le.fit(test_data['质量等级'].values)
test_data['质量等级'] = quality_le.transform(test_data['质量等级'].values) 

In [328]:
train_data.head()

Unnamed: 0,日期,AQI,质量等级,PM2.5,PM10,SO2,CO,NO2,O3_8h,IPRC
0,2016/1/1,293,5,243,324,122,6.1,149,12,2.088378
1,2016/1/2,430,0,395,517,138,7.5,180,18,3.316942
2,2016/1/3,332,0,282,405,72,6.3,130,10,2.516425
3,2016/1/4,204,5,154,237,73,3.5,72,34,1.505693
4,2016/1/5,169,1,128,186,99,3.2,66,39,1.210233


In [329]:
x_train = np.array(train_data.values[:, 1:-1]).astype('float64')
y_train = np.array(train_data.values[:, -1]).astype('float64')
x_test = np.array(test_data.values[:, 1:]).astype('float64')

In [330]:
x_train

array([[293. ,   5. , 243. , ...,   6.1, 149. ,  12. ],
       [430. ,   0. , 395. , ...,   7.5, 180. ,  18. ],
       [332. ,   0. , 282. , ...,   6.3, 130. ,  10. ],
       ...,
       [328. ,   0. , 278. , ...,   4.5,  96. ,  13. ],
       [279. ,   5. , 229. , ...,   3.7,  92. ,  11. ],
       [377. ,   0. , 327. , ...,   6.7, 117. ,   9. ]])

In [331]:
mean = np.mean(x_train, axis = 0) 
std = np.std(x_train, axis = 0)
for i in range(x_train.shape[0]):
    for j in range(x_train.shape[1]):
        if not std[j] == 0 :
            x_train[i][j] = (x_train[i][j]- mean[j]) / std[j]

In [332]:
# mean = np.mean(x_train, axis = 0) 
# std = np.std(x_train, axis = 0)
for i in range(x_test.shape[0]):
    for j in range(x_test.shape[1]):
        if not std[j] == 0 :
            x_test[i][j] = (x_test[i][j]- mean[j]) / std[j]

In [221]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [333]:
x_train[0]

array([ 2.17168779,  1.46391351,  2.17122922,  1.97048821,  2.97623229,
        3.31018235,  3.25580701, -1.4521449 ])

In [224]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [184]:
import xgboost as xgb
from sklearn.model_selection  import train_test_split
from sklearn.metrics import mean_squared_error

In [279]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)
dtrain

<xgboost.core.DMatrix at 0x7f83ca8b5710>

In [280]:
# XGBoost模型构建
# 1. 参数构建
params = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:linear'}
num_round = 2
# 2. 模型训练
bst = xgb.train(params, dtrain, num_round)
# 3. 模型保存
# bst.save_model('xgb.model') 

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [281]:
y_pred = bst.predict(dtest)
# print(mean_squared_error(y_test, y_pred))

In [282]:
y_pred.shape

(366,)

In [283]:
y_pred[0]

0.43104538

In [334]:
model = keras.Sequential(
    [
        layers.Dense(64, activation='relu', name='layer1'),
        layers.Dense(128, activation='relu', name='layer2'),
        layers.Dense(128, activation='relu', name='layer3'),
        layers.Dense(64, activation='relu', name='layer4'),
        layers.Dense(32, activation='relu', name='layer5'),
        layers.Dense(1, activation='relu', name='layer6'),
    ]
)

In [335]:
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [336]:
model.compile(loss='mse', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [382]:
def kfold_train(model, train_data, test_data, x_train, x_test, model_title):
    import time
    now = time.strftime('%d-%m-%Y %H:%M:%S')
    mean = np.mean(x_train, axis = 0) 
    std = np.std(x_train, axis = 0)
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            if not std[j] == 0 :
                x_train[i][j] = (x_train[i][j]- mean[j]) / std[j]
    
    for i in range(x_test.shape[0]):
        for j in range(x_test.shape[1]):
            if not std[j] == 0 :
                x_test[i][j] = (x_test[i][j]- mean[j]) / std[j]

    answers = []
    score = 0
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True,random_state=1314)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(train_data)):
        print("[==============================] --- kfold")
        X_train, X_valid = x_train[train_index], x_train[valid_index]
        y_train, y_valid = train_data['IPRC'][train_index].values, train_data['IPRC'][valid_index].values
    #     y_train = np.array(y_train)
        model.fit(X_train, y_train, batch_size=64, epochs=30, verbose=1)
        y_pre=model.predict(X_valid)
        print('每一折验证分数:'+str(mean_squared_error(y_valid,y_pre)))
        score = score + mean_squared_error(y_valid,y_pre)
        y_pred_valid = model.predict(x_test)
        answers.append(y_pred_valid)
    model_pre=sum(answers)/n_fold
    print('模型验证分数'+str(math.sqrt(score/n_fold)))
    result=pd.DataFrame()
    result['date']=test_data['日期']
    result['IPRC']=model_pre
    result.to_csv(f'./{now}-{model_title}-submit.csv',index=False)#保存结果

In [383]:
kfold_train(model, train_data, test_data, x_train, x_test, "tf")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
每一折验证分数:1.463104675774738e-05
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
每一折验证分数:2.8504209932267252e-05
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 

In [370]:
# answers = []
# score = 0
# n_fold = 5
# folds = KFold(n_splits=n_fold, shuffle=True,random_state=1314)
# for fold_n, (train_index, valid_index) in enumerate(folds.split(train_data)):
#     print("[==============================] --- kfold")
#     X_train, X_valid = x_train[train_index], x_train[valid_index]
#     y_train, y_valid = train_data['IPRC'][train_index].values, train_data['IPRC'][valid_index].values
# #     y_train = np.array(y_train)
#     model.fit(X_train, y_train, batch_size=64, epochs=30, verbose=1)
#     y_pre=model.predict(X_valid)
#     print('每一折验证分数:'+str(mean_squared_error(y_valid,y_pre)))
#     score = score + mean_squared_error(y_valid,y_pre)
#     y_pred_valid = model.predict(x_test)
#     answers.append(y_pred_valid)

In [357]:
X_train.shape

(293, 8)

In [371]:
# np.array(y_train)

In [319]:
x_train[0]

array([ 2.17168779,  1.46391351,  2.17122922,  1.97048821,  2.97623229,
        3.31018235,  3.25580701, -1.4521449 ])

In [298]:
X_train.head()

Unnamed: 0,日期,AQI,质量等级,PM2.5,PM10,SO2,CO,NO2,O3_8h,IPRC
0,2016/1/1,293,5,243,324,122,6.1,149,12,2.088378
1,2016/1/2,430,0,395,517,138,7.5,180,18,3.316942
2,2016/1/3,332,0,282,405,72,6.3,130,10,2.516425
3,2016/1/4,204,5,154,237,73,3.5,72,34,1.505693
4,2016/1/5,169,1,128,186,99,3.2,66,39,1.210233


In [None]:
kfold_train(model)

In [337]:
model.fit(x_train, y_train, batch_size=64, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f83cb154470>

In [200]:
x_train[0]

array([ 2.17122922,  1.97048821,  2.97623229,  3.31018235,  3.25580701,
       -1.4521449 ])

In [198]:
x_test[0].shape

(6,)

In [205]:
# model.predict(x_test)

In [228]:
model.predict(x_test[0].reshape(1, 6)).round(3)

array([[0.375]], dtype=float32)

In [229]:
tf_y_pred = model.predict(x_test)

In [207]:
tf_y_pred.shape

(366, 1)

In [237]:
# tf_y_pred
tf_y_pred_v1 = pd.read_csv("./tf_submit.csv", header=None)
tf_y_pred_v1.columns

Int64Index([0, 1], dtype='int64')

In [251]:
# tf_y_pred_v1.values[:, 1]

In [255]:
tf_y_pred.shape

(366, 1)

In [258]:
tf_y_pred_v1.values[:, 1].reshape(x_test.shape[0], 1).shape

(366, 1)

In [259]:
np.sum(np.power((tf_y_pred - tf_y_pred_v1.values[:, 1].reshape(x_test.shape[0], 1)), 2))

1.468087638531257

In [209]:
# tf_y_pred

In [None]:
# tf_y_pred

In [212]:
import csv

In [219]:
f = open('tf_submit.csv', "w")
w = csv.writer(f)
title = ["date", "IPRC"]
for i in range(x_test.shape[0]):
    
    content = ['date' + str(i), tf_y_pred[i][0]]
    w.writerow(content)

In [None]:
result=pd.DataFrame()
result['date']=test_data['日期']
result['IPRC']=y_pred
result.to_csv('空气质量.csv',index=False)#保存结果