In [186]:
# Author: QiangZiBro
# Time: 2021-10-14
# Contact: Github/QiangZiBro

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import warnings
# visualization
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
# MacOS上画图中文乱码的问题
# https://blog.csdn.net/minixuezhen/article/details/81516949?utm_medium=distribute.pc_relevant.none-task-blog-
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# %matplotlib inline

import sys
import os

from pprint import pprint

if not os.path.exists("results"):
    os.mkdir("results")

warnings.filterwarnings("ignore")
FIGSIZE = (30, 25)
FORMAT = "jpg"

## 数据加载工具

In [49]:
PLACES = ['A', 'B', 'C', 'A1', 'A2', 'A3']
TYPES = [0, 1, 2]
def _load_data():
    table_files = """data/附件1 监测点A空气质量预报基础数据.xlsx
data/附件2 监测点B、C空气质量预报基础数据.xlsx
data/附件3 监测点A1、A2、A3空气质量预报基础数据.xlsx""".split("\n")
    tables = [pd.read_excel("../"+i, engine='openpyxl', sheet_name=None) for i in table_files] # 表格全读取
    keys = [list(i.keys()) for i in tables] # 每个表格的sheet list
    return tables, keys

def _process_to_json(tables):
    sheetnames = {0:'监测点{}逐小时污染物浓度与气象一次预报数据', 1:'监测点{}逐小时污染物浓度与气象实测数据', 2:'监测点{}逐日污染物浓度实测数据'}
    result = {i:{} for i in PLACES}
    
    for k,v in sheetnames.items():
        tables[0][v.format('A')].name = v.format('A')
        result['A'][k] = tables[0][v.format('A')]
    for c in ['B', 'C']:
        for k,v in sheetnames.items():
            tables[1][v.format(c)].name = v.format(c)
            result[c][k] = tables[1][v.format(c)]
    for c in ['A1', 'A2', 'A3']:
        for k,v in sheetnames.items():
            tables[2][v.format(c)].name = v.format(c)
            result[c][k] = tables[2][v.format(c)]
    return result

def load(): 
    """得到一个Json格式的数据项
    
    Return:
        {
            "A":{
                "0":DataFrame,
                "1":DataFrame,
                "2":DataFrame,
            }
            ...
        }
    """
    
    tables, keys = _load_data()
    return _process_to_json(tables)

data = load()

## 一键报告（仅供参考）

In [None]:
import pandas_profiling
from pathlib import Path
for p in PLACES:
    for i in range(3):
        profile = data[p][i].profile_report(title = f"{p}_{i}")
        profile.to_file(output_file = Path(f"./{p}_{i}_report.html"))

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=5.0, style=ProgressStyle(descript…

In [192]:
import sweetviz as sv
my_report = sv.analyze(data['A'][2])
my_report.show_html()

for p in PLACES:
    for i in range(3):
        my_report = sv.analyze(data[p][i])
        my_report.show_html(Path(f"./sv_{p}_{i}_report.html"))

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_A_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14.0), HTML(value='')), layout=Layout(dis…


Report sv_A_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_A_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_B_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14.0), HTML(value='')), layout=Layout(dis…


Report sv_B_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_B_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_C_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14.0), HTML(value='')), layout=Layout(dis…


Report sv_C_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_C_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_A1_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=13.0), HTML(value='')), layout=Layout(dis…


Report sv_A1_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_A1_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_A2_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14.0), HTML(value='')), layout=Layout(dis…


Report sv_A2_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_A2_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=25.0), HTML(value='')), layout=Layout(dis…


Report sv_A3_0_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14.0), HTML(value='')), layout=Layout(dis…


Report sv_A3_1_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…


Report sv_A3_2_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [3]:
print(data.keys())
print(data['A'].keys())
# for p in PLACES:
#     for i in range(3): 
#         print(data[p][i].min())

dict_keys(['A', 'B', 'C', 'A1', 'A2', 'A3'])
dict_keys([0, 1, 2])


# 数据处理
- 缺失值填充
- 负值替换

In [None]:
def create_new_table_for_place(p):
    writer = pd.ExcelWriter("../data/" + p + '.xlsx', engine='xlsxwriter')
    for i in range(3):
        df = data[p][i]
        # 数值化
        index = list(df.columns).index('地点')+1
        df1 = df[df.columns[index:]]
        # 异常值处理
        df1 = df1.apply(pd.to_numeric, errors='coerce')
        # 负值处理
        num = df1._get_numeric_data()
        num[num < 0] = np.nan
        # 空值用中位数填充
        df1 = df1.fillna(df.median(numeric_only=True))
        df[df.columns[index:]] = df1
        df.to_excel(writer, sheet_name=str(i))
        
    writer.save()
for p in PLACES:
    create_new_table_for_place(p)

# create_new_table_for_place(PLACES[0])

In [173]:
df = pd.read_excel("../data/{}.xlsx".format('A'), engine='openpyxl', sheet_name=None)
df = df['1']
df.isna().sum()

Unnamed: 0            0
监测时间                  0
地点                    0
SO2监测浓度(μg/m³)      231
NO2监测浓度(μg/m³)        0
PM10监测浓度(μg/m³)       0
PM2.5监测浓度(μg/m³)      0
O3监测浓度(μg/m³)         0
CO监测浓度(mg/m³)         0
温度(℃)                 0
湿度(%)                 0
气压(MBar)              0
风速(m/s)               0
风向(°)                 0
dtype: int64

In [176]:
df.fillna(df.mean()).iloc[500]

  df.fillna(df.mean()).iloc[500]


Unnamed: 0                          500
监测时间                2019-05-06 21:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                  6.99734
NO2监测浓度(μg/m³)                       27
PM10监测浓度(μg/m³)                      38
PM2.5监测浓度(μg/m³)                     19
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.7
温度(℃)                              26.1
湿度(%)                                71
气压(MBar)                           1010
风速(m/s)                             1.3
风向(°)                            108.55
Name: 500, dtype: object

In [177]:
df.iloc[500]

Unnamed: 0                          500
监测时间                2019-05-06 21:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                      NaN
NO2监测浓度(μg/m³)                       27
PM10监测浓度(μg/m³)                      38
PM2.5监测浓度(μg/m³)                     19
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.7
温度(℃)                              26.1
湿度(%)                                71
气压(MBar)                           1010
风速(m/s)                             1.3
风向(°)                            108.55
Name: 500, dtype: object

In [159]:
# 数值化
index = list(df.columns).index('地点')+1
df1 = df[df.columns[index+1:]]
# 异常值处理
df1 = df1.apply(pd.to_numeric, errors='coerce')
# 负值处理
num = df1._get_numeric_data()
num[num < 0] = np.nan
# 空值用中位数填充
df1 = df1.fillna(df.median(numeric_only=True))
df[df.columns[index+1:]] = df1

In [160]:
df.iloc[208]

Unnamed: 0                          208
监测时间                2019-04-24 16:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                        —
NO2监测浓度(μg/m³)                       27
PM10监测浓度(μg/m³)                      38
PM2.5监测浓度(μg/m³)                     19
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.7
温度(℃)                              26.1
湿度(%)                                71
气压(MBar)                           1010
风速(m/s)                             1.3
风向(°)                            108.55
Name: 208, dtype: object

In [165]:
df.columns[index:]

Index(['SO2监测浓度(μg/m³)', 'NO2监测浓度(μg/m³)', 'PM10监测浓度(μg/m³)',
       'PM2.5监测浓度(μg/m³)', 'O3监测浓度(μg/m³)', 'CO监测浓度(mg/m³)', '温度(℃)', '湿度(%)',
       '气压(MBar)', '风速(m/s)', '风向(°)'],
      dtype='object')

## 预处理完分析

In [151]:
df = data['A'][1]
index = list(data['A'][0].columns).index('地点')+1
df1 = df[df.columns[index+1:]]
# 异常值处理
df1 = df1.apply(pd.to_numeric, errors='coerce')
# 负值处理
num = df1._get_numeric_data()
num[num < 0] = np.nan
# 空值用中位数填充
df1 = df1.fillna(df.median(numeric_only=True))
df[df.columns[index+1:]] = df1
df.iloc[182]

监测时间                2019-04-23 14:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                        5
NO2监测浓度(μg/m³)                       14
PM10监测浓度(μg/m³)                      26
PM2.5监测浓度(μg/m³)                     13
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.4
温度(℃)                              32.2
湿度(%)                                59
气压(MBar)                           1007
风速(m/s)                               2
风向(°)                             235.6
Name: 182, dtype: object

In [135]:
type(data['A'][0].columns)

pandas.core.indexes.base.Index

In [129]:
data['A'][0].columns

Index(['模型运行日期', '预测时间', '地点', '近地2米温度（℃）', '地表温度（K）', '比湿（kg/kg）', '湿度（%）',
       '近地10米风速（m/s）', '近地10米风向（°）', '雨量（mm）', '云量', '边界层高度（m）', '大气压（Kpa）',
       '感热通量（W/m²）', '潜热通量（W/m²）', '长波辐射（W/m²）', '短波辐射（W/m²）', '地面太阳能辐射（W/m²）',
       'SO2小时平均浓度(μg/m³)', 'NO2小时平均浓度(μg/m³)', 'PM10小时平均浓度(μg/m³)',
       'PM2.5小时平均浓度(μg/m³)', 'O3小时平均浓度(μg/m³)', 'CO小时平均浓度(mg/m³)'],
      dtype='object')

In [125]:
list(data['A'][0].columns).index('地点')

2

In [None]:
data['A'][0].columns.get_indexer_for

In [99]:
df.iloc[182]

监测时间                2019-04-23 14:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                        5
NO2监测浓度(μg/m³)                       14
PM10监测浓度(μg/m³)                      26
PM2.5监测浓度(μg/m³)                     13
O3监测浓度(μg/m³)                         —
CO监测浓度(mg/m³)                       0.4
温度(℃)                              32.2
湿度(%)                                59
气压(MBar)                           1007
风速(m/s)                               2
风向(°)                             235.6
Name: 182, dtype: object

In [96]:
df.apply(pd.to_numeric, errors='coerce')

监测时间                1.556028e+18
地点                           NaN
SO2监测浓度(μg/m³)      5.000000e+00
NO2监测浓度(μg/m³)      1.400000e+01
PM10监测浓度(μg/m³)     2.600000e+01
PM2.5监测浓度(μg/m³)    1.300000e+01
O3监测浓度(μg/m³)                NaN
CO监测浓度(mg/m³)       4.000000e-01
温度(℃)               3.220000e+01
湿度(%)               5.900000e+01
气压(MBar)            1.007000e+03
风速(m/s)             2.000000e+00
风向(°)               2.356000e+02
Name: 182, dtype: float64

In [89]:
df["SO2监测浓度(μg/m³)"].isna().sum()

0

## 预处理完分析

In [33]:
for p in PLACES:
    df = pd.read_excel("../data/{}.xlsx".format(p), engine='openpyxl', sheet_name=None)
    for i in range(3):
        num = df[str(i)]._get_numeric_data()
        if (num<0).sum().sum() != 0:
            print(p,i)

In [152]:
df = pd.read_excel("../data/{}.xlsx".format('A'), engine='openpyxl', sheet_name=None)
df = df['1']
df.fillna(df.mean()).iloc[182]

  df.fillna(df.mean()).iloc[182]


Unnamed: 0                          182
监测时间                2019-04-23 14:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                        5
NO2监测浓度(μg/m³)                       14
PM10监测浓度(μg/m³)                      26
PM2.5监测浓度(μg/m³)                     13
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.4
温度(℃)                              32.2
湿度(%)                                59
气压(MBar)                           1007
风速(m/s)                               2
风向(°)                             235.6
Name: 182, dtype: object

In [153]:
df.iloc[182]

Unnamed: 0                          182
监测时间                2019-04-23 14:00:00
地点                                 监测点A
SO2监测浓度(μg/m³)                        5
NO2监测浓度(μg/m³)                       14
PM10监测浓度(μg/m³)                      26
PM2.5监测浓度(μg/m³)                     13
O3监测浓度(μg/m³)                        41
CO监测浓度(mg/m³)                       0.4
温度(℃)                              32.2
湿度(%)                                59
气压(MBar)                           1007
风速(m/s)                               2
风向(°)                             235.6
Name: 182, dtype: object

In [110]:
df

Unnamed: 0.1,Unnamed: 0,监测时间,地点,SO2监测浓度(μg/m³),NO2监测浓度(μg/m³),PM10监测浓度(μg/m³),PM2.5监测浓度(μg/m³),O3监测浓度(μg/m³),CO监测浓度(mg/m³),温度(℃),湿度(%),气压(MBar),风速(m/s),风向(°)
0,0,1555372800000000000,,5,75,61,38,1,1.0,21.1,89,1013.0,1.2,346.5
1,1,1555376400000000000,,5,64,60,41,25,0.8,20.4,92,1012.4,1.1,18.0
2,2,1555380000000000000,,4,60,36,40,25,0.8,20.0,93,1011.3,1.1,65.9
3,3,1555383600000000000,,4,39,35,33,41,0.7,19.8,93,1010.4,1.5,78.4
4,4,1555387200000000000,,4,39,28,39,36,0.8,19.9,92,1010.0,1.5,73.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19427,19427,1626145200000000000,,6,12,12,0,14,0.4,29.2,78,1006.2,0.8,350.5
19428,19428,1626148800000000000,,7,15,9,0,11,0.4,29.2,78,1006.3,0.7,246.3
19429,19429,1626152400000000000,,9,19,9,2,8,0.4,29.2,78,1006.4,1.0,253.2
19430,19430,1626156000000000000,,8,20,22,5,6,0.4,29.0,81,1006.8,0.5,267.9


In [206]:
df['监测点A1逐日污染物浓度实测数据'] = df['监测点A1逐日污染物浓度实测数据'].fillna(df['监测点A1逐日污染物浓度实测数据'].mean())

  df['监测点A1逐日污染物浓度实测数据'] = df['监测点A1逐日污染物浓度实测数据'].fillna(df['监测点A1逐日污染物浓度实测数据'].mean())


## 空值分析

In [117]:
def null_values_analysis(data):
    NULL_COUNTS = {}
    for p in PLACES:
        for t in TYPES:  
            if data[p][t].isnull().sum().sum():
                NULL_COUNTS.update({p+"_"+str(t):data[p][t].isnull().sum()})
    return NULL_COUNTS

def null_values_analysis_plot(null_table):
    fig, axs = plt.subplots(3, 3, figsize=FIGSIZE)
    fig.tight_layout(h_pad=10)
    fig.suptitle('空值分析', size=20)
    
    for i, (k,v) in enumerate(null_table.items()):
        axs[i//3][i%3].bar(v.keys(), v, 0.35)
        axs[i//3][i%3].set_title(k)
        axs[i//3][i%3].set_xticklabels(labels=v.keys(),rotation=45)
        axs[i//3][i%3].grid(True)

    font = {'family': 'serif',
            'color':  'darkred',
            'weight': 'normal',
            'size': 16,
            }

    plt.savefig("results/nan_value_analysis."+FORMAT, format=FORMAT)
    plt.close()
null_table = null_values_analysis(data)
null_values_analysis_plot(null_table)

  axs[i//3][i%3].set_xticklabels(labels=v.keys(),rotation=45)


# 数据分析
下面首先对各量随时间的分布进行分析
注意数字和子表格的对应：
- 0 逐小时污染物浓度与气象一次预报数据
- 1 逐小时污染物浓度与气象实测数据
- 2 逐日污染物浓度实测数据

## 0 对逐小时污染物浓度与气象一次预报数据的分析

In [118]:
key = data['A'][0].columns
key, len(key[3:])

(Index(['模型运行日期', '预测时间', '地点', '近地2米温度（℃）', '地表温度（K）', '比湿（kg/kg）', '湿度（%）',
        '近地10米风速（m/s）', '近地10米风向（°）', '雨量（mm）', '云量', '边界层高度（m）', '大气压（Kpa）',
        '感热通量（W/m²）', '潜热通量（W/m²）', '长波辐射（W/m²）', '短波辐射（W/m²）', '地面太阳能辐射（W/m²）',
        'SO2小时平均浓度(μg/m³)', 'NO2小时平均浓度(μg/m³)', 'PM10小时平均浓度(μg/m³)',
        'PM2.5小时平均浓度(μg/m³)', 'O3小时平均浓度(μg/m³)', 'CO小时平均浓度(mg/m³)'],
       dtype='object'),
 21)

In [119]:
PREDICTED_KEYS = key[3:]
print("预测天气状况",PREDICTED_KEYS[:-6])
print("预测污染因素",PREDICTED_KEYS[-6:])

预测天气状况 Index(['近地2米温度（℃）', '地表温度（K）', '比湿（kg/kg）', '湿度（%）', '近地10米风速（m/s）',
       '近地10米风向（°）', '雨量（mm）', '云量', '边界层高度（m）', '大气压（Kpa）', '感热通量（W/m²）',
       '潜热通量（W/m²）', '长波辐射（W/m²）', '短波辐射（W/m²）', '地面太阳能辐射（W/m²）'],
      dtype='object')
预测污染因素 Index(['SO2小时平均浓度(μg/m³)', 'NO2小时平均浓度(μg/m³)', 'PM10小时平均浓度(μg/m³)',
       'PM2.5小时平均浓度(μg/m³)', 'O3小时平均浓度(μg/m³)', 'CO小时平均浓度(mg/m³)'],
      dtype='object')


In [120]:
def plot_prediction_hist_by_time(place):
    "对逐小时污染物浓度与气象一次预报数据的分析"
    key = data[place][0].columns
    PREDICTED_KEYS = key[3:]
    fig, axes = plt.subplots(nrows=7, ncols=3, figsize=FIGSIZE)
    fig.tight_layout(h_pad=3)
    plt.suptitle(place+"_0", size=30)
    
    for i, k in enumerate(PREDICTED_KEYS):
        data[place][0].plot.line(ax = axes[i//3][i%3], x = key[1] , y=k)
    plt.savefig("results/{}_0逐小时污染物浓度与气象一次预报数据.{}".format(place, FORMAT), format=FORMAT)
    plt.close()


for p in PLACES:
    plot_prediction_hist_by_time(p)

## 1 对逐小时污染物浓度与气象实测数据的分析

In [121]:
key = data['A'][1].columns
key, len(key[2:])

(Index(['监测时间', '地点', 'SO2监测浓度(μg/m³)', 'NO2监测浓度(μg/m³)', 'PM10监测浓度(μg/m³)',
        'PM2.5监测浓度(μg/m³)', 'O3监测浓度(μg/m³)', 'CO监测浓度(mg/m³)', '温度(℃)', '湿度(%)',
        '气压(MBar)', '风速(m/s)', '风向(°)'],
       dtype='object'),
 11)

In [122]:
MEASURED_KEYS = key[2:]
print("测量天气状况",MEASURED_KEYS[:6])
print("测量污染因素",MEASURED_KEYS[6:])

测量天气状况 Index(['SO2监测浓度(μg/m³)', 'NO2监测浓度(μg/m³)', 'PM10监测浓度(μg/m³)',
       'PM2.5监测浓度(μg/m³)', 'O3监测浓度(μg/m³)', 'CO监测浓度(mg/m³)'],
      dtype='object')
测量污染因素 Index(['温度(℃)', '湿度(%)', '气压(MBar)', '风速(m/s)', '风向(°)'], dtype='object')


In [123]:
def plot_measured_hist_by_hour(place):
    "对逐小时污染物浓度与气象一次预报数据的分析"
    key = data[place][1].columns
    MEASURED_KEYS = key[2:]
    fig, axes = plt.subplots(nrows=4, ncols=3, figsize=FIGSIZE)
    for i, k in enumerate(MEASURED_KEYS):
        axes[i//3][i%3].set_xticks([])
        # 预处理
        data[place][1][k].fillna(0)
        # 将object类型转为数值类型
        data[place][1][k] = pd.to_numeric(data[place][1][k], errors='coerce')
        data[place][1].plot.line(ax = axes[i//3][i%3], x = key[0] , y=k)
    fig.tight_layout(h_pad=3)
    plt.suptitle(place+"_1", size=30)
    plt.savefig("results/{}_1逐小时污染物浓度与气象实测数据.{}".format(place, FORMAT), format=FORMAT)
    plt.close()
    
for p in PLACES:
    plot_measured_hist_by_hour(p)

## 2 对逐日污染物浓度实测数据的分析

In [124]:
key = data['A'][2].columns
key, len(key[2:])

(Index(['监测日期', '地点', 'SO2监测浓度(μg/m³)', 'NO2监测浓度(μg/m³)', 'PM10监测浓度(μg/m³)',
        'PM2.5监测浓度(μg/m³)', 'O3最大八小时滑动平均监测浓度(μg/m³)', 'CO监测浓度(mg/m³)'],
       dtype='object'),
 6)

In [125]:
MEASURED_KEYS = key[2:]
print("测量污染因素",MEASURED_KEYS[:])

测量污染因素 Index(['SO2监测浓度(μg/m³)', 'NO2监测浓度(μg/m³)', 'PM10监测浓度(μg/m³)',
       'PM2.5监测浓度(μg/m³)', 'O3最大八小时滑动平均监测浓度(μg/m³)', 'CO监测浓度(mg/m³)'],
      dtype='object')


In [126]:
def plot_measured_polution_hist_by_day(place):
    "对逐小时污染物浓度与气象一次预报数据的分析"
    key = data[place][2].columns
    MEASURED_KEYS = key[2:]
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=FIGSIZE)
    for i, k in enumerate(MEASURED_KEYS):
        # 预处理
        data[place][2][k].fillna(0)
        # 将object类型转为数值类型
        data[place][2][k] = pd.to_numeric(data[place][2][k], errors='coerce')
        data[place][2].plot.line(ax = axes[i//3][i%3], x = key[0] , y=k)
    fig.tight_layout(h_pad=3)
    plt.suptitle(place+"_2", size=30)
    plt.savefig("results/{}_2逐日污染物浓度实测数据.{}".format(place, FORMAT), format=FORMAT)
    plt.close()
# 解锁运行  
for p in PLACES:
    plot_measured_polution_hist_by_day(p)