In [3]:
import datetime
import requests
import pandas as pd
from io import StringIO
import pandas
# Keep wide DataFrames on one row of output instead of wrapping the columns.
pd.options.display.expand_frame_repr = False

# Data source: the long-format air-quality sample CSV from the pandas docs.
# The rows displayed in this notebook hold pm25 and no2 readings for
# Antwerpen, Paris and London.
data_url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/air_quality_long.csv"
# Use the "date.utc" column as the index and parse it as datetimes.
air_quality = pd.read_csv(
    data_url,
    parse_dates=True,
    index_col="date.utc",
)
air_quality.iloc[:5]

Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-18 06:00:00+00:00,Antwerpen,BE,BETR801,pm25,18.0,µg/m³
2019-06-17 08:00:00+00:00,Antwerpen,BE,BETR801,pm25,6.5,µg/m³
2019-06-17 07:00:00+00:00,Antwerpen,BE,BETR801,pm25,18.5,µg/m³
2019-06-17 06:00:00+00:00,Antwerpen,BE,BETR801,pm25,16.0,µg/m³
2019-06-17 05:00:00+00:00,Antwerpen,BE,BETR801,pm25,7.5,µg/m³


### 由長到寬的資料表

In [4]:
# Restrict the long table to the rows whose "parameter" value is "no2".
no2 = air_quality.loc[air_quality["parameter"].eq("no2")]
no2

Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-21 00:00:00+00:00,Paris,FR,FR04014,no2,20.0,µg/m³
2019-06-20 23:00:00+00:00,Paris,FR,FR04014,no2,21.8,µg/m³
2019-06-20 22:00:00+00:00,Paris,FR,FR04014,no2,26.5,µg/m³
2019-06-20 21:00:00+00:00,Paris,FR,FR04014,no2,24.9,µg/m³
2019-06-20 20:00:00+00:00,Paris,FR,FR04014,no2,21.4,µg/m³
...,...,...,...,...,...,...
2019-04-09 06:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 05:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 04:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 03:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³


### 依照地點抓出前兩筆資料

In [5]:
# Sort by the datetime index, then keep the first two rows of each location
# group (head on a groupby returns the rows in their sorted order).
no2_subset = (
    no2
    .sort_index()
    .groupby("location")
    .head(2)
)
no2_subset

Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-09 01:00:00+00:00,Antwerpen,BE,BETR801,no2,22.5,µg/m³
2019-04-09 01:00:00+00:00,Paris,FR,FR04014,no2,24.4,µg/m³
2019-04-09 02:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³
2019-04-09 02:00:00+00:00,Antwerpen,BE,BETR801,no2,53.5,µg/m³
2019-04-09 02:00:00+00:00,Paris,FR,FR04014,no2,27.4,µg/m³
2019-04-09 03:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³


### 將地點與資料作轉置的動作!

In [6]:
# Spread each distinct "location" into its own column, filled from "value".
# No index argument is given, so the existing date.utc index labels the rows.
no2_subset.pivot(
    values="value",
    columns="location",
)

location,BETR801,FR04014,London Westminster
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-09 01:00:00+00:00,22.5,24.4,
2019-04-09 02:00:00+00:00,53.5,27.4,67.0
2019-04-09 03:00:00+00:00,,,67.0


### Wide to long format. 反正規化過程
 先用 pivot() 將 location 欄內的各個值展開成獨立欄位，再用 reset_index() 把索引 date.utc 還原成一般欄位（並不是刪除它），作為後續 melt() 的輸入。

In [19]:
# Build the wide table used as input for melt: one column per location,
# filled from "value". reset_index() then turns the "date.utc" index back
# into an ordinary column so it can later serve as an id variable.
no2_pivoted = no2.pivot(columns="location", values="value").reset_index()
# Request the first 90 rows for display.
no2_pivoted.head(90)

location,date.utc,BETR801,FR04014,London Westminster
0,2019-04-09 01:00:00+00:00,22.5,24.4,
1,2019-04-09 02:00:00+00:00,53.5,27.4,67.0
2,2019-04-09 03:00:00+00:00,54.5,34.2,67.0
3,2019-04-09 04:00:00+00:00,34.5,48.5,41.0
4,2019-04-09 05:00:00+00:00,46.5,59.5,41.0
...,...,...,...,...
85,2019-04-12 14:00:00+00:00,,21.8,39.0
86,2019-04-12 15:00:00+00:00,,21.9,47.0
87,2019-04-12 16:00:00+00:00,,24.1,47.0
88,2019-04-12 17:00:00+00:00,,25.9,41.0


### 將no2的資料整理成一個欄位
這裡只指定 id_vars 的 melt() 是簡化的寫法：除了 id_vars 以外的所有欄位都會被融合成一個欄位。

In [22]:
# Short form of melt: only the identifier column is named. Every other
# column of no2_pivoted is unpivoted into rows.
no_2 = no2_pivoted.melt(id_vars="date.utc")
no_2

location,date.utc,BETR801,FR04014,London Westminster
0,2019-04-09 01:00:00+00:00,22.5,24.4,
1,2019-04-09 02:00:00+00:00,53.5,27.4,67.0
2,2019-04-09 03:00:00+00:00,54.5,34.2,67.0
3,2019-04-09 04:00:00+00:00,34.5,48.5,41.0
4,2019-04-09 05:00:00+00:00,46.5,59.5,41.0
...,...,...,...,...
1700,2019-06-20 20:00:00+00:00,,21.4,
1701,2019-06-20 21:00:00+00:00,,24.9,
1702,2019-06-20 22:00:00+00:00,,26.5,
1703,2019-06-20 23:00:00+00:00,,21.8,


### 這是melt更詳細定義的版本。
* id_vars :固定不會被消滅的欄位。
* value_vars: 要融合在一起的欄位名稱有哪些?
* value_name: provides a custom column name for the values column instead of the default column name value
* var_name: provides a custom column name for the column collecting the column header names. Otherwise it takes the index name or a default variable

In [25]:
# Explicit form of melt: name the identifier column, the columns to unpivot,
# and the headers of the two resulting columns.
no_2 = no2_pivoted.melt(
    id_vars="date.utc",  # kept as-is on every output row
    value_vars=["BETR801", "FR04014", "London Westminster"],  # columns to unpivot
    value_name="NO_2",  # header of the column holding the measurements
    var_name="id_location",  # header of the column holding the former column names
)
no_2

Unnamed: 0,date.utc,id_location,NO_2
0,2019-04-09 01:00:00+00:00,BETR801,22.5
1,2019-04-09 02:00:00+00:00,BETR801,53.5
2,2019-04-09 03:00:00+00:00,BETR801,54.5
3,2019-04-09 04:00:00+00:00,BETR801,34.5
4,2019-04-09 05:00:00+00:00,BETR801,46.5
...,...,...,...
5110,2019-06-20 20:00:00+00:00,London Westminster,
5111,2019-06-20 21:00:00+00:00,London Westminster,
5112,2019-06-20 22:00:00+00:00,London Westminster,
5113,2019-06-20 23:00:00+00:00,London Westminster,
