# 統計應用實戰分析
## 愛爾蘭風速資料
Pandas read_csv : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html<br>
正規表示式（Regular Expression）參考網站：https://cheatography.com/davechild/cheat-sheets/regular-expressions/

In [1]:
import pandas as pd
import datetime

# Location of the whitespace-delimited wind-speed table used throughout this notebook.
data_url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/Wind_Stats/wind.data'

# sep is a regular expression (one or more whitespace characters); it is written
# as a raw string so the backslash is not read as an (invalid) string escape.
# parse_dates=[[0, 1, 2]] merges the first three columns into a single parsed
# date column.
# NOTE(review): combining columns through a nested parse_dates list appears to be
# deprecated in newer pandas releases — confirm against the installed version.
data = pd.read_csv(data_url, sep=r"\s+", parse_dates=[[0, 1, 2]])
data.head()

Unnamed: 0,Yr_Mo_Dy,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
0,2061-01-01,15.04,14.96,13.17,9.29,,9.87,13.67,10.25,10.83,12.58,18.5,15.04
1,2061-01-02,14.71,,10.83,6.5,12.62,7.67,11.5,10.04,9.79,9.67,17.54,13.83
2,2061-01-03,18.5,16.88,12.33,10.13,11.17,6.17,11.25,,8.5,7.67,12.75,12.71
3,2061-01-04,10.58,6.63,11.75,4.58,4.54,2.88,8.63,1.79,5.83,5.88,5.46,10.88
4,2061-01-05,13.33,13.25,11.42,6.17,10.71,8.21,11.92,6.54,10.92,10.34,12.92,11.83


## 修正日期錯誤
* 原始資料的年份只有兩位數，解析時前面被自動補上 20（例如 61 變成 2061）
* 年份來到 70 時，前面又改成補上 19（例如 70 變成 1970）

<br>修正方式:大於2060年的資料減去100，建立新的日期欄位資訊

In [2]:
def fix(x, pivot_year=2060, century=100):
    """Return the calendar date of *x* with a mis-assigned century corrected.

    Years strictly greater than ``pivot_year`` are shifted back by
    ``century`` years; all other years are kept as they are.

    Args:
        x: Any object exposing ``year``, ``month`` and ``day`` attributes.
        pivot_year: Largest year that is accepted unchanged (default 2060).
        century: Number of years subtracted from years above ``pivot_year``
            (default 100).

    Returns:
        A ``datetime.datetime`` built from the corrected year and the month
        and day of *x*. Only year, month and day are passed on, so any
        time-of-day information on *x* is not carried over.
    """
    year = x.year - century if x.year > pivot_year else x.year
    return datetime.datetime(year, x.month, x.day)

# Run fix() on every value of the date column, then write the corrected
# values back into the same column.
corrected_dates = data['Yr_Mo_Dy'].apply(fix)
data['Yr_Mo_Dy'] = corrected_dates
data

Unnamed: 0,Yr_Mo_Dy,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
0,1961-01-01,15.04,14.96,13.17,9.29,,9.87,13.67,10.25,10.83,12.58,18.50,15.04
1,1961-01-02,14.71,,10.83,6.50,12.62,7.67,11.50,10.04,9.79,9.67,17.54,13.83
2,1961-01-03,18.50,16.88,12.33,10.13,11.17,6.17,11.25,,8.50,7.67,12.75,12.71
3,1961-01-04,10.58,6.63,11.75,4.58,4.54,2.88,8.63,1.79,5.83,5.88,5.46,10.88
4,1961-01-05,13.33,13.25,11.42,6.17,10.71,8.21,11.92,6.54,10.92,10.34,12.92,11.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6569,1978-12-27,17.58,16.96,17.62,8.08,13.21,11.67,14.46,15.59,14.04,14.00,17.21,40.08
6570,1978-12-28,13.21,5.46,13.46,5.00,8.12,9.42,14.33,16.25,15.25,18.05,21.79,41.46
6571,1978-12-29,14.00,10.29,14.42,8.71,9.71,10.54,19.17,12.46,14.50,16.42,18.88,29.58
6572,1978-12-30,18.50,14.04,21.29,9.13,12.75,9.71,18.08,12.87,12.46,12.12,14.67,28.79


## 將日期欄位設定為index

In [3]:
# Build the datetime index from the date column first, then attach it to the
# frame. The 'Yr_Mo_Dy' column itself is left in place.
date_index = pd.to_datetime(data['Yr_Mo_Dy'])
data.index = date_index
data

Unnamed: 0_level_0,Yr_Mo_Dy,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1961-01-01,1961-01-01,15.04,14.96,13.17,9.29,,9.87,13.67,10.25,10.83,12.58,18.50,15.04
1961-01-02,1961-01-02,14.71,,10.83,6.50,12.62,7.67,11.50,10.04,9.79,9.67,17.54,13.83
1961-01-03,1961-01-03,18.50,16.88,12.33,10.13,11.17,6.17,11.25,,8.50,7.67,12.75,12.71
1961-01-04,1961-01-04,10.58,6.63,11.75,4.58,4.54,2.88,8.63,1.79,5.83,5.88,5.46,10.88
1961-01-05,1961-01-05,13.33,13.25,11.42,6.17,10.71,8.21,11.92,6.54,10.92,10.34,12.92,11.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978-12-27,1978-12-27,17.58,16.96,17.62,8.08,13.21,11.67,14.46,15.59,14.04,14.00,17.21,40.08
1978-12-28,1978-12-28,13.21,5.46,13.46,5.00,8.12,9.42,14.33,16.25,15.25,18.05,21.79,41.46
1978-12-29,1978-12-29,14.00,10.29,14.42,8.71,9.71,10.54,19.17,12.46,14.50,16.42,18.88,29.58
1978-12-30,1978-12-30,18.50,14.04,21.29,9.13,12.75,9.71,18.08,12.87,12.46,12.12,14.67,28.79


## 取得每一個地區風速的統計資料，包含平均值、標準差等統計資訊

In [4]:
# Summary statistics (count, mean, std, quartiles, ...) for each location.
# include=['number'] limits the summary to the numeric wind-speed columns, so
# the datetime 'Yr_Mo_Dy' column that is still part of `data` is left out
# regardless of how the installed pandas treats datetimes in describe().
data.describe(include=['number'])

Unnamed: 0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
count,6568.0,6571.0,6572.0,6569.0,6572.0,6574.0,6571.0,6572.0,6571.0,6573.0,6574.0,6570.0
mean,12.362987,10.644314,11.660526,6.306468,10.455834,7.092254,9.797343,8.495053,8.49359,8.707332,13.121007,15.599079
std,5.618413,5.267356,5.00845,3.605811,4.936125,3.968683,4.977555,4.499449,4.166872,4.503954,5.835037,6.699794
min,0.67,0.21,1.5,0.0,0.13,0.0,0.0,0.0,0.0,0.04,0.13,0.67
25%,8.12,6.67,8.0,3.58,6.75,4.0,6.0,5.09,5.37,5.33,8.71,10.71
50%,11.71,10.17,10.92,5.75,9.96,6.83,9.21,8.08,8.17,8.29,12.5,15.0
75%,15.92,14.04,14.67,8.42,13.54,9.67,12.96,11.42,11.19,11.63,16.88,19.83
max,35.8,33.37,33.84,28.46,37.54,26.16,30.37,31.08,25.88,28.21,42.38,42.54


## 改變統計資料的軸線，取得每一天的風速最小值、最大值、平均值和標準差

In [5]:
# Per-day statistics: every reduction runs along axis=1, i.e. across the
# columns of a single row.
# numeric_only=True keeps the datetime 'Yr_Mo_Dy' column, which is still part
# of `data`, out of the reductions instead of relying on pandas to drop it
# silently (newer pandas versions no longer do so).
day_stats = pd.DataFrame({
    'min': data.min(axis=1, numeric_only=True),
    'max': data.max(axis=1, numeric_only=True),
    'mean': data.mean(axis=1, numeric_only=True),
    'std': data.std(axis=1, numeric_only=True),
})
day_stats

Unnamed: 0_level_0,min,max,mean,std
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1961-01-01,9.29,18.50,13.018182,2.808875
1961-01-02,6.50,17.54,11.336364,3.188994
1961-01-03,6.17,18.50,11.641818,3.681912
1961-01-04,1.79,11.75,6.619167,3.198126
1961-01-05,6.17,13.33,10.630000,2.445356
...,...,...,...,...
1978-12-27,8.08,40.08,16.708333,7.868076
1978-12-28,5.00,41.46,15.150000,9.687857
1978-12-29,8.71,29.58,14.890000,5.756836
1978-12-30,9.13,28.79,15.367500,5.540437


## 每個地區一月的平均風速

In [7]:
# Keep only the rows whose index falls in January (of any year) and average
# each column. numeric_only=True restricts the result to the numeric
# wind-speed columns so the datetime 'Yr_Mo_Dy' column is not averaged too.
data.loc[data.index.month == 1].mean(numeric_only=True)

RPT    14.847325
VAL    12.914560
ROS    13.299624
KIL     7.199498
SHA    11.667734
BIR     8.054839
DUB    11.819355
CLA     9.512047
MUL     9.543208
CLO    10.053566
BEL    14.550520
MAL    18.028763
dtype: float64

## 各地區每一年每個月的平均風速
偏移別名參考網址：https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

In [9]:
# Convert every timestamp in the index to its monthly period ('M' = month);
# rows from the same year and month end up with the same period value.
data.index.to_period(freq='M')

PeriodIndex(['1961-01', '1961-01', '1961-01', '1961-01', '1961-01', '1961-01',
             '1961-01', '1961-01', '1961-01', '1961-01',
             ...
             '1978-12', '1978-12', '1978-12', '1978-12', '1978-12', '1978-12',
             '1978-12', '1978-12', '1978-12', '1978-12'],
            dtype='period[M]', name='Yr_Mo_Dy', length=6574, freq='M')

In [10]:
# Group the rows by the year-month period of their index and average each
# group. numeric_only=True limits the averages to the numeric wind-speed
# columns, leaving out the datetime 'Yr_Mo_Dy' column still present in `data`.
data.groupby(data.index.to_period('M')).mean(numeric_only=True)

Unnamed: 0_level_0,RPT,VAL,ROS,KIL,SHA,BIR,DUB,CLA,MUL,CLO,BEL,MAL
Yr_Mo_Dy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1961-01,14.841333,11.988333,13.431613,7.736774,11.072759,8.588065,11.184839,9.245333,9.085806,10.107419,13.880968,14.703226
1961-02,16.269286,14.975357,14.441481,9.230741,13.852143,10.937500,11.890714,11.846071,11.821429,12.714286,18.583214,15.411786
1961-03,10.890000,11.296452,10.752903,7.284000,10.509355,8.866774,9.644194,9.829677,10.294138,11.251935,16.410968,15.720000
1961-04,10.722667,9.427667,9.998000,5.830667,8.435000,6.495000,6.925333,7.094667,7.342333,7.237000,11.147333,10.278333
1961-05,9.860968,8.850000,10.818065,5.905333,9.490323,6.574839,7.604000,8.177097,8.039355,8.499355,11.900323,12.011613
...,...,...,...,...,...,...,...,...,...,...,...,...
1978-08,9.645161,8.259355,9.032258,4.502903,7.368065,5.935161,5.650323,5.417742,7.241290,5.536774,10.466774,12.054194
1978-09,10.913667,10.895000,10.635000,5.725000,10.372000,9.278333,10.790333,9.583000,10.069333,8.939000,15.680333,19.391333
1978-10,9.897742,8.670968,9.295806,4.721290,8.525161,6.774194,8.115484,7.337742,8.297742,8.243871,13.776774,17.150000
1978-11,16.151667,14.802667,13.508000,7.317333,11.475000,8.743000,11.492333,9.657333,10.701333,10.676000,17.404667,20.723000


## 每個地區每個月的平均風速、最小風速、最大風速

In [11]:
# Mean, minimum and maximum per location for every year-month period.
# Only the numeric columns are aggregated: the datetime 'Yr_Mo_Dy' column is
# still part of `data` and would otherwise be aggregated as well (or dropped
# implicitly, depending on the pandas version).
wind_speeds = data.select_dtypes(include='number')
monthly = wind_speeds.groupby(data.index.to_period('M')).agg(['mean', 'min', 'max'])
monthly

Unnamed: 0_level_0,RPT,RPT,RPT,VAL,VAL,VAL,ROS,ROS,ROS,KIL,...,MUL,CLO,CLO,CLO,BEL,BEL,BEL,MAL,MAL,MAL
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max,mean,...,max,mean,min,max,mean,min,max,mean,min,max
Yr_Mo_Dy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1961-01,14.841333,4.92,25.04,11.988333,3.42,23.91,13.431613,7.08,25.84,7.736774,...,21.46,10.107419,2.67,19.95,13.880968,5.25,27.71,14.703226,5.17,27.63
1961-02,16.269286,6.04,25.80,14.975357,7.08,24.21,14.441481,6.08,22.42,9.230741,...,18.96,12.714286,5.50,20.04,18.583214,9.17,29.63,15.411786,6.67,23.87
1961-03,10.890000,4.88,18.25,11.296452,2.58,17.00,10.752903,4.79,16.38,7.284000,...,17.54,11.251935,5.09,15.79,16.410968,7.21,23.45,15.720000,5.54,22.95
1961-04,10.722667,4.00,21.09,9.427667,3.71,15.41,9.998000,2.54,18.29,5.830667,...,13.29,7.237000,2.71,14.37,11.147333,5.13,19.21,10.278333,2.67,20.46
1961-05,9.860968,3.54,23.00,8.850000,3.58,19.79,10.818065,3.58,21.21,5.905333,...,17.33,8.499355,1.83,17.92,11.900323,4.79,28.08,12.011613,3.33,26.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978-08,9.645161,3.04,19.33,8.259355,1.75,18.25,9.032258,4.54,20.17,4.502903,...,15.87,5.536774,0.79,15.09,10.466774,5.00,21.79,12.054194,3.92,27.25
1978-09,10.913667,0.79,26.75,10.895000,1.21,19.21,10.635000,3.42,21.50,5.725000,...,17.00,8.939000,0.71,18.12,15.680333,4.71,26.83,19.391333,5.13,33.50
1978-10,9.897742,2.04,21.67,8.670968,0.21,21.00,9.295806,3.54,16.25,4.721290,...,14.62,8.243871,1.92,15.37,13.776774,5.09,21.29,17.150000,4.58,28.21
1978-11,16.151667,7.38,30.21,14.802667,4.54,27.63,13.508000,7.29,25.46,7.317333,...,18.46,10.676000,3.29,18.08,17.404667,4.92,24.17,20.723000,11.08,29.04


## 前12筆資料中，從地區RPT到KIL的平均風速、最小風速、最大風速

In [12]:
# Take the first 12 rows by position, then slice the columns by label from
# 'RPT' through 'KIL' (label slices include both end points).
monthly.iloc[:12].loc[:, 'RPT':'KIL']

Unnamed: 0_level_0,RPT,RPT,RPT,VAL,VAL,VAL,ROS,ROS,ROS,KIL,KIL,KIL
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max,mean,min,max
Yr_Mo_Dy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1961-01,14.841333,4.92,25.04,11.988333,3.42,23.91,13.431613,7.08,25.84,7.736774,0.67,18.54
1961-02,16.269286,6.04,25.8,14.975357,7.08,24.21,14.441481,6.08,22.42,9.230741,2.21,15.37
1961-03,10.89,4.88,18.25,11.296452,2.58,17.0,10.752903,4.79,16.38,7.284,2.75,12.0
1961-04,10.722667,4.0,21.09,9.427667,3.71,15.41,9.998,2.54,18.29,5.830667,1.58,11.63
1961-05,9.860968,3.54,23.0,8.85,3.58,19.79,10.818065,3.58,21.21,5.905333,1.04,14.46
1961-06,9.904138,4.0,15.92,8.520333,3.25,14.54,8.867,5.13,15.5,6.083,2.88,10.54
1961-07,10.614194,3.04,22.5,8.221613,2.92,19.29,9.110323,4.08,15.67,6.340968,1.46,12.42
1961-08,12.035,2.88,18.91,10.133871,4.42,22.0,10.335806,2.46,17.83,6.845806,2.37,12.42
1961-09,12.531,5.0,28.75,9.656897,1.13,22.08,10.776897,2.42,26.5,7.155517,2.62,21.09
1961-10,14.289667,3.13,28.62,10.915806,2.75,21.25,12.236452,4.21,23.09,8.154839,1.46,19.38
