### Pandas DataFrame

In [2]:
import pandas as pd

# Group names and the number of ironmen in each group.
groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]
# Column name -> column values; every key becomes one DataFrame column.
ironmen_dict = dict(groups=groups, ironmen=ironmen)
ironmen_df = pd.DataFrame(data=ironmen_dict)
# Bare last expression so the notebook renders the frame.
ironmen_df

Unnamed: 0,groups,ironmen
0,Modern Web,59
1,DevOps,9
2,Cloud,19
3,Big Data,14
4,Security,6
5,自我挑戰組,77


In [12]:
# Report the frame's number of dimensions, its shape and the dtype of each
# column, with a "---" divider line between the three reports.
print(ironmen_df.ndim, "---", ironmen_df.shape, "---", ironmen_df.dtypes, sep="\n")

2
---
(6, 2)
---
groups     object
ironmen     int64
dtype: object


In [13]:
# Column-wise totals. NOTE: sum() also covers the object column "groups", so the
# group names are concatenated into one string next to the numeric total.
print(ironmen_df.sum())
# Mean ironmen count. numeric_only=True restricts the reduction to numeric
# columns; without it the string column "groups" raises TypeError on
# pandas >= 2.0 (older versions silently dropped it).
print(ironmen_df.mean(numeric_only=True))
# Median ironmen count; numeric_only=True for the same reason as the mean.
print(ironmen_df.median(numeric_only=True))
# Descriptive statistics.
print(ironmen_df.describe())

groups     Modern WebDevOpsCloudBig DataSecurity自我挑戰組
ironmen                                           184
dtype: object
ironmen    30.666667
dtype: float64
ironmen    16.5
dtype: float64
         ironmen
count   6.000000
mean   30.666667
std    29.803803
min     6.000000
25%    10.250000
50%    16.500000
75%    49.000000
max    77.000000


In [14]:
import numpy as np
import pandas as pd

# Sample data with one missing group name and one missing ironmen count.
groups = ["Modern Web", "DevOps", np.nan, "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, np.nan]
ironmen_dict = dict(groups=groups, ironmen=ironmen)
# Build the data frame.
ironmen_df = pd.DataFrame(data=ironmen_dict)
# Flag the rows whose group name is a missing value.
print(ironmen_df["groups"].isna())
# Flag the rows whose ironmen count is NOT a missing value.
print(ironmen_df["ironmen"].notna())

0    False
1    False
2     True
3    False
4    False
5    False
Name: groups, dtype: bool
0     True
1     True
2     True
3     True
4     True
5    False
Name: ironmen, dtype: bool


In [15]:
# Drop every observation (row) that contains a missing value.
ironmen_df_na_dropped = ironmen_df.dropna()
# Replace every missing value with 0, whatever column it is in.
ironmen_df_na_filled = ironmen_df.fillna(value=0)
# Show both results, each followed by a "---" divider line.
print(ironmen_df_na_dropped, "---", ironmen_df_na_filled, "---", sep="\n")
# Fill missing values per column: the mapping gives the replacement for each column.
ironmen_df_na_filled = ironmen_df.fillna(value={"groups": "Cloud", "ironmen": 71})
print(ironmen_df_na_filled)

       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
3    Big Data     14.0
4    Security      6.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2           0     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組      0.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2       Cloud     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組     71.0


In [16]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

# Build a single-column data frame whose row labels are the group names.
ironmen_df = pd.DataFrame({"ironmen": ironmen}, index=groups)

# Order the rows by their numeric value (ascending is the default).
ironmen_df.sort_values("ironmen")

Unnamed: 0,ironmen
Security,6
DevOps,9
Big Data,14
Cloud,19
Modern Web,59
自我挑戰組,77


### iloc 利用整數位置（從 0 開始）來篩選資料
### loc 利用標籤（label）來篩選資料

In [3]:
# Build a 4x4 DataFrame filled with random floats drawn uniformly from [0, 1).
import numpy as np
import pandas as pd
frame = pd.DataFrame(
    np.random.random_sample((4, 4)),
    index=["a", "b", "c", "d"],
    columns=["A", "B", "C", "D"],
)
frame

Unnamed: 0,A,B,C,D
a,0.418301,0.928158,0.825037,0.583013
b,0.564331,0.171577,0.666998,0.137546
c,0.936742,0.882623,0.742222,0.638585
d,0.349898,0.048077,0.743017,0.906269


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
frame.describe()

Unnamed: 0,A,B,C,D
count,4.0,4.0,4.0,4.0
mean,0.567318,0.507609,0.744318,0.566354
std,0.262018,0.462452,0.06455,0.318807
min,0.349898,0.048077,0.666998,0.137546
25%,0.4012,0.140702,0.723416,0.471647
50%,0.491316,0.5271,0.742619,0.610799
75%,0.657434,0.894007,0.763522,0.705506
max,0.936742,0.928158,0.825037,0.906269


In [5]:
# Horizontal lines are rows; vertical lines are columns.
# .loc takes the row label first and the column label second, separated by a comma (,);
# for example, fetch the value at row 'a', column 'A'.
frame.loc['a','A']

0.418301248390315

In [6]:
# Select the first two rows by label; the label slice 'a':'b' includes both endpoints.
frame.loc['a':'b',:]

Unnamed: 0,A,B,C,D
a,0.418301,0.928158,0.825037,0.583013
b,0.564331,0.171577,0.666998,0.137546


In [7]:
# Select the first two columns by label ('A' through 'B', both included).
frame.loc[:,'A':'B']

Unnamed: 0,A,B
a,0.418301,0.928158
b,0.564331,0.171577
c,0.936742,0.882623
d,0.349898,0.048077


In [8]:
# Label lists pick arbitrary (not necessarily adjacent) rows and/or columns:
#   1. rows 'a' and 'c', all columns
#   2. all rows, columns 'A' and 'C'
#   3. the first and fourth rows ('a', 'd') with the first and fourth columns ('A', 'D')
print(
    frame.loc[['a', 'c'], :],
    frame.loc[:, ['A', 'C']],
    frame.loc[['a', 'd'], ['A', 'D']],
    sep="\n",
)

          A         B         C         D
a  0.418301  0.928158  0.825037  0.583013
c  0.936742  0.882623  0.742222  0.638585
          A         C
a  0.418301  0.825037
b  0.564331  0.666998
c  0.936742  0.742222
d  0.349898  0.743017
          A         D
a  0.418301  0.583013
d  0.349898  0.906269


### iloc 基於列與欄的整數位置（對應 index、columns 的順序），都是從 0 開始

In [9]:
# Position-based scalar lookup: row position 0, column position 0.
frame.iloc[0,0]

0.418301248390315

In [10]:
# Select the first two rows by position (0 and 1; the stop position 2 is excluded).
frame.iloc[0:2,:]

Unnamed: 0,A,B,C,D
a,0.418301,0.928158,0.825037,0.583013
b,0.564331,0.171577,0.666998,0.137546


In [18]:
# Select the first two rows and the first two columns by position
# (positions 0 and 1 on each axis; the stop position 2 is excluded).
frame.iloc[0:2,0:2]

Unnamed: 0,A,B
a,0.418301,0.928158
b,0.564331,0.171577


In [19]:
# Position lists pick arbitrary (not necessarily adjacent) rows and/or columns:
#   1. row positions 0 and 2, all columns
#   2. all rows, column positions 0 and 2
#   3. the first and fourth rows (0, 3) with the first and fourth columns (0, 3)
print(
    frame.iloc[[0, 2], :],
    frame.iloc[:, [0, 2]],
    frame.iloc[[0, 3], [0, 3]],
    sep="\n",
)

          A         B         C         D
a  0.418301  0.928158  0.825037  0.583013
c  0.936742  0.882623  0.742222  0.638585
          A         C
a  0.418301  0.825037
b  0.564331  0.666998
c  0.936742  0.742222
d  0.349898  0.743017
          A         D
a  0.418301  0.583013
d  0.349898  0.906269
