# DataFrame 基礎操作

In [1]:
import pandas as pd

## 創建

In [2]:
# Build the roster frame in a single constructor call: one column per list,
# with the playing positions used as the row index.
numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jordan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
df2 = pd.DataFrame(
    {"number": numbers, "player": players},
    index=["PG", "SG", "SF", "PF", "C"],
)
df2

Unnamed: 0,number,player
PG,9,Ron Harper
SG,23,Michael Jordan
SF,33,Scottie Pippen
PF,91,Dennis Rodman
C,13,Luc Longley


## 讀檔

In [3]:
# Read the gapminder CSV from its public URL into a DataFrame (requires network access).
csv_url = "https://storage.googleapis.com/learn_pd_like_tidyverse/gapminder.csv"
df = pd.read_csv(csv_url)
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


## 檢視

In [4]:
# Preview the first five observations (rows).
df.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [5]:
# Preview the last five observations (rows).
df.tail(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [6]:
df.info()     # Show a combined summary of the DataFrame:
              # its size, whether any values are null, and the dtype of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [7]:
df.describe() # Show descriptive statistics for the numeric variables

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165877
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846989
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [8]:
df.shape      # Show the dimensions of the DataFrame as (rows, columns)

(1704, 6)

## 選取

In [9]:
# Label-based selection: rows and columns are picked by index label / column name.
row_labels = ["SG", "SF", "PF"]
column_labels = ["number", "player"]
df2.loc[row_labels, column_labels]

Unnamed: 0,number,player
SG,23,Michael Jordan
SF,33,Scottie Pippen
PF,91,Dennis Rodman


In [10]:
# Position-based selection: rows and columns are picked by integer position.
row_positions = [1, 2, 3]
column_positions = [0, 1]
df2.iloc[row_positions, column_positions]

Unnamed: 0,number,player
SG,23,Michael Jordan
SF,33,Scottie Pippen
PF,91,Dennis Rodman


In [11]:
# Filter by jersey number: build a boolean mask, print it, then keep the True rows.
trio_numbers = [23, 33, 91]
is_trio = df2["number"].isin(trio_numbers)
print(is_trio)
df2.loc[is_trio]

PG    False
SG     True
SF     True
PF     True
C     False
Name: number, dtype: bool


Unnamed: 0,number,player
SG,23,Michael Jordan
SF,33,Scottie Pippen
PF,91,Dennis Rodman


In [12]:
# Filter by player name: build a boolean mask, print it, then keep the True rows.
trio_names = ["Michael Jordan", "Scottie Pippen", "Dennis Rodman"]
is_trio = df2["player"].isin(trio_names)
print(is_trio)
df2.loc[is_trio]

PG    False
SG     True
SF     True
PF     True
C     False
Name: player, dtype: bool


Unnamed: 0,number,player
SG,23,Michael Jordan
SF,33,Scottie Pippen
PF,91,Dennis Rodman


In [13]:
# Pass a list of column names to select several columns in the listed order.
column_order = ["player", "number"]
df2[column_order]

Unnamed: 0,player,number
PG,Ron Harper,9
SG,Michael Jordan,23
SF,Scottie Pippen,33
PF,Dennis Rodman,91
C,Luc Longley,13


In [14]:
# All rows, column at position 1; a single integer position yields a Series.
df2.iloc[:, 1]

PG        Ron Harper
SG    Michael Jordan
SF    Scottie Pippen
PF     Dennis Rodman
C        Luc Longley
Name: player, dtype: object

In [15]:
# All rows, columns at positions 1 then 0; a list of positions yields a DataFrame
# whose columns follow the listed order.
df2.iloc[:, [1, 0]]

Unnamed: 0,player,number
PG,Ron Harper,9
SG,Michael Jordan,23
SF,Scottie Pippen,33
PF,Dennis Rodman,91
C,Luc Longley,13


## 排序

In [16]:
# Sort rows by index label in ascending order.
df2.sort_index(ascending=True)

Unnamed: 0,number,player
C,13,Luc Longley
PF,91,Dennis Rodman
PG,9,Ron Harper
SF,33,Scottie Pippen
SG,23,Michael Jordan


In [17]:
df2.sort_index(ascending=False) # Sort rows by index label in descending order

Unnamed: 0,number,player
SG,23,Michael Jordan
SF,33,Scottie Pippen
PG,9,Ron Harper
PF,91,Dennis Rodman
C,13,Luc Longley


In [18]:
# Sort by year in ascending order, then preview the first five rows.
by_year_ascending = df.sort_values(by="year", ascending=True)
by_year_ascending.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
528,France,Europe,1952,67.41,42459667,7029.809327
540,Gabon,Africa,1952,37.003,420702,4293.476475
1656,West Bank and Gaza,Asia,1952,43.16,1030585,1515.592329
552,Gambia,Africa,1952,30.0,284320,485.230659


In [19]:
# Sort by year in descending order, then preview the first five rows.
by_year_descending = df.sort_values(by="year", ascending=False)
by_year_descending.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298
491,Equatorial Guinea,Africa,2007,51.579,551201,12154.08975
515,Ethiopia,Africa,2007,52.947,76511887,690.805576
527,Finland,Europe,2007,79.313,5238460,33207.0844
539,France,Europe,2007,80.657,61083916,30470.0167


In [20]:
# Sort by year ascending, then by continent descending within each year.
df.sort_values(
    by=["year", "continent"],
    ascending=[True, False],
).head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
60,Australia,Oceania,1952,69.12,8691212,10039.59564
1092,New Zealand,Oceania,1952,69.39,1994794,10556.57566
12,Albania,Europe,1952,55.23,1282697,1601.056136
72,Austria,Europe,1952,66.8,6927772,6137.076492
108,Belgium,Europe,1952,68.0,8730405,8343.105127


## 新增欄位(新增變數)

In [21]:
# Derive a new column with .map() and a lambda applied to every player name.
# The surname is taken as the final whitespace-separated token ([-1]) rather than
# the second one ([1]): a one-word name no longer raises IndexError and a name with
# more than two words no longer yields a middle name. For two-word names the
# result is unchanged.
df2["last_name"] = df2["player"].map(lambda full_name: full_name.split()[-1])
df2

Unnamed: 0,number,player,last_name
PG,9,Ron Harper,Harper
SG,23,Michael Jordan,Jordan
SF,33,Scottie Pippen,Pippen
PF,91,Dennis Rodman,Rodman
C,13,Luc Longley,Longley


In [22]:
# Add two columns: a scalar is repeated for every row, while a list supplies
# one value per row.
heights = ["6-6", "6-6", "6-8", "6-7", "7-2"]
df2["team"] = "Chicago Bulls"
df2["height"] = heights
df2

Unnamed: 0,number,player,last_name,team,height
PG,9,Ron Harper,Harper,Chicago Bulls,6-6
SG,23,Michael Jordan,Jordan,Chicago Bulls,6-6
SF,33,Scottie Pippen,Pippen,Chicago Bulls,6-8
PF,91,Dennis Rodman,Rodman,Chicago Bulls,6-7
C,13,Luc Longley,Longley,Chicago Bulls,7-2


## 新增觀測值

In [23]:
# Build the new observation as a one-row DataFrame; it only supplies "number"
# and "player", so the remaining columns are left missing for this row.
toni_kukoc = pd.DataFrame({"number": [7], "player": ["Toni Kukoc"]})
# DataFrame.append is deprecated (pandas 1.4) and removed (pandas 2.0); pd.concat
# is the supported replacement.
#   ignore_index=True renumbers the rows 0..n-1, replacing reset_index(drop=True).
#   sort=True keeps the alphabetically sorted column order that the legacy append
#   default produced, and makes the choice explicit so no FutureWarning is raised.
df2 = pd.concat([df2, toni_kukoc], ignore_index=True, sort=True)
df2

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,height,last_name,number,player,team
0,6-6,Harper,9,Ron Harper,Chicago Bulls
1,6-6,Jordan,23,Michael Jordan,Chicago Bulls
2,6-8,Pippen,33,Scottie Pippen,Chicago Bulls
3,6-7,Rodman,91,Dennis Rodman,Chicago Bulls
4,7-2,Longley,13,Luc Longley,Chicago Bulls
5,,,7,Toni Kukoc,


## 統計值摘要

In [24]:
# Total population over all rows recorded for 2007.
is_2007 = df["year"] == 2007
df.loc[is_2007, "pop"].sum()

6251013179

In [25]:
# Total 2007 population for each continent.
rows_2007 = df.loc[df["year"] == 2007]
grouped = rows_2007.groupby("continent")
grouped["pop"].sum()

continent
Africa       929539692
Americas     898871184
Asia        3811953827
Europe       586098529
Oceania       24549947
Name: pop, dtype: int64

In [26]:
# Total population for each (year, continent) pair; show the last ten groups.
group_keys = ["year", "continent"]
grouped = df.groupby(group_keys)
grouped["pop"].sum().tail(10)

year  continent
2002  Africa        833723916
      Americas      849772762
      Asia         3601802203
      Europe        578223869
      Oceania        23454829
2007  Africa        929539692
      Americas      898871184
      Asia         3811953827
      Europe        586098529
      Oceania        24549947
Name: pop, dtype: int64