In [2]:
# 讀入 pandas 套件
import pandas as pd
import numpy as np

In [3]:
# area 是一個 Series
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
# pop 是一個 Series
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})

In [5]:
# 將 area 和 pop 兩個 Series 組成一個 DataFrame，並設定 column name 為 'area' 和 'pop'
data = pd.DataFrame({'area': area, 'pop': pop})
print(data)

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


In [6]:
# 我們可以存取個別的 Series
print(data['area'])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [7]:
# 另一個存取個別的 Series 的方法 (但不是每種情況都適用此方法!!)
print(data.area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [8]:
# 我們可以檢查取用 Series 的方法
print('data.area is data[\'data\']? ', data.area is data['area'])
print('data.pop is data[\'pop\']? ', data.pop is data['pop'])

data.area is data['data']?  True
data.pop is data['pop']?  False


In [9]:
# 類似於 dictionary-like 的語法，我們也可以直接在現有的 DataFrame 新增一個欄位 (Series)
data['density'] = data['pop'] / data['area']
print(data)

              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [10]:
# 單純檢視 DataFrame 資料
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [11]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [12]:
# Transpose；對調列數和欄位
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [13]:
data['density'] = data['pop'] / data['area']
print(data)

              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [14]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [15]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [16]:
print(data)

              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [17]:
# 取出第 0 到第 2 列的第 0 到第 1 欄位的資料
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [18]:
# 上述的"取出第 0 到第 2 列的第 0 到第 1 欄位的資料"，可以用 loc 嗎?
data.loc[:'New York', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [19]:
# 合併 masking (data.density > 100) 和 fancy indexing (['pop', 'density'])
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [20]:
# 更改 data 內容
data.iloc[0, 2] = 90
print(data)

              area       pop     density
California  423967  38332521   90.000000
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [21]:
# 直接從 URL 讀取資料集 Chiporders，並存入 orders 資料框 (DataFrame)
orders = pd.read_table('http://bit.ly/chiporders')

In [22]:
# 查看前 5 筆的 orders 資料
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [29]:
#請取出 chipotle 索引第 0 筆到第 10 筆資料的 item_name 到 item_price 欄位內容。
orders.loc[:10,['item_name','item_price']]

Unnamed: 0,item_name,item_price
0,Chips and Fresh Tomato Salsa,$2.39
1,Izze,$3.39
2,Nantucket Nectar,$3.39
3,Chips and Tomatillo-Green Chili Salsa,$2.39
4,Chicken Bowl,$16.98
5,Chicken Bowl,$10.98
6,Side of Chips,$1.69
7,Steak Burrito,$11.75
8,Steak Soft Tacos,$9.25
9,Steak Burrito,$9.25


In [30]:
orders.loc[:10,['quantity','item_name','item_price']]

Unnamed: 0,quantity,item_name,item_price
0,1,Chips and Fresh Tomato Salsa,$2.39
1,1,Izze,$3.39
2,1,Nantucket Nectar,$3.39
3,1,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2,Chicken Bowl,$16.98
5,1,Chicken Bowl,$10.98
6,1,Side of Chips,$1.69
7,1,Steak Burrito,$11.75
8,1,Steak Soft Tacos,$9.25
9,1,Steak Burrito,$9.25


In [31]:
# 讀取資料集
movies = pd.read_csv('http://bit.ly/imdbratings')

In [32]:
# 操作方法會有小括號
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [33]:
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [34]:
# 檢視欄位名稱
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [35]:
# 重新命名欄位名稱
# 方法一: 使用 rename() 方法和 python dictionary
ufo.rename(columns={'Colors Reported':'UFO顏色', 'Shape Reported':'UFO形狀'}).head()

Unnamed: 0,City,UFO顏色,UFO形狀,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [36]:
ufo.rename(columns={'Colors Reported':'UFO顏色', 'Shape Reported':'UFO形狀'}, inplace=True)
ufo.head()

Unnamed: 0,City,UFO顏色,UFO形狀,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [37]:
# 重新命名欄位名稱
# 方法二: 重新指定所有欄位名稱
ufo_cols = ['城市', 'UFO顏色', 'UFO形狀', '州', '時間']
ufo.columns = ufo_cols
ufo.head()

Unnamed: 0,城市,UFO顏色,UFO形狀,州,時間
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [40]:
# 重新命名欄位名稱
# 方法三: 讀取檔案時，直接重新命名欄位名稱
ufo = pd.read_csv('http://bit.ly/uforeports', header=0, names = ufo_cols)
ufo.head()

Unnamed: 0,城市,UFO顏色,UFO形狀,州,時間
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [41]:
# 再讀取一次 users
users = pd.read_csv('http://bit.ly/movieusers', sep='|', names=col_names)
#users = pd.read_table('http://bit.ly/movieusers', sep='|')
users.head()

Unnamed: 0,序號,編號,性別,工作職稱,代號
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [42]:
#users 資料集沒有預設的 header，所以讀取成為 DataFrame 後，需要再加入 header
# 設定 欄位名稱
col_names = ['序號', '編號', '性別', '工作職稱', '代號']
# 寫入 users
users.columns = col_names

In [43]:
# 再顯示前 5 筆資料
users.head()

Unnamed: 0,序號,編號,性別,工作職稱,代號
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [44]:
ufo.head()

Unnamed: 0,城市,UFO顏色,UFO形狀,州,時間
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [45]:
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [46]:
# 以底線 '_' 取代空格，使用 replace() 函式
# ufo.columns.str 會回傳一個 StringMethods 物件，使用其 replace() 將空格以底線取代 
ufo.columns = ufo.columns.str.replace(' ', '_')

In [47]:
# 再檢查 ufo.columns 的欄位，空格已經被底線取代
ufo.columns

Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')

In [49]:
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [50]:
# 方法一 使用 Bracket notation
ufo.drop(['City'], axis=1).head()

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00


In [51]:
# 但是...事實上 'City' 並沒有被刪除
ufo.head() 

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [52]:
ufo.drop(['City'], axis=1, inplace=True)
ufo.head()

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00


In [53]:
# 方法二 使用 Dot notation
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [54]:
ufo.drop('City', axis=1, inplace=True)
ufo.head()

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00


In [55]:
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [56]:
# 一次移除多個欄位
# 同時移除 'City', 'State'
ufo.drop(['City', 'State'], axis=1, inplace=True)

In [57]:
ufo.head()

Unnamed: 0,Colors Reported,Shape Reported,Time
0,,TRIANGLE,6/1/1930 22:00
1,,OTHER,6/30/1930 20:00
2,,OVAL,2/15/1931 14:00
3,,DISK,6/1/1931 13:00
4,,LIGHT,4/18/1933 19:00


In [4]:
ufo = pd.read_csv('Air.csv')
ufo.head()

Unnamed: 0,SiteName,County,SO2,PM10,PM2.5,NO2,NOx,NO,Longitude,Latitude
0,高雄(左營),高雄市,4.1,65.0,32.0,27.0,33.0,5.3,120.316744,22.691622
1,桃園(觀音工業區),桃園市,0.9,32.0,11.0,9.5,13.0,3.7,121.128044,25.063039
2,高雄(楠梓),高雄市,4.4,69.0,36.0,32.0,36.0,4.0,120.300819,22.718297
3,屏東(琉球),屏東縣,2.9,37.0,23.0,9.0,11.0,1.7,120.377222,22.352222
4,新北(樹林),新北市,2.1,35.0,16.0,29.0,41.0,12.0,121.383528,24.949028


In [59]:
# 若想要刪除 index = 1 (city 為 'Willingboro' 開頭的列數)
# 首先要確認 ufo.index[1] 會還回傳 1
ufo.drop(ufo.index[1], axis = 0, inplace=True)
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30


In [60]:
# 連續刪除 rows [2:5]
ufo.drop(ufo.index[2:5], axis=0, inplace=True)
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00


In [13]:
Air = pd.read_csv('Air.csv')
Air.head()

Unnamed: 0,SiteName,County,SO2,PM10,PM2.5,NO2,NOx,NO,Longitude,Latitude
0,高雄(左營),高雄市,4.1,65.0,32.0,27.0,33.0,5.3,120.316744,22.691622
1,桃園(觀音工業區),桃園市,0.9,32.0,11.0,9.5,13.0,3.7,121.128044,25.063039
2,高雄(楠梓),高雄市,4.4,69.0,36.0,32.0,36.0,4.0,120.300819,22.718297
3,屏東(琉球),屏東縣,2.9,37.0,23.0,9.0,11.0,1.7,120.377222,22.352222
4,新北(樹林),新北市,2.1,35.0,16.0,29.0,41.0,12.0,121.383528,24.949028


In [61]:
#請再讀取 Air.csv 檔案，並依照 PM2.5 欄位由大到小的順序排序每筆資料。
Air = pd.read_csv('Air.csv')
Air.sort_values(['PM2.5'],ascending=False)

Unnamed: 0,SiteName,County,SO2,PM10,PM2.5,NO2,NOx,NO,Longitude,Latitude
30,左營,高雄市,5.0,70.0,39.0,29.0,35.0,6.1,120.292917,22.674861
33,大寮,高雄市,17.0,82.0,38.0,35.0,44.0,9.5,120.425081,22.565747
2,高雄(楠梓),高雄市,4.4,69.0,36.0,32.0,36.0,4.0,120.300819,22.718297
28,前鎮,高雄市,7.9,63.0,34.0,33.0,41.0,8.3,120.307564,22.605386
39,安南,臺南市,2.3,47.0,33.0,14.0,16.0,2.1,120.217500,23.048197
...,...,...,...,...,...,...,...,...,...,...
36,橋頭,高雄市,,58.0,,,,,120.305689,22.757506
38,臺南,臺南市,3.3,,,21.0,28.0,7.0,120.202617,22.984581
61,竹東,新竹縣,2.8,,,12.0,15.0,3.0,121.088903,24.740644
63,龍潭,桃園市,,23.0,,,,,121.216350,24.863869


In [59]:
#請篩選出緯度 (Latitude) 大於等於 21.95 且緯度小於 23.55 地區的資料，並依照 PM10的數值由大到小排列。
Air['Longitude']=Air['Longitude'].astype(float) 
Air['Latitude']=Air['Latitude'].astype(float) 
Air.loc[(Air.Longitude> 21.95)&(Air.Latitude<23.55)].sort_values('PM2.5', ascending=False)

Unnamed: 0,SiteName,County,SO2,PM10,PM2.5,NO2,NOx,NO,Longitude,Latitude
30,左營,高雄市,5.0,70.0,39.0,29.0,35.0,6.1,120.292917,22.674861
33,大寮,高雄市,17.0,82.0,38.0,35.0,44.0,9.5,120.425081,22.565747
2,高雄(楠梓),高雄市,4.4,69.0,36.0,32.0,36.0,4.0,120.300819,22.718297
28,前鎮,高雄市,7.9,63.0,34.0,33.0,41.0,8.3,120.307564,22.605386
39,安南,臺南市,2.3,47.0,33.0,14.0,16.0,2.1,120.2175,23.048197
31,楠梓,高雄市,5.2,56.0,33.0,22.0,26.0,3.3,120.328289,22.733667
0,高雄(左營),高雄市,4.1,65.0,32.0,27.0,33.0,5.3,120.316744,22.691622
35,仁武,高雄市,5.3,53.0,31.0,26.0,31.0,4.6,120.332631,22.689056
14,復興,高雄市,6.5,53.0,30.0,38.0,42.0,4.3,120.312017,22.608711
27,小港,高雄市,8.6,55.0,30.0,32.0,43.0,11.0,120.337736,22.565833


In [67]:
'''
請讀取一個網址 'http://bit.ly/kaggletrain' 的檔案，此檔案是 1912 年鐵達尼號船難時乘客的部分資料紀錄，讀取後請將之命名為 'titanic'，
並顯示前 5 筆資料。titanic 共有 12 個欄位，其中 Survived 欄位若為 0，表示船難時身亡；欄位若為 1 表示順利存活；Pclass 欄位表示乘客的
艙等， 1 表示最好的 1 等艙, 2 表示尚可的 2 等艙, 3 表示較差的 3 等艙。
'''
titanic=pd.read_csv('http://bit.ly/kaggletrain')
titanic.head()
#len(titanic)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [84]:
#請計算出不同艙等的乘客人數各是多少人。
G=0
N=0
B=0
G=len(titanic[titanic.Pclass == 1])
N=len(titanic[titanic.Pclass == 2])
B=len(titanic[titanic.Pclass == 3])
print('最好的 1 等艙有:', G ,'個人')
print('尚可的 2 等艙有:', N ,'個人')
print('較差的 3 等艙有:', B ,'個人')
print('總共有:',G+N+B,'個人')

最好的 1 等艙有: 216 個人
尚可的 2 等艙有: 184 個人
較差的 3 等艙有: 491 個人
總共有: 891 個人


In [83]:
#請計算出不同艙等的存活的人數各是多少人
G1=len(titanic.loc[(titanic.Pclass == 1)&(titanic.Survived == 1)])
N1=len(titanic.loc[(titanic.Pclass == 2)&(titanic.Survived == 1)])
B1=len(titanic.loc[(titanic.Pclass == 3)&(titanic.Survived == 1)])
print('最好的 1 等艙有:', G1 ,'個人存活')
print('尚可的 2 等艙有:', N1 ,'個人存活')
print('較差的 3 等艙有:', B1 ,'個人存活')
print('總共有:',G1+N1+B1,'個人存活')

最好的 1 等艙有: 136 個人存活
尚可的 2 等艙有: 87 個人存活
較差的 3 等艙有: 119 個人存活
總共有: 342 個人存活


In [95]:
#請計算出各個艙等的存活率各是多少。
Gpercent=G1/G
print(Gpercent)
Npercent=N1/N
print(Npercent)
Bpercent=B1/B
print(Bpercent)

Gpercent='%.2f' % Gpercent
Npercent='%.2f' % Npercent
Bpercent='%.2f' % Bpercent
print('最好的 1 等艙存活率為:', Gpercent)
print('尚可的 2 等艙存活率為:', Npercent)
print('較差的 3 等艙存活率為:', Bpercent)

0.6296296296296297
0.47282608695652173
0.24236252545824846
最好的 1 等艙存活率為: 0.63
尚可的 2 等艙存活率為: 0.47
較差的 3 等艙存活率為: 0.24


In [114]:
#請計算出船票價格 (Fare) 排名前 20 名和後 20 名的乘客的存活率。
Low=titanic.sort_values(['Fare']).head(20)#費用低至高
Top=titanic.sort_values(['Fare'],ascending=False).head(20)#費用高至低
LowTotal=len(Low)
LowSurvived=len(Low[Low.Survived==1])
TopTotal=len(Top)
TopSurvived=len(Top[Top.Survived==1])

LowPercent=LowSurvived/LowTotal
TopPercent=TopSurvived/TopTotal

print('付費最高的前二十位存活率為:', TopPercent)
print('付費最低的前二十位存活率為:', LowPercent)

付費最高的前二十位存活率為: 0.7
付費最低的前二十位存活率為: 0.05


In [123]:
#請計算出性別 (Sex 欄位) 是男性 (male) 且存活的人數以及存活率
F1=titanic.loc[(titanic.Sex == 'male')]
F1Total=len(F1)
F1Survived=len(F1[F1.Survived==1])
F1percent=F1Survived/F1Total
print('男性存活人數為:',F1Survived,'個人')
F1percent='%.2f' % F1percent
print('男性存活率為:',F1percent)

男性存活人數為: 109 個人
男性存活率為: 0.19


In [130]:
#請計算出性別 (Sex 欄位) 是女性 (female) 且存活的人數以及存活率。
F2=titanic.loc[(titanic.Sex == 'female')]
F2Total=len(F2)
F2Survived=len(F2[F2.Survived==1])
F2percent=F2Survived/F2Total
print('女性存活人數為:',F2Survived,'個人')
F2percent='%.2f' % F2percent
print('女性存活率為:',F2percent)

女性存活人數為: 233 個人
女性存活率為: 0.74


In [136]:
#請計算出性別 (Sex 欄位) 是男性 (male) 且乘坐 1 等艙乘客的存活率。
F3=titanic.loc[(titanic.Sex == 'male') & (titanic.Pclass == 1)]
F3Total=len(F3)
F3Survived=len(F3[F3.Survived==1])
F3percent=F3Survived/F3Total
F3percent='%.2f' % F3percent
print('乘坐 1 等艙的男性存活率為:',F3percent)

乘坐 1 等艙的男性存活率為: 0.37


In [140]:
#請計算出性別 (Sex 欄位) 是女性 (female) 且乘坐 1 等艙乘客的存活率。
F4=titanic.loc[(titanic.Sex == 'female') & (titanic.Pclass == 1)]
F4Total=len(F4)
F4Survived=len(F4[F4.Survived==1])
F4percent=F4Survived/F4Total
F4percent='%.2f' % F4percent
print('乘坐 1 等艙的女性存活率為:',F4percent)

乘坐 1 等艙的女性存活率為: 0.97
