# 安裝套件

In [None]:
!pip install numpy
!pip install pandas



# 引入套件

In [None]:
import pandas as pd
import numpy as np

# Numpy 的基本操作

## 創造 numpy.ndarray

In [None]:
# Wrapping a Python list with np.array yields a numpy.ndarray, which gives
# access to everything NumPy provides.
a = np.array([1, 2, 3])
b = np.array([[1, 2, 3], [4, 5, 6]])

# Report type and shape of the 1-D array, a blank line, then the 2-D array.
print(type(a), a.shape, sep="\n", end="\n\n")
print(type(b), b.shape, sep="\n")

<class 'numpy.ndarray'>
(3,)

<class 'numpy.ndarray'>
(2, 3)


In [None]:
# NumPy ships constructors for common arrays: all zeros, all ones,
# a constant fill, the identity matrix, and random values.
a = np.zeros(shape=(2, 2))
b = np.ones(shape=(1, 2))
c = np.full(shape=(2, 2), fill_value=7)
d = np.eye(2)
e = np.random.random(size=(2, 2))

# Print the first four arrays separated by blank lines, then the random one.
print(a, b, c, d, sep="\n\n", end="\n\n")
print(e)

[[0. 0.]
 [0. 0.]]

[[1. 1.]]

[[7 7]
 [7 7]]

[[1. 0.]
 [0. 1.]]

[[0.97870584 0.70825039]
 [0.61034856 0.12338432]]


## 取值

In [None]:
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print(a, end = '\n\n')

# Index an ndarray with square brackets; for a 2-D array the value before
# the comma selects the row and the value after it selects the column.
print(a[1], end = '\n\n')
print(a[1, 1], end = '\n\n')

# A colon selects a range; the stop index is exclusive.
# Here: rows 0 and 1 (0:2), columns 1 and 2 (1:3).
b = a[0:2, 1:3]
print(b, end = '\n\n')

# A list of indices picks non-contiguous entries.
# Here: rows 0 and 2, columns 1 and 2 (1:3).
c = a[[0, 2], 1:3]
print(c, end = '\n\n')

# A boolean condition selects the matching elements as a 1-D array.
d = a[a > 2]
print(d)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]

[5 6 7 8]

6

[[2 3]
 [6 7]]

[[ 2  3]
 [10 11]]

[ 3  4  5  6  7  8  9 10 11 12]


## Array 數學運算

In [None]:
# Ordinary arithmetic operators act element by element: position [0, 0] of
# one array is combined with position [0, 0] of the other, and so on.
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])

# Sum, difference, product and quotient, each followed by a blank line.
print(x + y, x - y, x * y, x / y, sep="\n\n", end="\n\n")

# Element-wise square root of x.
print(np.sqrt(x))

[[ 6  8]
 [10 12]]

[[-4 -4]
 [-4 -4]]

[[ 5 12]
 [21 32]]

[[0.2        0.33333333]
 [0.42857143 0.5       ]]

[[1.         1.41421356]
 [1.73205081 2.        ]]


In [None]:
# Dot products are available as well.
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
v = np.array([9, 10])
w = np.array([11, 12])

# vector . vector, matrix . vector, matrix . matrix -- blank line between each.
print(v.dot(w), x.dot(v), x.dot(y), sep="\n\n")

219

[29 67]

[[19 22]
 [43 50]]


In [None]:
# Arithmetic does not require equal shapes: NumPy broadcasts the operands
# (here the length-3 vector y is added to every row of the 4x3 array x).
x = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])
y = np.array([1, 0, 1])
x + y

array([[ 2,  2,  4],
       [ 5,  5,  7],
       [ 8,  8, 10],
       [11, 11, 13]])

# Pandas 的基本操作

## 自製 DataFrame

In [None]:
# Build a DataFrame from a mapping of column name -> list of values.
df = pd.DataFrame(
    dict(
        Name=["Jerry", "Mary", "Tom"],
        Age=[25, 29, 30],
        Sex=["male", "female", "male"],
    )
)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,Age,Sex
0,Jerry,25,male
1,Mary,29,female
2,Tom,30,male


## 取出特定 column

In [None]:
# Selecting a single column by its label returns a pandas Series.
print(type(df["Name"]))
df["Name"]

<class 'pandas.core.series.Series'>


0    Jerry
1     Mary
2      Tom
Name: Name, dtype: object

## 自製 Series 並加進 DataFrame

In [None]:
# Create a Series and give it a name; the name is shown when it is displayed.
height = pd.Series(data=[172, 180, 181], name="height")
print(type(height))
height

<class 'pandas.core.series.Series'>


0    172
1    180
2    181
Name: height, dtype: int64

In [None]:
# Attach the Series to df as a new "height" column, then display the result.
df["height"] = height
df

Unnamed: 0,Name,Age,Sex,height
0,Jerry,25,male,172
1,Mary,29,female,180
2,Tom,30,male,181


## 計算各 column 的統計數值

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Age,height
count,3.0,3.0
mean,28.0,177.666667
std,2.645751,4.932883
min,25.0,172.0
25%,27.0,176.0
50%,29.0,180.0
75%,29.5,180.5
max,30.0,181.0


## 讀取 csv 檔

In [None]:
# Load the Titanic training data from a CSV file; the path is relative to
# the current working directory.
titanic = pd.read_csv("titanic_data/train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## 取前幾個 row 或後幾個 row 出來看

In [None]:
# Show the first 7 rows.
titanic.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [None]:
# Show the last 7 rows.
titanic.tail(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## 確認各 column 的資料型態

In [None]:
# Data type of each column.
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 查看各 column 缺失值狀況及資料型態

In [None]:
# Per-column non-null counts and dtypes, plus the memory usage of the frame.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 一次取多個 column

In [None]:
# Passing a list of column names selects several columns at once; the
# result is a DataFrame. Preview its first 10 rows.
sex_survived = titanic[["Sex", "Survived"]]
sex_survived.head(10)

Unnamed: 0,Sex,Survived
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0
5,male,0
6,male,0
7,male,0
8,female,1
9,female,1


## 根據條件篩選想看的 row

In [None]:
# Keep only the rows whose Sex equals "female" by indexing with a boolean mask.
female_survived = sex_survived[sex_survived["Sex"] == "female"]
female_survived.head(10)

Unnamed: 0,Sex,Survived
1,female,1
2,female,1
3,female,1
8,female,1
9,female,1
10,female,1
11,female,1
14,female,0
15,female,1
18,female,0


In [None]:
# The two approaches below are equivalent.
# With approach 2, wrap each comparison in parentheses, and use | for "or" and & for "and".

# Approach 1
# pclass_1_3 = titanic[titanic["Pclass"].isin([1, 3])]

# Approach 2
pclass_1_3 = titanic[(titanic["Pclass"] == 1) | (titanic["Pclass"] == 3)]
pclass_1_3.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S


## 取出及設定 [row, column] 位置的值

In [None]:
# .loc[row_selector, column_label]: the Age values of the rows where Pclass == 1.
pclass_age = titanic.loc[titanic["Pclass"] == 1, "Age"]
pclass_age.head(10)

1     38.0
3     35.0
6     54.0
11    58.0
23    28.0
27    19.0
30    40.0
31     NaN
34    28.0
35    42.0
Name: Age, dtype: float64

In [None]:
# .iloc selects by integer position: row positions 5-14 and column
# positions 1-2 (the stop position of each slice is exclusive).
some_data = titanic.iloc[5:15, 1:3]
some_data

Unnamed: 0,Survived,Pclass
5,0,3
6,0,1
7,0,3
8,1,3
9,1,2
10,1,3
11,1,1
12,0,3
13,0,3
14,0,3


In [None]:
# Assigning through .iloc writes into the DataFrame: the column at
# position 3 (Name) of the first five rows is overwritten with "Jerry".
titanic.iloc[0:5, 3] = "Jerry"
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Jerry,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Jerry,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Jerry,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Jerry,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Jerry,male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# 練習一
## 找出 Sex 是 male 且 Age > 50 的資料，把他們的 Age 都改為 50 

## 最後將 DataFrame 存成 excel 檔，再嘗試讀取 excel 檔，確定存的檔案沒有問題

In [None]:
# Write the DataFrame to an .xlsx file in a sheet named "sheet_1";
# index=False leaves the row index out of the file.
# NOTE(review): .xlsx output depends on an Excel engine such as openpyxl
# being installed -- confirm it is available in this environment.
titanic.to_excel("titanic_data/train.xlsx", sheet_name = "sheet_1",index = False)

In [None]:
# Read the "sheet_1" sheet back from the saved file and preview the first
# 7 rows to check that the export round-trips correctly.
titanic = pd.read_excel("titanic_data/train.xlsx", sheet_name="sheet_1")
titanic.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Jerry,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Jerry,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Jerry,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Jerry,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Jerry,male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
