# 作業 : (Kaggle)鐵達尼生存預測

In [17]:
# 載入套件與資料
import pandas as pd
import numpy as np

data_path = 'data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')
df_train.shape

(891, 12)

In [18]:
# 重組資料成為訓練 / 預測用格式
train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# 秀出資料欄位的類型與數量
dtype_df = df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df = dtype_df.groupby("Column Type").aggregate('count').reset_index()
dtype_df

Unnamed: 0,Column Type,Count
0,int64,3
1,float64,2
2,object,5


In [20]:
#確定只有 int64, float64, object 三種類型後, 分別將欄位名稱存於三個 list 中
int_features = []
float_features = []
object_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64':
        float_features.append(feature)
    elif dtype == 'int64':
        int_features.append(feature)
    else:
        object_features.append(feature)
print(f'{len(int_features)} Integer Features : {int_features}\n')
print(f'{len(float_features)} Float Features : {float_features}\n')
print(f'{len(object_features)} Object Features : {object_features}')

3 Integer Features : ['Pclass', 'SibSp', 'Parch']

2 Float Features : ['Age', 'Fare']

5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


# 作業1 
* 試著執行作業程式，觀察三種類型 (int / float / object) 的欄位分別進行( 平均 mean / 最大值 Max / 相異值 nunique )  
中的九次操作會有那些問題? 並試著解釋那些發生Error的程式區塊的原因?  

> <font color='blue'> [Answer]:
> - (int) 欄位進行(平均 mean)操作: OK 
> - (int) 欄位進行(最大值 Max)操作: OK
> - (int) 欄位進行(相異值 nunique)操作: OK (useful if integers are used to represent different categories)
</font> 

> <font color='blue'> [Answer]:
> - (float) 欄位進行(平均 mean)操作: OK (meaningful to presenting statistics)
> - (float) 欄位進行(最大值 Max)操作: OK (meaningful to presenting statistics)
> - (float) 欄位進行(相異值 nunique)操作: no error but there are usually many different values in real number space (usually not helpful)
</font> 

> <font color='blue'> [Answer]:  
> - (object) 欄位進行(平均 mean)操作: NaN (string will be ignored in pandas arithmetic operations)
> - (object) 欄位進行(最大值 Max)操作: NaN (string will be ignored in pandas arithmetic operations)
> - (object) 欄位進行(相異值 nunique)操作: Error induced by comparison operators used by np.unique(). Can use pandas.Series.value_counts instead.
</font> 

# 作業2
* 思考一下，試著舉出今天五種類型以外的一種或多種資料類型，你舉出的新類型是否可以歸在三大類中的某些大類?  
所以三大類特徵中，哪一大類處理起來應該最複雜?

> <font color='blue'> [Answer]:  
> - object 欄位, 如果含有很多種不同值的話, 處理起來最複雜.
</font>

In [21]:
# 例 : 整數 (int) 欄位進行( 平均 mean / 最大值 Max / 相異值 nunique )操作
df_int = df[int_features].copy()
df_int.loc['mean'] = df_int.mean()
df_int.loc['max'] = df_int.max()
df_int.tail()

Unnamed: 0,Pclass,SibSp,Parch
415,3.0,0.0,0.0
416,3.0,0.0,0.0
417,3.0,1.0,1.0
mean,2.294882,0.498854,0.385027
max,3.0,8.0,9.0


In [28]:
for c in df_int:
    _ = np.unique(df_int[c])
    print(f'{len(_)} unique values in {c}: {_}\n')

4 unique values in Pclass: [1.         2.         2.29488159 3.        ]

8 unique values in SibSp: [0.         0.49885409 1.         2.         3.         4.
 5.         8.        ]

9 unique values in Parch: [0.         0.38502674 1.         2.         3.         4.
 5.         6.         9.        ]



In [29]:
# 例 : float 欄位進行( 平均 mean / 最大值 Max / 相異值 nunique )操作
df_flot = df[float_features].copy()
df_flot.loc['mean'] = df_flot.mean()
df_flot.loc['max'] = df_flot.max()
df_flot.tail()

Unnamed: 0,Age,Fare
415,38.5,7.25
416,,8.05
417,,22.3583
mean,29.881138,33.295479
max,80.0,512.3292


In [32]:
for c in df_flot:
    _ = np.unique(df_flot[c])
    print(f'{len(_)} unique values in {c}\n')

362 unique values in Age

283 unique values in Fare



In [33]:
# 例 : object 欄位進行( 平均 mean / 最大值 Max / 相異值 nunique )操作
df_object = df[object_features].copy()
df_object.loc['mean'] = df_object.mean()
df_object.loc['max'] = df_object.max()
df_object.tail()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
415,"Saether, Mr. Simon Sivertsen",male,SOTON/O.Q. 3101262,,S
416,"Ware, Mr. Frederick",male,359309,,S
417,"Peter, Master. Michael J",male,2668,,C
mean,,,,,
max,,,,,


In [34]:
for c in df_object:
    _ = np.unique(df_object[c])
    print(f'{len(_)} unique values in {c}\n')

TypeError: '<' not supported between instances of 'float' and 'str'

In [40]:
df_object_count = pd.DataFrame()
for c in df_object:
    print(f'In column-{c}, there are {len(df_object[c].value_counts())} different values')

In column-Name, there are 1307 different values
In column-Sex, there are 2 different values
In column-Ticket, there are 929 different values
In column-Cabin, there are 186 different values
In column-Embarked, there are 3 different values
