<a href="https://colab.research.google.com/github/fongn1/titanic-analysis/blob/main/2025_Task_03_%E4%BD%9C%E6%A5%AD_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd

# Load the Titanic training set straight from the remote CSV.
df_train = pd.read_csv(
    'https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv'
)

# --- Step 1: data exploration and cleaning ---

# Print an overview of the frame to check its structure and non-null counts.
print("資料集概覽:")
df_train.info()

# Count the missing values in every column.
print("\n各欄位的遺失值數量:")
print(df_train.isna().sum())

# Handle missing values in one pass:
#   1. Age      -> filled with the column median
#   2. Fare     -> filled with the column median
#   3. Embarked -> filled with the most frequent value (mode)
#   4. Cabin    -> dropped, because too many of its values are missing
df_train = (
    df_train
    .fillna({
        'Age': df_train['Age'].median(),
        'Fare': df_train['Fare'].median(),
        'Embarked': df_train['Embarked'].mode()[0],
    })
    .drop(columns=['Cabin'])
)

# Handle categorical variables.
# Encode Sex numerically: 0 means female, 1 means male.
df_train['Sex'] = (
    df_train['Sex']
    .map({
        'female': 0,
        'male': 1,
    })
    .astype(int)
)

# One-hot encode Embarked (the first category is dropped).
df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)

# Re-check for missing values to confirm everything has been handled.
print("\n處理後的資料集概覽:")
df_train.info()
print("\n處理後各欄位的遺失值數量:")
print(df_train.isna().sum())


print("-" * 20)

# Pick the feature columns and the target column.
# 'PassengerId', 'Name' and 'Ticket' usually do not help the model's
# predictions, so they are excluded along with the target itself.
columns_y = ['Survived']
columns_X = df_train.columns.drop(['PassengerId', 'Name', 'Ticket', 'Survived'])

train_X = df_train[columns_X]
train_y = df_train[columns_y]


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


# Logistic regression scored by accuracy with 5-fold cross-validation.
log = LogisticRegression(max_iter=3000, random_state=0)
scores = cross_val_score(
    estimator=log,
    X=train_X,
    y=train_y.to_numpy().ravel(),  # flatten the one-column frame to a 1-D label array
    scoring='accuracy',
    cv=5,
)
print("交叉驗證的準確度分數:", scores)
print("平均準確度:", scores.mean())

資料集概覽:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

各欄位的遺失值數量:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked        