<a href="https://colab.research.google.com/github/fongn1/titanic-analysis/blob/main/%E3%80%8C2025_Task_04_%E4%BD%9C%E6%A5%AD_01_ipynb%E3%80%8D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Read the Titanic training CSV from its public GitHub URL into a DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv',
)

# --- 定義評估模型準確度的函式 ---
def evaluate_model(df, *, target='Survived',
                   drop_columns=('PassengerId', 'Name', 'Ticket'), cv=5):
    """Return the mean cross-validated accuracy of a logistic regression on ``df``.

    Every column of ``df`` except ``target`` and ``drop_columns`` is passed to
    the model as a feature, in its original column order.

    Args:
        df: DataFrame holding the label column, the columns to discard and
            the feature columns.
        target: Name of the label column.
        drop_columns: Names of the columns excluded from the features. Each
            one must be present in ``df``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The mean of the per-fold accuracy scores.

    Raises:
        KeyError: If ``target`` or any of ``drop_columns`` is not a column
            of ``df``.
    """
    # Select the features directly instead of round-tripping through a
    # column index; the remaining columns keep their order.
    train_X = df.drop(columns=[*drop_columns, target])
    train_y = df[target]

    # Fixed random_state keeps repeated runs comparable; the raised max_iter
    # gives the solver more iterations on these unscaled features.
    log = LogisticRegression(random_state=0, max_iter=3000)
    scores = cross_val_score(log, train_X, train_y.to_numpy(), cv=cv, scoring='accuracy')

    return scores.mean()

# --- Handle the non-numeric columns (identical for all three strategies) ---
# Cabin is removed rather than imputed.
df_train = df_train.drop(columns=['Cabin'])

# Fill missing Embarked entries with the most frequent value, then one-hot
# encode the column (the first category is dropped).
embarked_mode = df_train['Embarked'].mode()[0]
df_train['Embarked'] = df_train['Embarked'].fillna(embarked_mode)
df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)

# Convert Sex to integers: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
df_train['Sex'] = df_train['Sex'].map(sex_codes).astype(int)

# --- Compare imputation strategies for Age and Fare ---

# Strategy 1: fill with the column median.
df_median_fill = df_train.fillna({
    'Age': df_train['Age'].median(),
    'Fare': df_train['Fare'].median(),
})
median_score = evaluate_model(df_median_fill)
print(f"使用中位數填補的平均準確度: {median_score:.4f}")

# Strategy 2: fill with the column mean.
df_mean_fill = df_train.fillna({
    'Age': df_train['Age'].mean(),
    'Fare': df_train['Fare'].mean(),
})
mean_score = evaluate_model(df_mean_fill)
print(f"使用平均數填補的平均準確度: {mean_score:.4f}")

# Strategy 3: fill with the constant 0.
df_constant_fill = df_train.fillna({'Age': 0, 'Fare': 0})
constant_score = evaluate_model(df_constant_fill)
print(f"使用常數 0 填補的平均準確度: {constant_score:.4f}")





使用中位數填補的平均準確度: 0.7912
使用平均數填補的平均準確度: 0.7890
使用常數 0 填補的平均準確度: 0.7845


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Read the Titanic training CSV from its public GitHub URL into a DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv',
)

# --- 定義評估模型準確度的函式 ---
def evaluate_model(df, *, target='Survived',
                   drop_columns=('PassengerId', 'Name', 'Ticket'), cv=5):
    """Return the mean cross-validated accuracy of a logistic regression on ``df``.

    Every column of ``df`` except ``target`` and ``drop_columns`` is passed to
    the model as a feature, in its original column order.

    Args:
        df: DataFrame holding the label column, the columns to discard and
            the feature columns.
        target: Name of the label column.
        drop_columns: Names of the columns excluded from the features. Each
            one must be present in ``df``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The mean of the per-fold accuracy scores.

    Raises:
        KeyError: If ``target`` or any of ``drop_columns`` is not a column
            of ``df``.
    """
    # Select the features directly instead of round-tripping through a
    # column index; the remaining columns keep their order.
    train_X = df.drop(columns=[*drop_columns, target])
    train_y = df[target]

    # Fixed random_state keeps repeated runs comparable; the raised max_iter
    # gives the solver more iterations on these unscaled features.
    log = LogisticRegression(random_state=0, max_iter=3000)
    scores = cross_val_score(log, train_X, train_y.to_numpy(), cv=cv, scoring='accuracy')

    return scores.mean()

# --- Handle missing values (identical for both encoding strategies) ---
# Age and Fare are filled with their medians, Embarked with its most
# frequent value.
fill_values = {
    'Age': df_train['Age'].median(),
    'Fare': df_train['Fare'].median(),
    'Embarked': df_train['Embarked'].mode()[0],
}
df_train = df_train.fillna(fill_values)

# Cabin is removed rather than imputed.
df_train = df_train.drop(columns=['Cabin'])

# --- Compare encoding strategies for Sex and Embarked ---

# Strategy 1: one-hot encoding (the first category of each column is dropped).
df_onehot = pd.get_dummies(df_train, columns=['Sex', 'Embarked'], drop_first=True)
onehot_score = evaluate_model(df_onehot)
print(f"使用 One-Hot Encoding 的平均準確度: {onehot_score:.4f}")

# Strategy 2: label encoding. The same encoder instance is refitted for each
# column, so afterwards it only holds the classes of the last column.
df_label = df_train.copy()
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    df_label[column] = le.fit_transform(df_label[column])
label_score = evaluate_model(df_label)
print(f"使用 Label Encoding 的平均準確度: {label_score:.4f}")

使用 One-Hot Encoding 的平均準確度: 0.7912
使用 Label Encoding 的平均準確度: 0.7890


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Load the dataset.
df_train = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')

# --- 1. Preprocessing ---

# Fill missing Age and Fare with their medians, and missing Embarked with its
# most frequent value.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())
df_train['Fare'] = df_train['Fare'].fillna(df_train['Fare'].median())
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

# Drop the Cabin column.
df_train = df_train.drop('Cabin', axis=1)

# --- 2. Feature engineering ---

# Extract the title from Name: the run of letters that follows a space and is
# terminated by a period. The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid Python string
# escape (a SyntaxWarning on Python 3.12+).
df_train['Title'] = df_train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
common_titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr']
# Keep the listed titles and bucket everything else (including names where no
# title was matched) as 'Other'.
df_train['Title'] = df_train['Title'].where(df_train['Title'].isin(common_titles), 'Other')

# Build the FamilySize feature: siblings/spouses + parents/children + 1.
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1

# --- 3. Categorical variables ---

# One-hot encode Sex, Embarked and Title (the first category of each is dropped).
df_train = pd.get_dummies(df_train, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

# Drop the original columns that are not used as features.
df_train = df_train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# --- 4. Model training and evaluation ---

print("-" * 20)

# Choose the feature columns and the target column.
target = 'Survived'
# Every column other than the target is used as a feature.
features = [col for col in df_train.columns if col != target]

train_X = df_train[features]
train_y = df_train[target]

log = LogisticRegression(random_state=0, max_iter=3000)
scores = cross_val_score(log, train_X, train_y.to_numpy(), cv=5, scoring='accuracy')

print("交叉驗證的準確度分數:", scores)
print("平均準確度:", scores.mean())

--------------------
交叉驗證的準確度分數: [0.83240223 0.82022472 0.80337079 0.80337079 0.86516854]
平均準確度: 0.8249074132195091
