# 01 – Exploratory Data Analysis (EDA)

本 Notebook 對 Titanic 資料進行探索性資料分析，包括：

- 整體生存率 (baseline)
- 性別與生存率的關係
- 艙等與生存率
- 年齡分布與存活關係
- 家庭大小 FamilySize vs Survived
- 缺失值檢查

**目的：找出哪些特徵有潛力放進模型。**

In [None]:
import pandas as pd 
import matplotlib as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:

# Load the Titanic training data from the working directory.
train = pd.read_csv("train.csv")

# First inspection: leading rows, then column dtypes / non-null counts.
print(train.head(5))
train.info()

# Missing values per column (left as the cell's last expression so it is displayed).
train.isna().sum()

In [None]:
# Overall survival rate (baseline).
# value_counts(normalize=True) yields the share of each Survived label (0 / 1).
survival_rate = train["Survived"].value_counts(normalize=True)
survived_pct = survival_rate[1] * 100
not_survived_pct = survival_rate[0] * 100
print(f"{survived_pct:.2f}% survived, {not_survived_pct:.2f}% did not survive.")

In [None]:
# Survival rate by sex: the mean of the 0/1 Survived flag within each Sex group.
sex_survival = train["Survived"].groupby(train["Sex"]).mean()
print(sex_survival)

In [None]:
# Check whether survival differs between passenger classes.
survived_by_class = train.groupby("Pclass")["Survived"]
pclass_survival = survived_by_class.agg("mean")
print(pclass_survival)

In [None]:
# Look at age: summary statistics of the Age column
# (left as the cell's last expression so it is displayed).
train.loc[:, "Age"].describe()

In [None]:
# The notebook's import cell binds `plt` to the bare `matplotlib` package
# (`import matplotlib as plt`), which does not provide the pyplot functions
# used below. Import pyplot here so this cell works as intended.
import matplotlib.pyplot as plt

# Histogram of passenger ages in 20 bins.
plt.figure(figsize=(8, 5))
train["Age"].hist(bins=20)
plt.xlabel("Age")
plt.ylabel("Count")
plt.title("Age Distribution")
plt.show()

In [None]:
import plotly.express as px

# No visible cell creates an "AgeGroup" column, so grouping by it would raise a
# KeyError. Derive it here unless an earlier cell already did.
# NOTE(review): the bin edges / labels are a reasonable default chosen for this
# fix, not taken from elsewhere in the notebook — adjust if other bands are wanted.
if "AgeGroup" not in train.columns:
    age_bins = [0, 12, 18, 35, 60, float("inf")]
    age_labels = ["0-12", "13-18", "19-35", "36-60", "61+"]
    # Right-closed bands: [0, 12], (12, 18], (18, 35], (35, 60], (60, inf).
    # Rows with a missing Age get a missing AgeGroup and are left out of the
    # groupby below.
    train["AgeGroup"] = pd.cut(
        train["Age"],
        bins=age_bins,
        labels=age_labels,
        include_lowest=True,
    )

# Survival rate for every age band x sex combination.
# observed=True keeps only the combinations that actually occur, so the
# categorical AgeGroup does not add empty (NaN) bars.
grouped = (
    train.groupby(["AgeGroup", "Sex"], observed=True)["Survived"]
    .mean()
    .reset_index()
)

# Grouped bar chart: one cluster per age band, one bar per sex,
# with the bar value printed on each bar.
fig = px.bar(
    grouped,
    x="AgeGroup",
    y="Survived",
    color="Sex",
    barmode="group",
    title="生存率：不同年齡區間 x 性別",
    text_auto=True,
)

fig.show()

In [None]:
# FamilySize is only created in a later cell, so running the notebook top to
# bottom would fail here with a KeyError. Derive it in this cell as well:
# siblings/spouses + parents/children + the passenger themselves.
# Re-running the same assignment later is harmless.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1

# Mean survival rate per family size, as a two-column frame for plotting.
family_plot = train.groupby("FamilySize")["Survived"].mean().reset_index()

fig = px.bar(
    family_plot,
    x="FamilySize",
    y="Survived",
    title="FamilySize vs Survived",
    color="FamilySize",
    color_continuous_scale="Earth",  # earth-tone palette (Plotly's "Earth" color scale)
)

fig.show()

In [None]:
# Family size = siblings/spouses aboard + parents/children aboard + the passenger.
train["FamilySize"] = train["SibSp"].add(train["Parch"]).add(1)

# Survival rate for each family size.
family_survival = train["Survived"].groupby(train["FamilySize"]).mean()
print(family_survival)