In [9]:
import pandas as pd

# Display settings: print at most 20 rows and use a display width of 120 characters.
pd.options.display.max_rows = 20
pd.options.display.width = 120

# Load the Titanic passenger table directly from the raw GitHub CSV.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# DataFrame.info() writes its report to stdout itself, so it is not wrapped in print().
df.info()

# Normalise column labels: trim whitespace, lower-case, spaces -> underscores.
normalised_labels = df.columns.str.strip().str.lower()
df.columns = normalised_labels.str.replace(" ", "_", regex=False)

# Coerce the numeric fields; values that cannot be parsed become NaN.
numeric_cols = ["age", "fare", "sibsp", "parch", "survived", "pclass"]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)

# Keep only the rows in which every essential field is present.
df = df.dropna(subset=["age", "fare", "survived", "pclass", "sex"], how="any")

# Re-inspect the frame after cleaning.
df.info()

# Survival rate (mean of `survived`) per sex, highest rate first.
by_sex = (
    df.groupby("sex")["survived"]
    .mean()
    .rename("survival_rate")
    .reset_index()
    .sort_values(by="survival_rate", ascending=False)
)
print(by_sex)

# Per-class summary built with named aggregations: each key is an output
# column, each value a (source column, aggregation) pair.
pclass_aggs = {
    "passengers": ("survived", "count"),
    "survival_rate": ("survived", "mean"),
    "avg_age": ("age", "mean"),
    "total_fare": ("fare", "sum"),
}
pclass_summary = df.groupby("pclass").agg(**pclass_aggs).reset_index()
print(pclass_summary)

# Top-k fares per class; k = 3 corresponds to the "top3" export file name.
k = 3
# Order by class ascending, then fare descending, so that the first k rows of
# each class are its highest fares.
df_sorted = df.sort_values(by=["pclass", "fare"], ascending=[True, False])
topk_by_class = df_sorted.groupby("pclass", group_keys=False).head(k)
topk_cols = ["pclass", "sex", "age", "fare", "survived"]
print(topk_by_class[topk_cols])

# For each class, pick the age whose passengers have the highest mean `survived`.
# NOTE(review): groups are keyed on exact age values, so some groups may hold
# very few passengers, and idxmax keeps only the first of any tied maxima --
# consider binning ages or requiring a minimum group size; confirm intent.
rate_by_age_class = (
    df.groupby(["pclass", "age"])["survived"]
    .mean()
    .rename("survival_rate")
    .reset_index()
)
rates_grouped_by_class = rate_by_age_class.groupby("pclass")["survival_rate"]
idx = rates_grouped_by_class.idxmax()
best_per_class = rate_by_age_class.loc[idx]
print(best_per_class)

# Survival rate by class (rows) and sex (columns), expressed as percentages
# rounded to one decimal place; margins=True adds the "All" row and column.
pivot_pct = (
    df.pivot_table(
        values="survived",
        index="pclass",
        columns="sex",
        aggfunc="mean",
        margins=True,
    )
    .mul(100)
    .round(1)
)
print(pivot_pct)

# Frequency tables: embarkation counts (missing values included) and the
# share of passengers in each class.
embarked_counts = df["embarked"].value_counts(dropna=False)
pclass_share = df["pclass"].value_counts(normalize=True)
print(embarked_counts)
print(pclass_share)

# Sanity checks: one summary row per distinct sex, and the class summary
# exposes the columns expected of it.
assert len(by_sex) == df["sex"].nunique()
assert {"pclass", "survival_rate"} <= set(pclass_summary.columns)

# Export the summary tables as CSV files in the current working directory.
# Row indexes of the flat tables carry no information, so they are omitted.
by_sex.to_csv("day3_by_sex.csv", index=False)
pclass_summary.to_csv("day3_pclass_summary.csv", index=False)
# Build the file name from k so that it always states how many rows per class
# were kept (k = 3 gives "day3_top3_fare_by_class.csv").
topk_by_class.to_csv(f"day3_top{k}_fare_by_class.csv", index=False)
# The pivot keeps its index: the row labels (pclass values and the margin row)
# are stored there.
pivot_pct.to_csv("day3_pivot_survival_sex_pclass.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  714 non-null    int64  
 1   survived     