In [1]:
from os import name
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset("titanic")

print("Head - first 5 rows: \n", titanic.head(5))
print("\n dataset(matrix) shape: \n", titanic.shape)
print("\n make a list of column names: \n", titanic.columns.tolist())


print("\n Information of dataset: \n", titanic.info())
print("\n Description - numeric summary: \n", titanic.describe())
print("\n How many nulls per column: \n", titanic.isna().sum())

# Keep a working copy
df = titanic.copy()

# drop rows missing age, keep things simple
df = df.dropna(subset=["age"])

print("\nSelect multiple columns: \n", df[["survived","sex","class"]].head())

# Rows by label vs position
print("\n Inclusive row slice by label: \n", df.loc[0:4 , ["sex","age","fare"]])

print("\n Exclusive stop by position: \n", df.iloc[0:5, [
                                            df.columns.get_loc("sex"),
                                              df.columns.get_loc("age"),
                                                df.columns.get_loc("fare")]])

# Boolean filter examples
print("\n Print fares higher than 50: \n", df[df["fare"] > 50])

print("\n Print only females and first class: \n", df[(df["sex"] == "female") & (df["class"] == "First")])


#Survival rate overall
overall_survival = df["survived"].mean()
print("\n Overall survival average: ", overall_survival)


#Survival rate by sex
by_sex = df.groupby("sex")["survived"].mean().reset_index(name= "survival_rate")
print("\n Survival rate by sex: \n", by_sex)


#Survival rate by (sex, class)
by_sex_class = (df.groupby(["sex","class"])["survived"].mean().reset_index(name="survival_rate")
                                                        .sort_values(["sex","survival_rate"], ascending=[True, False]))

print("\n Survival rate by sex and class: \n", by_sex_class)


#Ranking & Sorting
top10_fare = df.sort_values("fare", ascending=False)[["sex", "age", "class", "fare"]].head(10)
print("\n Top 10 fares: \n", top10_fare)


df["family_size"] = df["sibsp"] + df["parch"] + 1

print("\n Family size: \n", df[["sibsp","parch","family_size"]].head(8))


# Bin family sizes: 1, 2-3, 4+
bins = [0, 1, 3, 20]
labels = ["solo", "small", "large"]
df["family_band"] = pd.cut(df["family_size"], bins=bins, labels=labels, right=True, include_lowest=True)

survival_by_family = (df
    .groupby("family_band")["survived"]
    .mean()
    .reset_index(name="survival_rate")
    .sort_values("survival_rate", ascending=False))
print("\n Survival by family: \n", survival_by_family)

by_sex.to_csv("titanic_survival_by_sex.csv", index=False)
by_sex_class.to_csv("titanic_survival_by_sex_class.csv", index=False)
survival_by_family.to_csv("titanic_survival_by_family_band.csv", index=False)




















Head - first 5 rows: 
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

 dataset(matrix) shape: 
 (891, 15)

 make a list of column names: 
 ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town'

  by_sex_class = (df.groupby(["sex","class"])["survived"].mean().reset_index(name="survival_rate")
  .groupby("family_band")["survived"]
