In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the passenger table from train.csv (path is relative to the working directory).
titanic = pd.read_csv('train.csv')
# Last expression of the cell: preview the first rows of the frame.
titanic.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to spot missing values.
titanic.info()

In [None]:
# Overall share of passengers who survived, as a percentage (shown in the title).
survival_rate = 100 * titanic['Survived'].mean()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(2, 1))
sns.countplot(data=titanic, x='Survived', ax=ax)
ax.set_xlabel('Survived')
ax.set_ylabel('Passenger Count')
ax.set_title('Survival Rate: {:.2f}%'.format(survival_rate))
# Replace the raw 0/1 codes on the x-axis with readable labels.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()

In [None]:
# Survival percentage per gender: mean of the 0/1 'Survived' flag within each 'Sex' group.
survival_by_gender = 100 * titanic.groupby('Sex')['Survived'].mean()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(2, 1))
sns.barplot(
    x=survival_by_gender.index,
    y=survival_by_gender.values,
    ax=ax,
)
ax.set(
    xlabel='Gender',
    ylabel='Survival Rate',
    title='Survival Rate by Gender',
)
plt.show()

In [None]:
# Histogram of passenger ages (20 bins) with a kernel-density curve overlaid.
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(data=titanic, x='Age', bins=20, kde=True, ax=ax)
ax.set(
    xlabel='Age',
    ylabel='Count',
    title='Distribution of Passenger Ages',
)
plt.show()

In [None]:
# Survival percentage for each passenger class ('Pclass').
survival_by_class = 100 * titanic.groupby('Pclass')['Survived'].mean()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(3, 2))
sns.barplot(
    x=survival_by_class.index,
    y=survival_by_class.values,
    ax=ax,
)
ax.set(
    xlabel='Passenger Class',
    ylabel='Survival Rate',
    title='Survival Rate by Passenger Class',
)
plt.show()

In [None]:
# Survival rate among different age groups.
# Bins are left-closed ([0, 12), [12, 18), ...), so a 12-year-old is 'Teenage'.
age_bins = [0, 12, 18, 30, 50, 100]
age_labels = ['Child', 'Teenage', 'Young Adult', 'Adult', 'Elderly']

# Build the buckets as a standalone Series rather than adding and then dropping
# a column on `titanic`: the shared frame is never modified, so the cell can be
# re-run or interrupted without leaking a stray 'AgeGroup' column.
# Rows with a missing Age get NaN here and are left out by the groupby.
age_group = pd.cut(
    titanic['Age'], bins=age_bins, labels=age_labels, right=False
).rename('AgeGroup')

# observed=False is pinned explicitly: pandas is changing the default for
# categorical groupers, and pinning it keeps every bucket on the axis and
# avoids the FutureWarning on recent versions.
survival_by_age_group = (
    titanic.groupby(age_group, observed=False)['Survived'].mean() * 100
)

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(x=survival_by_age_group.index, y=survival_by_age_group.values, ax=ax)
ax.set_xlabel('Age Group')
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Age Group')
plt.show()

In [None]:
# Survival percentage per port of embarkation ('Embarked').
# Rows with a missing port are excluded by groupby's default NaN handling.
survival_by_embarkation = 100 * titanic.groupby('Embarked')['Survived'].mean()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(3, 2))
sns.barplot(
    x=survival_by_embarkation.index,
    y=survival_by_embarkation.values,
    ax=ax,
)
ax.set(
    xlabel='Port of Embarkation',
    ylabel='Survival Rate',
    title='Survival Rate by Port of Embarkation',
)
plt.show()

In [None]:
# Survival rate by ticket-fare bracket.
# Bins are left-closed, so a fare of exactly 50 falls in '50-100'.
fare_bins = [0, 50, 100, 150, 200, 300, 1000]
fare_labels = ['0-50', '50-100', '100-150', '150-200', '200-300', '300+']

# Keep the brackets in a standalone Series rather than adding and then dropping
# a column on `titanic`, so the shared frame is never modified and the cell is
# safe to re-run or interrupt.
fare_group = pd.cut(
    titanic['Fare'], bins=fare_bins, labels=fare_labels, right=False
).rename('FareGroup')

# observed=False is pinned explicitly so every bracket stays on the axis and
# recent pandas versions do not warn about the changing default.
survival_by_fare_group = (
    titanic.groupby(fare_group, observed=False)['Survived'].mean() * 100
)

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(x=survival_by_fare_group.index, y=survival_by_fare_group.values, ax=ax)
ax.set_xlabel('Fare Group')
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Fare Group')
plt.show()

In [None]:
# Cabin deck = first letter found in the 'Cabin' string (NaN when Cabin is missing).
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
# The deck is kept as a standalone Series rather than a temporary column on
# `titanic`, so the shared frame is never modified.
cabin_deck = (
    titanic['Cabin']
    .str.extract(r'([A-Za-z])', expand=False)
    .rename('CabinDeck')
)

# Figure 1: how many passengers have a cabin on each deck (alphabetical order).
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(2, 1))
sns.countplot(x=cabin_deck, order=sorted(cabin_deck.dropna().unique()), ax=ax)
ax.set_xlabel('Cabin Deck')
ax.set_ylabel('Count')
ax.set_title('Distribution of Passenger Cabin Locations')
plt.show()

# Figure 2: survival percentage per deck. Passengers without a recorded cabin
# have a NaN deck and are excluded by groupby's default NaN handling.
survival_by_cabin_deck = titanic.groupby(cabin_deck)['Survived'].mean() * 100
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(x=survival_by_cabin_deck.index, y=survival_by_cabin_deck.values, ax=ax)
ax.set_xlabel('Cabin Deck')
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Cabin Deck')
plt.show()

In [None]:
# Survival rate by age group and gender.
# 'AgeGroup' is not a persistent column of `titanic`, so grouping on the bare
# column name raises KeyError on a fresh Restart & Run All. The buckets are
# rebuilt here on a temporary frame, reusing the age_bins / age_labels defined
# in the age-group cell above, and `titanic` itself is left untouched.
titanic_with_age_group = titanic.assign(
    AgeGroup=pd.cut(titanic['Age'], bins=age_bins, labels=age_labels, right=False)
)

# observed=False is pinned explicitly so every age bucket keeps a row in the
# pivot and recent pandas versions do not warn about the changing default.
survival_by_age_gender = (
    titanic_with_age_group
    .groupby(['AgeGroup', 'Sex'], observed=False)['Survived']
    .mean() * 100
)
# Rows = age group, columns = gender.
survival_pivot = survival_by_age_gender.unstack()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(3, 2))
sns.heatmap(data=survival_pivot, annot=True, cmap='coolwarm', fmt=".2f", cbar=True, ax=ax)
ax.set_xlabel('Gender')
ax.set_ylabel('Age Group')
ax.set_title('Survival Rate by Age Group and Gender')
plt.show()

In [None]:
# Family size = siblings/spouses + parents/children aboard + the passenger.
# Kept as a standalone Series used directly as the group key, rather than a
# column added to and then dropped from `titanic`, so the shared frame is
# never modified and the cell is safe to re-run or interrupt.
family_size = (titanic['SibSp'] + titanic['Parch'] + 1).rename('FamilySize')
survival_by_family_size = titanic.groupby(family_size)['Survived'].mean() * 100

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(2, 1))
sns.barplot(x=survival_by_family_size.index, y=survival_by_family_size.values, ax=ax)
ax.set_xlabel('Family Size')
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Family Size')
plt.show()

In [None]:
# Survival percentage for every (port of embarkation, passenger class) pair.
survival_by_embark_class = (
    100 * titanic.groupby(['Embarked', 'Pclass'])['Survived'].mean()
)
# Move the innermost level (Pclass) to the columns: rows = port, columns = class.
survival_pivot = survival_by_embark_class.unstack()

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(3, 2))
sns.heatmap(
    data=survival_pivot,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    cbar=True,
    ax=ax,
)
ax.set(
    xlabel='Passenger Class',
    ylabel='Port of Embarkation',
    title='Survival Rate by Port of Embarkation and Passenger Class',
)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# The frame also holds string columns (Name, Sex, Ticket, ...); calling .corr()
# on the whole frame raises ValueError on pandas >= 2.0, so the frame is
# restricted to numeric dtypes first. select_dtypes also works on older pandas.
numeric_corr = titanic.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr)
plt.title('Correlation Between Numeric Features')
plt.show()