In [1]:
'''
Title: Titanic Survival Analysis - Exploratory Data Analysis
Name: Daniel Muthama
Date: 30 May 2025
Dataset: https://www.kaggle.com/competitions/titanic
'''

# Import libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Load dataset
# The '/kaggle/input' mount only exists inside a Kaggle kernel, so the file is
# looked up in several places, in priority order:
#   1. the path given by the TITANIC_TRAIN_CSV environment variable (if set),
#   2. the standard Kaggle competition mount,
#   3. a 'train.csv' sitting next to the notebook.
candidate_paths = [Path('/kaggle/input/titanic/train.csv'), Path('train.csv')]
if os.environ.get('TITANIC_TRAIN_CSV'):
    candidate_paths.insert(0, Path(os.environ['TITANIC_TRAIN_CSV']))

train_csv = next((path for path in candidate_paths if path.is_file()), None)
if train_csv is None:
    # Same exception type pandas would raise, but with an actionable message.
    searched = ', '.join(str(path) for path in candidate_paths)
    raise FileNotFoundError(
        f"Titanic train.csv not found (looked in: {searched}). "
        "Download it from https://www.kaggle.com/competitions/titanic "
        "or set the TITANIC_TRAIN_CSV environment variable to its location."
    )
df = pd.read_csv(train_csv)

# 1. INITIAL DATA EXPLORATION
print("=== INITIAL DATA EXPLORATION ===")
print(f"Dataset Dimensions: {df.shape}")
print("\nFirst 5 Rows:")
print(df.head())
print("\nData Types and Missing Values:")
# DataFrame.info() writes its report straight to stdout and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
print(df.describe(include='all'))
print("\nUnique Values per Column:")
print(df.nunique())
print(f"\nDuplicate Rows: {df.duplicated().sum()}")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic/train.csv'

In [None]:
# 2. HANDLING MISSING VALUES & OUTLIERS
print("\n=== MISSING VALUES & OUTLIERS ===")
# Visualize missing values (drawn before any imputation below, so the heatmap
# reflects the frame exactly as it was loaded)
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.savefig('missing_values.png')
plt.show()

# Missing data treatment
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form operates on an intermediate Series, is
# deprecated in pandas 2.x and leaves df untouched under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# errors='ignore' keeps the cell re-runnable once 'Cabin' is already gone.
df = df.drop(columns='Cabin', errors='ignore')
print("\nMissing Values After Treatment:")
print(df.isnull().sum())

# Outlier detection and treatment
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
# Left panel is drawn first so it captures the fares before they are capped.
sns.boxplot(x=df['Fare'])
plt.title('Fare Distribution Before Treatment')

# Cap fare outliers using IQR method: only the upper tail is capped, at
# Q3 + 3*IQR; fares at or below that fence are left unchanged.
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
fare_cap = Q3 + 3*IQR
df['Fare'] = np.where(df['Fare'] > fare_cap, fare_cap, df['Fare'])

plt.subplot(1, 2, 2)
sns.boxplot(x=df['Fare'])
plt.title('Fare Distribution After Treatment')
plt.tight_layout()
plt.savefig('outlier_treatment.png')
plt.show()

In [None]:
# 3. UNIVARIATE ANALYSIS
print("\n=== UNIVARIATE ANALYSIS ===")

# Categorical features: one count plot per column on a 2x2 grid.
cat_cols = ['Survived', 'Pclass', 'Sex', 'Embarked']
cat_fig, cat_axes = plt.subplots(2, 2, figsize=(15, 10))
for panel, col in zip(cat_axes.flat, cat_cols):
    sns.countplot(data=df, x=col, ax=panel)
    panel.set_title(f'{col} Distribution')
cat_fig.tight_layout()
cat_fig.savefig('categorical_distributions.png')
plt.show()

# Numerical features: histogram with a KDE overlay, side by side.
num_cols = ['Age', 'Fare']
num_fig, num_axes = plt.subplots(1, 2, figsize=(12, 5))
for panel, col in zip(num_axes.flat, num_cols):
    sns.histplot(df[col], kde=True, ax=panel)
    panel.set_title(f'{col} Distribution')
num_fig.tight_layout()
num_fig.savefig('numerical_distributions.png')
plt.show()

In [None]:
# 4. BIVARIATE ANALYSIS
print("\n=== BIVARIATE ANALYSIS ===")

# Survival vs categorical features: counts split by the Survived flag.
# Three panels are placed on a 2x2 grid, so the fourth slot stays empty.
cat_fig = plt.figure(figsize=(15, 10))
for slot, col in enumerate(('Pclass', 'Sex', 'Embarked'), start=1):
    panel = cat_fig.add_subplot(2, 2, slot)
    sns.countplot(data=df, x=col, hue='Survived', ax=panel)
    panel.set_title(f'Survival by {col}')
cat_fig.tight_layout()
cat_fig.savefig('survival_categorical.png')
plt.show()

# Survival vs numerical features: box plot of each column per Survived value.
num_fig, num_axes = plt.subplots(1, 2, figsize=(12, 5))
for panel, col in zip(num_axes.flat, ('Age', 'Fare')):
    sns.boxplot(data=df, x='Survived', y=col, ax=panel)
    panel.set_title(f'Survival by {col}')
num_fig.tight_layout()
num_fig.savefig('survival_numerical.png')
plt.show()

In [None]:
# 5. MULTIVARIATE ANALYSIS
print("\n=== MULTIVARIATE ANALYSIS ===")

# Age distribution per passenger class, each violin split by the Survived flag.
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x='Pclass', y='Age', hue='Survived', split=True, ax=violin_ax)
violin_ax.set_title('Survival by Class and Age')
violin_fig.savefig('pclass_age_survival.png')
plt.show()

# Passenger counts per embarkation port, coloured by the Survived flag.
port_fig, port_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Embarked', hue='Survived', palette='viridis', ax=port_ax)
port_ax.set_title('Survival by Embarkation Port')
port_fig.savefig('embarked_survival.png')
plt.show()

# Pairwise correlations between the numeric columns, annotated with the
# coefficient in every cell.
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.savefig('correlation_matrix.png')
plt.show()

In [None]:
# 6. TARGET VARIABLE ANALYSIS
print("\n=== TARGET VARIABLE (SURVIVED) ANALYSIS ===")
# Target distribution
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='Survived')
plt.title('Survival Distribution (0 = Died, 1 = Survived)')
plt.savefig('target_distribution.png')
plt.show()

# Survived is plotted above as 0/1, so its mean is the survival fraction.
print(f"Survival Rate: {df['Survived'].mean()*100:.1f}%")

# Key survival factors
# errorbar=None replaces the deprecated ci=None (seaborn >= 0.12): the bars
# show the mean of Survived per group with no error bars drawn.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.barplot(data=df, x='Sex', y='Survived', errorbar=None)
plt.title('Survival Rate by Gender')

plt.subplot(1, 2, 2)
sns.barplot(data=df, x='Pclass', y='Survived', errorbar=None)
plt.title('Survival Rate by Class')
plt.tight_layout()
plt.savefig('key_survival_factors.png')
plt.show()

# Interactive effect: Class and Gender
# One line per sex across passenger classes; non-parallel lines indicate that
# the effect of class on survival differs between the sexes.
plt.figure(figsize=(10, 6))
sns.pointplot(data=df, x='Pclass', y='Survived', hue='Sex', errorbar=None)
plt.title('Survival Rate: Class-Gender Interaction')
plt.savefig('class_gender_interaction.png')
plt.show()

print("\n=== ANALYSIS COMPLETE ===")