# Titanic Dataset - Exploratory Data Analysis (EDA)

Steps included:
1. Load dataset & quick look
2. Inspect missing values & duplicates (Age imputation happens in step 7)
3. Univariate analysis (numeric + categorical)
4. Bivariate analysis (features vs survival)
5. Correlation & pairwise relationships
6. Outlier detection
7. Feature engineering (FamilySize, Title, Age imputation)
8. Multicollinearity check (VIF)
9. Summary of findings



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Display settings: lift the pandas column cap so wide frames print in full,
# and apply the seaborn "whitegrid" style to subsequent plots.
pd.options.display.max_columns = None
sns.set(style="whitegrid")


In [None]:
# Load dataset (ensure titanic.csv is in data/ folder), then show its
# (rows, columns) shape and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')
print(df.shape)
df.head(5)


In [None]:
# Column dtypes / non-null counts, followed by summary statistics for every
# column (numeric and non-numeric), transposed so each row is one column of df.
df.info()
df.describe(include='all').transpose()

In [None]:
# Missing values & duplicates
duplicate_rows = df.duplicated()
print("Duplicates:", duplicate_rows.sum())

# Percentage of missing entries per column, highest first; only columns that
# actually have missing values are printed.
missing = df.isna().mean().sort_values(ascending=False) * 100
print(missing.loc[missing.gt(0)])

# Visualize missingness
msno.matrix(df)


In [None]:
# Univariate numeric analysis
# Collect every int64/float64 column and draw one histogram per column.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
df[num_cols].hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Boxplots of the non-null Age and Fare values, one figure each
for col in ('Age', 'Fare'):
    plt.figure(figsize=(6, 3))
    ax = sns.boxplot(x=df[col].dropna())
    ax.set_title(col)
    plt.show()

# Skewness of each numeric column, most positively skewed first
df[num_cols].skew().sort_values(ascending=False)


In [None]:
# Univariate categorical analysis
cat_cols = ['Sex', 'Pclass', 'Embarked']

# One count plot per categorical column, bars ordered by sorted category value
for c in cat_cols:
    plt.figure(figsize=(6, 3))
    levels = sorted(df[c].dropna().unique())
    ax = sns.countplot(data=df, x=c, order=levels)
    ax.set_title(c)
    plt.show()

# Survival percent by category: each row of the table is normalised to 100
for c in cat_cols:
    survival_pct = pd.crosstab(
        df[c], df['Survived'], margins=False, normalize='index'
    ).mul(100)
    print(c)
    print(survival_pct)
    print()


In [None]:
# Bivariate analysis: target vs features

# Age distribution within each Survived group
plt.figure(figsize=(8, 4))
ax = sns.boxplot(data=df, x='Survived', y='Age')
ax.set_title('Age vs Survived')
plt.show()

# Survived aggregated per Pclass, with one bar per Sex
plt.figure(figsize=(8, 4))
ax = sns.barplot(data=df, x='Pclass', y='Survived', hue='Sex')
ax.set_title('Survival by class and sex')
plt.show()


In [None]:
# Correlation matrix
# NOTE: this rebinds num_cols to a hand-picked subset, replacing the list
# computed in the univariate cell.
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']
corr = df[[*num_cols, 'Survived']].corr()
plt.figure(figsize=(6, 4))
ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
ax.set_title('Correlation matrix')
plt.show()

# Pairplot (subset only); rows with a missing value in any plotted column are dropped
pair_cols = ['Age', 'Fare', 'Pclass', 'Survived']
sns.pairplot(df[pair_cols].dropna(), hue='Survived', diag_kind='hist')


In [None]:
# Outlier detection with IQR
def iqr_outliers(series, k=1.5):
    """Return a boolean mask marking values outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``series`` and ``IQR = Q3 - Q1``.

    Args:
        series: Numeric pandas Series. Missing values are ignored when the
            quartiles are computed and are never flagged as outliers.
        k: Fence multiplier applied to the IQR (default 1.5).

    Returns:
        Boolean Series with the same index as ``series``; True where the value
        is strictly below the lower fence or strictly above the upper fence.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - k * iqr, q3 + k * iqr
    # Explicit comparisons instead of ``~series.between(...)``: NaN compares
    # False on both sides, so missing values are not reported as outliers
    # (negating ``between`` turned every NaN into True).
    return (series < lower) | (series > upper)

# Build the mask on the non-null fares and index that same series with it.
# Indexing the full df with a mask computed on a dropna()'d series raises an
# "unalignable boolean Series" error as soon as Fare contains a missing value.
fare = df['Fare'].dropna()
outliers_fare = iqr_outliers(fare)
fare[outliers_fare].describe()


In [None]:
# Feature engineering

# Family size: SibSp + Parch + 1 (the +1 presumably counts the passenger)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Extract Title from Name: the text after the first comma (and any
# whitespace) up to the next period; rows with no match get NaN.
df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
# print() is needed here: a bare expression in the middle of a cell is not displayed.
print(df['Title'].value_counts().head())

# Age imputation by Title median.
# transform() returns a Series aligned to df's own index, so it can be used
# directly with fillna(); groupby(...).apply(...) may return a result indexed
# by (Title, row) that cannot be assigned back to the column. Known ages are
# left untouched, including on rows whose Title is missing.
title_median_age = df.groupby('Title')['Age'].transform('median')
df['Age'] = df['Age'].fillna(title_median_age)
# Fallback for rows still missing (Title missing, or no known Age in the Title group)
df['Age'] = df['Age'].fillna(df['Age'].median())


In [None]:
# Multicollinearity check (VIF)
X = pd.get_dummies(df[['Pclass','Age','Fare','FamilySize']], drop_first=True).dropna()

# variance_inflation_factor regresses column i on the remaining columns of
# the matrix it is given and does not add an intercept itself. Without a
# constant column the auxiliary regressions are uncentered and the reported
# VIFs are inflated, so append one to the design matrix. The cast to float
# keeps the array numeric even if get_dummies produced boolean columns.
design = X.astype(float).assign(const=1.0)

# The constant is the last column, so indices 0..len(X.columns)-1 are exactly
# the real features; the VIF of the constant itself is not reported.
vif = pd.DataFrame({
    'feature': X.columns,
    'VIF': [variance_inflation_factor(design.values, i) for i in range(X.shape[1])],
})
vif
