# In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Step 1: Load dataset
# The path is relative, so the CSV must be in the current working directory;
# otherwise read_csv raises FileNotFoundError.
df = pd.read_csv("Titanic-Dataset.csv")

# Step 2: Handle missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['Age']: the chained in-place form acts on an intermediate object, is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Remove the Cabin, Name, Ticket and PassengerId columns from the frame.
df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'])

# Step 3: Encode categorical variables
# Sex becomes 0 (male) / 1 (female); any other value maps to NaN.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
# One-hot encode Embarked; drop_first=True omits the first category's column.
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Step 4: Visualize outliers
# Side-by-side boxplots of Age and Fare, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
age_fare = df[['Age', 'Fare']]
sns.boxplot(data=age_fare, ax=ax)
ax.set_title("Boxplots of Age and Fare (Before Outlier Removal)")
plt.show()

# Step 5: Remove outliers using IQR
# A row is dropped when its Age or Fare lies more than 1.5 * IQR below the
# first quartile or above the third quartile of that column.
outlier_cols = ['Age', 'Fare']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = (
    (df[outlier_cols] < lower_bound) | (df[outlier_cols] > upper_bound)
).any(axis=1)
# Take an explicit copy so that later column assignments on df_clean do not
# raise SettingWithCopyWarning.
df_clean = df[~is_outlier].copy()

# Step 6: Normalize numerical features
# Fit a StandardScaler on the four numeric columns and overwrite them with
# the transformed values.
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[numeric_cols])
df_clean[numeric_cols] = scaled_values

# Step 7: Save cleaned dataset
# index=False keeps the row index out of the written CSV.
df_clean.to_csv("Titanic_Cleaned.csv", index=False)

# Step 8: Visualizations

# Survival count
# One bar per Survived value, with the 0/1 ticks relabelled as text.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Survived', data=df_clean, ax=ax)
ax.set_title('Survival Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Survived', 'Survived'])
plt.show()

# Survival by sex
# Bars grouped by the encoded Sex column and split by Survived; the 0/1
# ticks are relabelled as Male/Female.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Sex', hue='Survived', data=df_clean, ax=ax)
ax.set_title('Survival by Sex')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Male', 'Female'])
plt.show()

# Survival by Pclass
# Bars grouped by passenger class and split by Survived.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Pclass', hue='Survived', data=df_clean, ax=ax)
ax.set_title('Survival by Passenger Class')
plt.show()

# Correlation heatmap
# Pairwise correlations of the columns of df_clean, annotated to two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = df_clean.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()


# Output: FileNotFoundError: [Errno 2] No such file or directory: 'Titanic-Dataset.csv'