In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Load the raw Titanic passenger data into a DataFrame.
# NOTE(review): `location` is an absolute path on one specific Windows machine,
# so this cell will fail anywhere else — point it at your local copy of titanic.csv.
location = r'C:\Users\91963\Documents\DS\Elevate labs\Task 1- Data Cleaning & Preprocessing\titanic.csv'
titanic = pd.read_csv(location)

# Quick orientation: size, a sample of rows, dtypes / non-null counts, summary stats.
print("Dataset shape:", titanic.shape)
print("\nFirst 5 rows:")
print(titanic.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
titanic.info()
print("\nSummary statistics:")
# include='all' also summarises the non-numeric columns (count / unique / top / freq).
print(titanic.describe(include='all'))

In [None]:
# Report how many values are missing in each column before any treatment.
print("Missing values per column:")
print(titanic.isnull().sum())

# Age: fill the gaps with the column median.
# The result is assigned back rather than calling fillna(inplace=True) on
# titanic['Age']: that form is a chained in-place update on a column selection,
# which raises a FutureWarning in pandas 2.x and, with Copy-on-Write enabled,
# acts on a temporary object and leaves `titanic` unchanged.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Cabin: the column is dropped instead of imputed (presumably because it is too
# sparse to be useful — confirm against the counts printed above).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
titanic = titanic.drop(columns='Cabin', errors='ignore')

# Embarked: fill the gaps with the most frequent value (first mode).
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

# Confirm the treatment worked.
print("\nMissing values after treatment:")
print(titanic.isnull().sum())

In [None]:
# The non-numeric columns this cell deals with.
categorical_cols = ['Sex', 'Embarked', 'Name', 'Ticket']

# Sex: replace the string labels with integer codes from a LabelEncoder.
titanic['Sex'] = LabelEncoder().fit(titanic['Sex']).transform(titanic['Sex'])

# Embarked: expand into one indicator column per value, named Embarked_<value>.
titanic = pd.get_dummies(data=titanic, prefix='Embarked', columns=['Embarked'])

# Name and Ticket are not encoded; they are removed from the frame.
titanic.drop(columns=['Name', 'Ticket'], inplace=True)

# Show the header line and the first rows of the encoded frame.
print("\nAfter encoding:", titanic.head(), sep="\n")

In [None]:
# Only these three columns are standardised; every other column is left as-is.
numerical_cols = ['Age', 'Fare', 'Pclass']

# Learn each column's mean and standard deviation, then overwrite the columns
# with their standardised values. `scaler` stays available as the fitted object.
scaler = StandardScaler().fit(titanic[numerical_cols])
titanic[numerical_cols] = scaler.transform(titanic[numerical_cols])

# Show the header line and the first rows of the scaled columns.
print("\nAfter scaling:", titanic[numerical_cols].head(), sep="\n")

In [None]:
# Draw one boxplot per column in `numerical_cols`, side by side in a single row,
# each panel titled with its column name.
plt.figure(figsize=(15, 5))
n_panels = len(numerical_cols)

for position, col in enumerate(numerical_cols, start=1):
    # Grab the axes for this panel explicitly and draw onto it.
    ax = plt.subplot(1, n_panels, position)
    sns.boxplot(y=titanic[col], ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[col]`` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, so later assignments on the
        result never touch (or warn about) ``df``. Original index labels are
        preserved, not reset. Rows where ``col`` is NaN are dropped, because a
        NaN is never inside the fences.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = factor * (q3 - q1)

    # between() is inclusive on both ends and evaluates to False for NaN.
    keep = values.between(q1 - margin, q3 + margin)
    return df.loc[keep].copy()

# Filter on Age first, then on Fare; the Fare fences are therefore computed
# only from the rows that survived the Age filter.
titanic = (
    titanic
    .pipe(remove_outliers, 'Age')
    .pipe(remove_outliers, 'Fare')
)

print("Dataset shape after outlier removal:", titanic.shape)

In [None]:
# Final look at the cleaned frame: dtypes / non-null counts, then a sample of rows.
print("\nFinal cleaned dataset info:")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(titanic.info()) would append a stray "None" line.
titanic.info()
print("\nFirst 5 rows of cleaned data:")
print(titanic.head())

# Write the cleaned data to the current working directory. index=False leaves
# the row index (no longer contiguous after row filtering) out of the file.
titanic.to_csv('cleaned_titanic.csv', index=False)