# Task 1 — Data Cleaning & Preprocessing
Dataset used: **Titanic Dataset**

Below are the steps performed to clean and preprocess the dataset.

## 1) Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn later in the notebook.
sns.set(style='whitegrid')
# Visible confirmation that the setup cell ran to completion.
print('Libraries imported successfully')

## 2) Load dataset
Make sure the file is named **titanic.csv** and is located in the notebook's working directory (it is read via a relative path).

In [None]:
# Read the raw CSV (path is relative to the working directory) into `df`,
# the frame every later cell operates on.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
# Final expression of the cell: preview the first five rows.
df.head(n=5)

## 3) Basic overview

In [None]:
# A notebook cell only renders the value of its final expression, so any
# intermediate result must be printed explicitly or it is silently discarded.
df.info()  # prints its report (dtypes, non-null counts) on its own
print(df.describe())  # summary statistics of the numeric columns
df.isnull().sum()  # final expression: number of missing values per column

## 4) Handling missing values

In [None]:
# Fill Age with the column median.
if 'Age' in df.columns:
    df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill Embarked with its most frequent value.
# mode() returns an empty Series when every value is missing, so only fill
# when a mode actually exists instead of indexing [0] unconditionally.
if 'Embarked' in df.columns:
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

# Drop Cabin if it exists; errors='ignore' makes this a no-op otherwise.
df = df.drop(columns=['Cabin'], errors='ignore')

# Final expression: remaining missing-value count per column.
df.isnull().sum()

## 5) Encoding categorical columns

In [None]:
# One-hot encode whichever of the known categorical columns are present.
# drop_first=True omits the first level of each encoded column.
cat_cols = [name for name in ('Sex', 'Embarked') if name in df.columns]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)
# Final expression: preview the encoded frame.
df.head()

## 6) Outlier visualization

In [None]:
# Draw one horizontal boxplot per numeric column of interest, skipping any
# column that is absent from the frame.
for col in ('Age', 'Fare'):
    if col not in df.columns:
        continue
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=df[col])
    plt.title(f'{col} Boxplot')
    plt.show()

## 7) Outlier removal using IQR

In [None]:
# Keep only the rows whose Fare lies within the IQR fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; rows with a missing Fare are dropped as well
# because a NaN never compares as inside the range.
if 'Fare' in df.columns:
    Q1 = df['Fare'].quantile(0.25)
    Q3 = df['Fare'].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # Inclusive bounds: with strict comparisons an IQR of 0 collapses both
    # fences onto the same value and every row would be discarded.
    # .copy() yields an independent frame, so later column assignments do not
    # raise SettingWithCopyWarning.
    df = df[df['Fare'].between(lower_fence, upper_fence)].copy()
# Final expression: (rows, columns) after filtering.
df.shape

## 8) Feature scaling

In [None]:
# Standardise the numeric columns that are present with StandardScaler.
scale_cols = [col for col in ['Age','Fare'] if col in df.columns]
scaler = StandardScaler()
# Skip scaling when neither column exists: fit_transform rejects a frame
# with zero features.
if scale_cols:
    # df may be a filtered selection from an earlier step; work on an
    # independent copy so the assignment below does not raise
    # SettingWithCopyWarning.
    df = df.copy()
    df[scale_cols] = scaler.fit_transform(df[scale_cols])
# Final expression: preview the scaled frame.
df.head()

## 9) Save cleaned dataset

In [None]:
# Write the processed frame to disk; index=False leaves the row index out
# of the file.
df.to_csv(path_or_buf='cleaned_titanic.csv', index=False)
print('cleaned_titanic.csv saved')