In [2]:
import numpy as np
import pandas as pd

# ---------- Configuration ----------
# NOTE(review): the doubled ".csv" extension is kept exactly as in the original
# cell — confirm it matches the file name on disk.
RAW_CSV_PATH = "titanic.csv.csv"
CLEAN_CSV_PATH = "cleaned_data.csv"

# Age bucket edges (years) and the label for each bucket:
#   Child: 0-18, Adult: >18-60, Senior: >60-100.
# Ages outside 0-100 still fall outside every bucket and get a missing Age_Group.
AGE_BINS = [0, 18, 60, 100]
AGE_LABELS = ['Child', 'Adult', 'Senior']

# ---------- Load ----------
# Keep the raw frame under its own name so the cleaning below never mutates it.
raw_df = pd.read_csv(RAW_CSV_PATH)

# ---------- Cleaning ----------
# One non-mutating chain; every step returns a new frame, so re-running the
# cell always starts from `raw_df` and produces the same `df`.
df = (
    raw_df
    # Drop Cabin if present (errors='ignore' replaces the explicit column check).
    .drop(columns=['Cabin'], errors='ignore')
    # Impute missing Age / Fare with the column median.
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Fare=lambda frame: frame['Fare'].fillna(frame['Fare'].median()),
    )
    # Remove exact duplicate rows.
    .drop_duplicates()
    # Low-cardinality columns as category. CSV stores plain text, so this dtype
    # is not carried into the exported file and must be re-applied after reload.
    .astype({'Pclass': 'category', 'Sex': 'category'})
    # Bucket Age. include_lowest=True makes the first bucket contain its left
    # edge, so an Age of exactly 0 is labelled 'Child' instead of becoming NaN.
    .assign(
        Age_Group=lambda frame: pd.cut(
            frame['Age'],
            bins=AGE_BINS,
            labels=AGE_LABELS,
            include_lowest=True,
        )
    )
)

# ---------- VERIFY CLEANING ----------
print("Missing values after cleaning:")
print(df.isnull().sum())

print("\nColumns in cleaned data:")
print(df.columns)

# ---------- SAVE CLEAN FILE ----------
# index=False keeps the positional index out of the exported file.
df.to_csv(CLEAN_CSV_PATH, index=False)

print("\n cleaned_data.csv created successfully")


Missing values after cleaning:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Age_Group      0
dtype: int64

Columns in cleaned data:
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked', 'Age_Group'],
      dtype='object')

 cleaned_data.csv created successfully


In [3]:
# Reload the exported file from disk and display its first five rows as a
# quick check of what was actually written.
pd.read_csv("cleaned_data.csv").iloc[:5]


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_Group
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Adult
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,Adult
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Senior
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Adult
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Adult


# Data Cleaning Summary

1. Loaded Titanic test dataset using pandas.
2. Identified missing values in Age, Fare, and Cabin columns.
3. Filled missing Age and Fare values using median.
4. Dropped Cabin column due to excessive missing values.
5. Removed duplicate records.
6. Converted the categorical columns Pclass and Sex to the pandas `category` data type.
7. Created an Age_Group feature by binning Age into Child (up to 18), Adult (over 18 to 60), and Senior (over 60 to 100).
8. Exported cleaned dataset for further analysis.