In [1]:
import pandas as pd

# Read the Titanic passenger records from the local CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='titanic.csv')

In [2]:
# Count the missing entries in every column (isna flags NaN/None cells)
missing_values = data.isna().sum()

In [3]:
# Display the per-column missing-value counts
missing_values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Listwise deletion: keep only the rows in which every column is populated.
# Cabin alone is missing in 687 rows, so this removes at least that many rows.
data_cleaned = data.dropna(axis=0, how='any')

# Confirm that no column has missing values left after the drop
data_cleaned.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

As you can see, no missing values remain after dropping the incomplete rows.

# Filling Missing Values

In [5]:
# Experiment on an independent (deep) copy so the original frame stays untouched
data_1 = data.copy(deep=True)

In [6]:
# Fill missing Age values with the median of the observed ages.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the column selection: that form is chained
# in-place mutation, which raises a FutureWarning in pandas 2.x and leaves
# data_1 unchanged once Copy-on-Write is the default (pandas 3.0).
data_1['Age'] = data_1['Age'].fillna(data['Age'].median())

In [7]:
# Age should now report zero missing values; Cabin and Embarked are unchanged
data_1.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Fill missing Embarked values with the most frequent port.
# mode() returns a Series (there can be ties), so [0] picks the first mode.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the column selection: that form is chained
# in-place mutation, which raises a FutureWarning in pandas 2.x and leaves
# data_1 unchanged once Copy-on-Write is the default (pandas 3.0).
data_1['Embarked'] = data_1['Embarked'].fillna(data['Embarked'].mode()[0])
data_1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

## Imputation

In [9]:
# Start the imputation experiments from a fresh deep copy of the original data
data_2 = data.copy(deep=True)

In [10]:
# Baseline: missing-value counts of the fresh copy before any imputation
data_2.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [29]:
# Random-sample imputation: draw one observed age (without replacement) for
# every missing age and write the draws into the gaps.
age_is_missing = data_2['Age'].isna()
observed_ages = data_2.loc[~age_is_missing, 'Age']

# random_state=0 keeps the draw reproducible across runs
random_sample = observed_ages.sample(n=age_is_missing.sum(), random_state=0)

# Assign the raw values so pandas does not try to align on the sampled index
data_2.loc[age_is_missing, 'Age'] = random_sample.to_numpy()

In [30]:

# Verify that Age has no missing values left after random-sample imputation
data_2.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
from sklearn.impute import SimpleImputer

# Work on a separate copy so the original frame is left untouched
data_si = data.copy(deep=True)

# Mean imputation: learn the mean of the observed ages, then substitute it
# for every missing age.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data_si[['Age']])
imputed_data = imputer.transform(data_si[['Age']])

# imputed_data is a single-column 2-D array; write it back as the Age column
data_si['Age'] = imputed_data

In [19]:
# Number of rows returned by the imputer (should match the number of passengers)
imputed_data.shape[0]

891

In [15]:
# Verify that Age has no missing values left after mean imputation
data_si.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
data_knn = data.copy()

from sklearn.impute import KNNImputer

# KNNImputer finds neighbours using the features that are present in both
# rows. Fitting it on Age alone leaves every row with a missing Age without a
# single defined distance, so the imputer falls back to the column mean and
# the result is identical to plain mean imputation. Giving it the fully
# observed numeric columns lets it find genuinely similar passengers.
# NOTE: the features are left unscaled, so Fare dominates the distance;
# standardise them first if neighbour quality matters.
knn_features = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']

# Create an instance of KNNImputer with k=5 (default value)
imputer = KNNImputer()

# Impute missing age values from the 5 most similar passengers
imputed_data = imputer.fit_transform(data_knn[knn_features])

# Age is the first column of knn_features, hence column 0 of the result
data_knn["Age"] = imputed_data[:, 0]

In [20]:
# Number of rows returned by the imputer (should match the number of passengers)
imputed_data.shape[0]

891

In [17]:
# Verify that Age has no missing values left after KNN imputation
data_knn.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64