In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# Fetch the Titanic passenger table bundled with seaborn
df = sns.load_dataset('titanic')

# Preview the leading five rows to get a feel for the columns
print("Original DataFrame:\n", df.head(n=5))


Original DataFrame:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [2]:
# Count missing values in every column
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [3]:

# Handle missing values: fill numeric 'age' with its mean, and the port
# columns ('embarked', 'embark_town') with their most frequent value (mode).
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a column
# selection (chained assignment), which pandas warns about and which does not
# update `df` once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])


In [4]:
# Re-count missing values per column after the imputation step
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [5]:
# (rows, columns) of the DataFrame at this point in the notebook
df.shape

(891, 15)

In [6]:

# Drop 'alive' (redundant with 'survived') and 'deck' (688 of its 891 values
# are missing, so it is not imputed).
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run after
# the columns have already been removed.
df = df.drop(columns=['alive', 'deck'], errors='ignore')


In [7]:
# Encode categorical variables using OneHotEncoder.
# drop='first' removes the first level of every feature so each feature's
# dummy columns are not perfectly collinear.
categorical_cols = ['sex', 'class', 'embarked', 'who', 'embark_town', 'alone']

# `sparse_output=False` returns a dense array. It replaces the old `sparse`
# argument, which was deprecated in scikit-learn 1.2 and removed in 1.4
# (this spelling therefore requires scikit-learn >= 1.2).
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical_data = one_hot_encoder.fit_transform(df[categorical_cols])

# Reuse df's index so a later column-wise concat with df aligns row-for-row
# even if df's index is not the default 0..n-1 range.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)



Alternatively, one-hot encode the same columns directly with `pd.get_dummies`:


In [9]:
# pandas-only alternative: expand the categorical columns into integer dummy
# columns, dropping the first level of each
data_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)
data_encoded.head(n=5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,sex_female,sex_male,class_First,...,embarked_Q,embarked_S,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alone_False,alone_True
0,0,3,22.0,1,0,7.25,True,False,True,False,...,False,True,False,True,False,False,False,True,True,False
1,1,1,38.0,1,0,71.2833,False,True,False,True,...,False,False,False,False,True,True,False,False,True,False
2,1,3,26.0,0,0,7.925,False,True,False,False,...,False,True,False,False,True,False,False,True,False,True
3,1,1,35.0,1,0,53.1,False,True,False,True,...,False,True,False,False,True,False,False,True,True,False
4,0,3,35.0,0,0,8.05,True,False,True,False,...,False,True,False,True,False,False,False,True,False,True


In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts
non_categorical_part = df.drop(columns=categorical_cols)
df_encoded = pd.concat([non_categorical_part, encoded_categorical_df], axis='columns')

# Show the first rows of the encoded frame
print("\nDataFrame after One-Hot Encoding:\n", df_encoded.head(n=5))


In [None]:
# Identify boolean columns
boolean_cols = df_encoded.select_dtypes(include=['bool']).columns.tolist()

# Identify numerical columns for scaling (excluding boolean columns).
# This selection must run BEFORE the cast below: once the boolean columns are
# converted to int (typically int64) the integer dtype filter would match them
# too, and they would no longer be excluded.
numerical_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Convert boolean columns to int for correlation calculation
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

In [None]:
# Continuous features to rescale
numerical_cols = ['age', 'fare']
values_to_scale = df_encoded[numerical_cols]

# Standardisation: fit on the selected columns, then write the transformed
# values into a copy so df_encoded itself stays unscaled
standard_scaler = StandardScaler().fit(values_to_scale)
df_standard_scaled = df_encoded.copy()
df_standard_scaled[numerical_cols] = standard_scaler.transform(values_to_scale)

# Min-max normalisation, applied the same way to a second copy
min_max_scaler = MinMaxScaler().fit(values_to_scale)
df_min_max_scaled = df_encoded.copy()
df_min_max_scaled[numerical_cols] = min_max_scaler.transform(values_to_scale)

# Preview both scaled variants
print("\nStandard Scaled DataFrame:\n", df_standard_scaled.head(n=5))
print("\nMin-Max Scaled DataFrame:\n", df_min_max_scaled.head(n=5))


In [None]:
# Summarise df_encoded: column names, non-null counts and dtypes
df_encoded.info()

In [None]:

# Pairwise Pearson correlations for the unscaled frame and both scaled variants.
# NOTE(review): Pearson correlation is unchanged by positive linear rescaling of
# a column, so the three matrices are expected to agree up to floating-point noise.
correlation_original = df_encoded.corr(method='pearson')
correlation_standard_scaled = df_standard_scaled.corr(method='pearson')
correlation_min_max_scaled = df_min_max_scaled.corr(method='pearson')

# Print each matrix under its heading
for heading, matrix in (
    ("\nCorrelation Matrix - Original Data:\n", correlation_original),
    ("\nCorrelation Matrix - Standard Scaled Data:\n", correlation_standard_scaled),
    ("\nCorrelation Matrix - Min-Max Scaled Data:\n", correlation_min_max_scaled),
):
    print(heading, matrix)