In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Read the passenger records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='titanic-tested.csv')

# Show the first five rows as a quick check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Keep only the label column plus the features used in the cells below.
selected_cols = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Embarked',
]
df = df[selected_cols]

# Split into the label (y) and the feature frame (x); x is a new frame, df is untouched.
y = df['Survived']
x = df.drop(columns='Survived')

In [23]:
# Columns to expand into one indicator column per distinct category.
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# sparse_output=False  -> return a dense NumPy array instead of a SciPy sparse matrix.
# handle_unknown='ignore' -> a category not seen during fit is encoded as all zeros
#                            at transform time instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder

In [10]:
# 1. Learn the categories and one-hot encode the categorical columns in one pass
encoded_data = encoder.fit_transform(df[categorical_features])

# 2. Get the generated column names (optional, but keeps the columns readable)
feature_names = encoder.get_feature_names_out(categorical_features)

# 3. Wrap the encoded array in a DataFrame. Reuse df's index so that later
#    index-aligned operations (e.g. pd.concat(..., axis=1) with columns derived
#    from df) match row by row even when df's index is not the default 0..n-1;
#    without it, a filtered or re-indexed df would silently produce NaN rows.
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df.index)

In [21]:
# Inspect the raw array returned by encoder.fit_transform (one row per input row).
encoded_data

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 1., 0., 0.]])

In [20]:
# Inspect the column names generated by the encoder for the one-hot columns.
feature_names

array(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5',
       'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4',
       'Parch_5', 'Parch_6', 'Parch_9', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'], dtype=object)

In [19]:
# Preview the first rows of the one-hot encoded DataFrame with its named columns.
encoded_df.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,...,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
# Prepare Age for scaling: the scaler is fitted later, so fill any gaps first.
# Check whether the Age column contains at least one missing value.
missing_values_age = x['Age'].isna().any()

if missing_values_age:
    print('Warning: Missing values found in Age feature. Using imputation (mean).')
    # Learn the mean of the observed ages (swap the strategy here if needed) ...
    imputer = SimpleImputer(strategy='mean').fit(x[['Age']])
    # ... and write the filled column back; transform returns an (n, 1) array,
    # so flatten it to one value per row before assigning.
    x['Age'] = imputer.transform(x[['Age']]).ravel()



In [25]:
# Fit a standard scaler on the Age column (run after the imputation cell).
# Nothing is transformed in this cell; the scaler only learns mean and variance.
scaler = StandardScaler().fit(x[['Age']])

# Work on a copy so that x keeps the unscaled ages.
x_scaled = x.copy()
x_scaled

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,34.50000,0,0,Q
1,3,female,47.00000,1,0,S
2,2,male,62.00000,0,0,Q
3,3,male,27.00000,0,0,S
4,3,female,22.00000,1,1,S
...,...,...,...,...,...,...
413,3,male,30.27259,0,0,S
414,1,female,39.00000,0,0,C
415,3,male,38.50000,0,0,S
416,3,male,30.27259,0,0,S


In [26]:
# Overwrite Age in the copy with its standardised values computed from x.
# transform returns an (n, 1) array, so flatten it to one value per row.
x_scaled.loc[:, 'Age'] = scaler.transform(x[['Age']]).ravel()
x_scaled

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,0.334993,0,0,Q
1,3,female,1.325530,1,0,S
2,2,male,2.514175,0,0,Q
3,3,male,-0.259330,0,0,S
4,3,female,-0.655545,1,1,S
...,...,...,...,...,...,...
413,3,male,0.000000,0,0,S
414,1,female,0.691586,0,0,C
415,3,male,0.651965,0,0,S
416,3,male,0.000000,0,0,S


In [27]:
# Build the combined feature table: the one-hot columns followed by the scaled Age column.
# pd.concat matches rows by index label, so both inputs must share the same index.
df_merged = pd.concat(objs=[encoded_df, x_scaled['Age']], axis='columns')
df_merged.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,...,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S,Age
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.334993
1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.32553
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.514175
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.25933
4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.655545
