In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# 1. Load Titanic dataset directly from online source
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("Original Dataset (first 5 rows):")
print(df.head(), "\n")

# ---------------- PREPROCESSING STEPS ----------------

# 2. Handle Missing Values (introduce some NaN in 'Age' for demo).
#    .loc slicing is label-based and inclusive, so rows 5 through 10 are blanked.
df.loc[5:10, "Age"] = np.nan
imputer = SimpleImputer(strategy="mean")
# fit_transform returns an (n_samples, 1) array; flatten it so it is assigned
# as a plain 1-D column.
df["Age"] = imputer.fit_transform(df[["Age"]]).ravel()

# 3. Encode Categorical Column ('Sex' → numbers).
#    LabelEncoder assigns integer codes to the sorted class labels.
encoder = LabelEncoder()
df["Sex"] = encoder.fit_transform(df["Sex"])

# 4. Feature Engineering (family size = siblings/spouses + parents/children).
#    This must be computed from the RAW counts, i.e. before SibSp/Parch are
#    rescaled below; adding two independently scaled columns would not give a
#    head count.
df["FamilySize"] = df["SibSp"] + df["Parch"]

# Select some numerical columns for scaling
num_cols = ["Age", "Fare", "SibSp", "Parch"]

# 5. Standardization (Z-score scaling).
#    Kept in a separate frame: writing it into `df` and then min-max scaling
#    the same columns would discard it, because min-max scaling of z-scores
#    gives the same result as min-max scaling of the raw values.
scaler = StandardScaler()
df_standardized = df.copy()
df_standardized[num_cols] = scaler.fit_transform(df[num_cols])

# 6. Min-Max Normalization (scale values between 0 and 1), fitted on the raw
#    (unscaled) columns so `minmax` can be reused on new raw data.
minmax = MinMaxScaler()
df[num_cols] = minmax.fit_transform(df[num_cols])

print("Preprocessed Dataset (first 5 rows):")
print(df.head())


Original Dataset (first 5 rows):
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            37