In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib


In [5]:
# Build a small, hand-written Titanic-style sample so the notebook is
# self-contained. pandas (`pd`) comes from the notebook's import cell, so it is
# not imported again here.
data = {
    "PassengerId": [1, 2, 3, 4, 5, 6, 7],
    "Pclass": [3, 1, 3, 1, 3, 3, 1],
    "Name": [
        "Braund, Mr. Owen", "Cumings, Mrs. John", "Heikkinen, Miss. L",
        "Futrelle, Mrs. J", "Allen, Mr. Wm Henry", "Moran, Mr. James", "McCarthy, Mr. Tim"
    ],
    "Sex": ["male", "female", "female", "female", "male", "male", "male"],
    # One Age is deliberately missing (None) so the sample contains a gap.
    "Age": [22, 38, 26, 35, 35, None, 54],
    "SibSp": [1, 1, 0, 1, 0, 0, 0],
    "Parch": [0, 0, 0, 0, 0, 0, 0],
    "Fare": [7.25, 71.283, 7.925, 53.1, 8.05, 8.458, 51.862],
    "Embarked": ["S", "C", "S", "S", "S", "Q", "S"],
    "Survived": [0, 1, 1, 1, 0, 0, 0]
}

# Convert to DataFrame
titanic_df = pd.DataFrame(data)

# Save as CSV (index=False keeps the row index out of the file)
titanic_df.to_csv("titanic_sample.csv", index=False)

# Preview the data (only 7 rows, so showing the whole frame is fine)
titanic_df


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen",male,22.0,1,0,7.25,S,0
1,2,1,"Cumings, Mrs. John",female,38.0,1,0,71.283,C,1
2,3,3,"Heikkinen, Miss. L",female,26.0,0,0,7.925,S,1
3,4,1,"Futrelle, Mrs. J",female,35.0,1,0,53.1,S,1
4,5,3,"Allen, Mr. Wm Henry",male,35.0,0,0,8.05,S,0
5,6,3,"Moran, Mr. James",male,,0,0,8.458,Q,0
6,7,1,"McCarthy, Mr. Tim",male,54.0,0,0,51.862,S,0


In [6]:
# Read the sample CSV from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="titanic_sample.csv")

# Show the first five rows (the default for head)
df.head(n=5)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen",male,22.0,1,0,7.25,S,0
1,2,1,"Cumings, Mrs. John",female,38.0,1,0,71.283,C,1
2,3,3,"Heikkinen, Miss. L",female,26.0,0,0,7.925,S,1
3,4,1,"Futrelle, Mrs. J",female,35.0,1,0,53.1,S,1
4,5,3,"Allen, Mr. Wm Henry",male,35.0,0,0,8.05,S,0


In [7]:
# Column overview: info() prints non-null counts and dtypes directly to stdout.
df.info()

# Check for missing values.
# Jupyter only renders a cell's LAST bare expression, so intermediate results
# must go through display() (available in IPython kernels without an import)
# or they are computed and silently thrown away.
display(df.isnull().sum())

# Check data types
display(df.dtypes)

# Describe statistics (last expression, so it is rendered automatically)
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  7 non-null      int64  
 1   Pclass       7 non-null      int64  
 2   Name         7 non-null      object 
 3   Sex          7 non-null      object 
 4   Age          6 non-null      float64
 5   SibSp        7 non-null      int64  
 6   Parch        7 non-null      int64  
 7   Fare         7 non-null      float64
 8   Embarked     7 non-null      object 
 9   Survived     7 non-null      int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 692.0+ bytes


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,7.0,7.0,6.0,7.0,7.0,7.0,7.0
mean,4.0,2.142857,35.0,0.428571,0.0,29.704,0.428571
std,2.160247,1.069045,11.135529,0.534522,0.0,27.886551,0.534522
min,1.0,1.0,22.0,0.0,0.0,7.25,0.0
25%,2.5,1.0,28.25,0.0,0.0,7.9875,0.0
50%,4.0,3.0,35.0,0.0,0.0,8.458,0.0
75%,5.5,3.0,37.25,1.0,0.0,52.481,1.0
max,7.0,3.0,54.0,1.0,0.0,71.283,1.0


In [9]:
# Assume 'Age' is the column to predict
X = df.drop("Age", axis=1)
y = df["Age"]


In [10]:
# Select numerical and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns


In [11]:
# Numeric branch: fill missing values with the column mean, then apply
# StandardScaler.
mean_imputer = SimpleImputer(strategy="mean")
standard_scaler = StandardScaler()

numeric_pipeline = Pipeline(
    steps=[("imputer", mean_imputer), ("scaler", standard_scaler)]
)


In [12]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" is set so categories not seen
# during fit do not raise at transform time.
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

categorical_pipeline = Pipeline(
    steps=[("imputer", mode_imputer), ("onehot", one_hot_encoder)]
)


In [13]:
# Route each column group through its own sub-pipeline and stack the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features)
    ],
    # The one-hot branch emits a sparse matrix, and by default the stacked
    # result is returned sparse whenever its overall density is below 0.3.
    # The transformed data is later written with np.save, which needs a plain
    # ndarray, so force dense output regardless of density.
    sparse_threshold=0,
)


In [14]:
# Wrap the column preprocessor in a single-step pipeline so the whole ETL
# stage can be fitted, applied and saved as one object.
etl_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)


In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [16]:
# Fit on training data and transform both train and test
X_train_transformed = etl_pipeline.fit_transform(X_train)
X_test_transformed = etl_pipeline.transform(X_test)

# Show shape to verify
print("Train shape:", X_train_transformed.shape)
print("Test shape:", X_test_transformed.shape)


Train shape: (5, 15)
Test shape: (2, 15)


In [17]:
# Save transformed data
np.save("X_train.npy", X_train_transformed)
np.save("X_test.npy", X_test_transformed)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

# Save pipeline
joblib.dump(etl_pipeline, "etl_pipeline.joblib")


['etl_pipeline.joblib']