In [0]:
from pyspark.sql import SparkSession
import pandas as pd
# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Read the source table through Spark, then collect it into a pandas DataFrame.
SOURCE_TABLE = "silver.labeled_step_test"
df_spark = spark.table(SOURCE_TABLE)
df = df_spark.toPandas()

# Show the first rows as a quick check of what was loaded.
df.head()

In [0]:
# Column roles: the target column plus the two groups of model inputs.
label_col = "step_label"  # target

feature_cols_numeric = [
    "distance_cm",
]

feature_cols_categorical = [
    "sensor_type",
    "device_id",
]

In [0]:
from sklearn.model_selection import train_test_split

# Feature matrix (numeric columns first, then categorical) and target vector.
X = df.loc[:, [*feature_cols_numeric, *feature_cols_categorical]]
y = df.loc[:, label_col]

# 80/20 split with a fixed seed; stratifying on y keeps the label proportions
# the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [0]:
# Peek at the first five rows of the training features.
X_train.head(n=5)

In [0]:
# Peek at the first five training labels.
y_train.head(n=5)

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Scale numeric columns to zero mean and unit variance (StandardScaler's defaults, spelled out).
numeric_transformer = StandardScaler(with_mean=True, with_std=True)

In [0]:
# One-hot encode categorical columns. With handle_unknown="ignore", a category
# not seen during fit is encoded as all zeros at transform time instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [0]:
# Route each column group to its transformer; the outputs are stacked side by side
# in the order listed ("num" first, then "cat").
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, feature_cols_numeric),
        ("cat", categorical_transformer, feature_cols_categorical),
    ]
)

In [0]:
from sklearn.pipeline import Pipeline

# Single-step pipeline wrapping the column preprocessor under the step name "preprocess".
pipeline = Pipeline([("preprocess", preprocessor)])

In [0]:
# Fit the preprocessing on the training split only, so nothing is fitted on the test split.
pipeline.fit(X_train)

# Apply the fitted preprocessing to both splits (train first, then test).
X_train_transformed, X_test_transformed = (
    pipeline.transform(split) for split in (X_train, X_test)
)

In [0]:
# Preview only the first rows of the transformed test matrix rather than
# densifying and dumping all of it.
#
# ColumnTransformer returns a SciPy sparse matrix only when the stacked output's
# density is below its `sparse_threshold` (default 0.3); otherwise it returns a
# dense ndarray, which has no `.toarray()`. Handle both so this cell cannot raise
# AttributeError when the categorical columns have few distinct values.
X_test_preview = X_test_transformed[:5]
X_test_preview.toarray() if hasattr(X_test_preview, "toarray") else X_test_preview

In [0]:
import joblib

# Directory that receives every pickled artifact written by this cell.
ARTIFACT_DIR = "/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/etl_pipeline"

# Output feature names of the fitted preprocessor, in transformed-column order.
feature_names = pipeline.named_steps["preprocess"].get_feature_names_out()

# File name -> object to persist. Insertion order is the order the files are written.
artifacts = {
    "feature_names.pkl": feature_names,
    "stedi_feature_pipeline.pkl": pipeline,
    "X_train_transformed.pkl": X_train_transformed,
    "X_test_transformed.pkl": X_test_transformed,
    # Save labels
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
}

# One dump per artifact; every path is ARTIFACT_DIR joined with the file name.
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, f"{ARTIFACT_DIR}/{file_name}")

In [0]:
import pandas as pd
import numpy as np

# Column headers for the CSV exports: output feature names of the fitted preprocessor.
feature_names = pipeline.named_steps["preprocess"].get_feature_names_out()

# Directory that receives the CSV exports.
ARTIFACT_DIR = "/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/etl_pipeline"

# Save the transformed train and test matrices as CSV, one file per split.
# ColumnTransformer yields a sparse matrix only when the output density is below
# its `sparse_threshold` (default 0.3) and a dense ndarray otherwise, so convert
# with `.toarray()` only when the object actually provides it.
for csv_name, matrix in (
    ("X_train_transformed.csv", X_train_transformed),
    ("X_test_transformed.csv", X_test_transformed),
):
    dense_matrix = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    pd.DataFrame(dense_matrix, columns=feature_names).to_csv(
        f"{ARTIFACT_DIR}/{csv_name}", index=False
    )