In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer




In [None]:


# Read the iris table from its CSV file and preview the first five rows
data = pd.read_csv(filepath_or_buffer='iris-parquet.csv')
print(data.head(n=5))



   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa


In [None]:


# 1. Data Cleaning
# Column groups by dtype, as shown in the dataset preview:
# the four float-valued measurement columns ...
num_features = [
    'sepal.length',
    'sepal.width',
    'petal.length',
    'petal.width',
]
# ... and the single string-valued column.
cat_features = [
    'variety',
]



In [None]:

# Numeric branch, applied in order:
#   1. 'imputer' - replace missing values with the column mean
#   2. 'scaler'  - standardize each column with StandardScaler
num_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)



In [None]:


# Categorical branch, applied in order:
#   1. 'imputer' - replace missing values with the most frequent category
#   2. 'onehot'  - one indicator column per category, returned as a dense array;
#                  handle_unknown='ignore' encodes a category not seen during
#                  fit as all zeros instead of raising
cat_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ),
    ]
)



In [None]:


# Combine preprocessing steps for Input Data
# The label column must never be part of the model inputs: encoding 'variety'
# into X would hand every model the answer it is supposed to predict.
target_features = ['variety']
input_cat_features = [col for col in cat_features if col not in target_features]

input_transformers = [("num", num_transformer, num_features)]
if input_cat_features:
    # Register the categorical branch only when a non-target categorical
    # column actually exists.
    input_transformers.append(("cat", cat_transformer, input_cat_features))

preprocessor = ColumnTransformer(transformers=input_transformers)
preprocessor.set_output(transform="pandas")

# Choose the hold-out rows BEFORE fitting, so the imputation and scaling
# statistics are learned from the training rows only (no test-set leakage).
# Row count, test_size and random_state match a split of the transformed
# frames, so the selected rows should be the same.
train_rows, test_rows = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

# Fit on the training rows, then transform every row with those statistics
preprocessor.fit(data.iloc[train_rows])
data_preprocessed = preprocessor.transform(data)
print(data_preprocessed.head())

# Combine preprocessing steps for Output Data (target)
preprocessor_Out = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, target_features)
    ]
)
preprocessor_Out.set_output(transform="pandas")

# Apply transformations to Output data.
# Fitted on all rows on purpose: this only learns the label vocabulary, and it
# guarantees one indicator column per class even if a class were missing from
# the training rows.
data_preprocessed_Out = preprocessor_Out.fit_transform(data)
print(data_preprocessed_Out.head())

# 2. Feature Engineering
# Example: Petal area = petal.length * petal.width
# .assign builds a new frame instead of mutating the one printed above, so
# re-running this cell is idempotent.
# NOTE(review): petal_area is computed from the raw columns and is therefore
# neither imputed nor scaled, unlike the num__* columns - confirm this is intended.
data_preprocessed = data_preprocessed.assign(
    petal_area=data['petal.length'] * data['petal.width']
)

# 3. Data Splitting
# Reuse the row positions chosen above so X and y stay aligned with the rows
# the preprocessor was fitted on.
X = data_preprocessed
y = data_preprocessed_Out
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Display processed training data
print(X_train.head())
print(y_train.head())





   num__sepal.length  num__sepal.width  num__petal.length  num__petal.width  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   cat__variety_Setosa  cat__variety_Versicolor  cat__variety_Virginica  
0                  1.0                      0.0                     0.0  
1                  1.0                      0.0                     0.0  
2                  1.0                      0.0                     0.0  
3                  1.0                      0.0                     0.0  
4                  1.0                      0.0                     0.0  
   cat__variety_Setosa  cat__variety_Versicolor  cat__variety_Virginica
0        