# 2. Data Preparation

In this section, we prepare the Wine Quality dataset for machine learning.  
The goal is to clean, transform, and encode the data so that it can be used to train predictive models effectively.

### Handle Missing Values

In [1]:
print("\n Handling missing values...")
print("Before:", df.isnull().sum().sum(), "missing values total")

# Derive the numeric feature columns from `df` right here instead of relying
# on `num_features`: that name is only assigned in the later
# "Identify Column Types" cell, so it does not exist yet when the notebook is
# run top to bottom on a fresh kernel. The target columns are excluded so the
# labels themselves are never imputed.
numeric_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col not in ("quality", "quality_label")
]

# Fill each numeric column's gaps with that column's own median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed further down, so test rows influence the fill
# values. Consider moving imputation into the preprocessing pipeline.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Non-numeric columns are not imputed, so this count can remain above zero.
print("After:", df.isnull().sum().sum(), "missing values total")


 Handling missing values...


NameError: name 'df' is not defined

### Handle Outliers

In [20]:
# Cap (winsorise) every numeric feature at its 1st and 99th percentiles.
# The column list is derived from `df` here because `num_features` is only
# assigned in the later "Identify Column Types" cell and is therefore
# undefined on a fresh top-to-bottom run. Target columns are left untouched.
# NOTE(review): the percentiles are computed on the full dataset, before the
# train/test split further down.
outlier_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col not in ("quality", "quality_label")
]

for col in outlier_cols:
    lower, upper = df[col].quantile([0.01, 0.99])
    # Values below `lower` / above `upper` are replaced by the bound itself.
    df[col] = df[col].clip(lower=lower, upper=upper)

print("\n✅ Outliers capped at 1st and 99th percentiles.")


✅ Outliers capped at 1st and 99th percentiles.


### Feature Engineering

In [21]:
# Free SO2 divided by total SO2; the small epsilon added to the denominator
# avoids a division by exactly zero.
df["sulfur_ratio"] = df["free sulfur dioxide"].div(df["total sulfur dioxide"].add(1e-6))

# Turn any +/-inf into NaN, then fill NaNs with the column median. The median
# is taken from the column as it stands before the inf replacement.
df["sulfur_ratio"] = (
    df["sulfur_ratio"]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(df["sulfur_ratio"].median())
)

### Define Target Variable (classification: good vs not good)

In [22]:
# Binary label: 1 when the quality score is 7 or higher, 0 otherwise.
df["quality_label"] = df["quality"].ge(7).astype(int)

# Features are everything except the raw score and the label derived from it.
X = df.drop(columns=["quality", "quality_label"])
y = df["quality_label"]


### Train-Test Split

In [23]:
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining size: {X_train.shape}, Test size: {X_test.shape}")


Training size: (5197, 13), Test size: (1300, 13)


### Identify Column Types

In [24]:
# Columns routed to the one-hot encoding branch of the preprocessor.
cat_features = ["color"]

# Every numeric column of X goes to the numeric branch. Selecting by the
# generic "number" kind (rather than only float64/int64) keeps columns stored
# as e.g. int32 or float32 from being silently left out of the model, and
# excluding `cat_features` guarantees that no column is sent to both branches
# even if a categorical happens to be numerically encoded.
num_features = [
    col
    for col in X.select_dtypes(include="number").columns
    if col not in cat_features
]

### Preprocessing Pipelines

In [29]:
# Element-wise log(1 + x); input validation is switched off.
log_transformer = FunctionTransformer(func=np.log1p, validate=False)

# Numeric branch: log1p -> standardise -> PCA. A fractional `n_components`
# asks PCA to keep enough components to reach that share of the variance.
numeric_pipeline = Pipeline([
    ("log", log_transformer),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
])

# Categorical branch: one-hot encode, dropping the first level of each feature.
categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(drop="first")),
])

# Route each column group to its branch and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_features),
    ("cat", categorical_pipeline, cat_features),
])

### Full Modeling Pipeline

In [30]:
# Random forest with 200 unrestricted-depth trees; "balanced" class weights
# re-weight the classes, and the fixed seed keeps training reproducible.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
)

# End-to-end estimator: preprocessing followed by the classifier.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model),
])

### Save Preprocessing Objects and Data Splits

In [32]:
import joblib

# Persist the preprocessing object and the full pipeline to the working
# directory. NOTE(review): neither object is fitted in the cells shown above,
# so these look like unfitted templates — confirm that is intended.
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(pipeline, "pipeline_base.pkl")

# Bundle the four split objects into one dict and store it as a single file.
joblib.dump(
    dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    ),
    "train_test_data.pkl",
)

print("Saved preprocessor, pipeline, and train/test splits.")

Saved preprocessor, pipeline, and train/test splits.
