
# Day 11 — Data Preprocessing with Scikit-Learn

**Author:** Dhairya Patel  

This notebook covers:
1. Handling missing values
2. Encoding categorical variables
3. Feature scaling
4. Train-test split


In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:

# Toy customer records; np.nan marks the entries that are deliberately missing
# so the imputation step further down has something to fill in.
data = dict(
    Age=[25, np.nan, 28, 35, 40, np.nan],
    Salary=[50000, 60000, np.nan, 80000, 90000, 75000],
    Gender=['Male', 'Female', 'Female', 'Male', 'Male', 'Female'],
    Purchased=['No', 'Yes', 'No', 'No', 'Yes', 'Yes'],
)
df = pd.DataFrame.from_dict(data)
print("Original Dataset:", df, sep="\n")


In [None]:

# Replace the missing Age / Salary entries with the mean of each column.
# NOTE: the imputer is fitted on the full dataset here, before the train/test
# split later in the notebook. That is fine for a demo, but in a real project
# fit it on the training rows only so test data does not influence the means.
cols_to_impute = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[cols_to_impute])
df[cols_to_impute] = imputer.transform(df[cols_to_impute])
print("After Handling Missing Values:", df, sep="\n")


In [None]:

# Label encoding for the target variable: the string labels in 'Purchased'
# are replaced by integer class codes.
label_encoder = LabelEncoder()
df['Purchased'] = label_encoder.fit_transform(df['Purchased'])

# One-hot encoding for Gender. drop_first=True keeps a single indicator column
# for this two-category feature instead of two redundant ones.
# dtype=int pins the indicator to 0/1: the pandas default is bool on
# pandas >= 2.0 and uint8 on older releases, so the output would otherwise
# differ between versions.
# The membership check makes the cell safe to re-run: once 'Gender' has been
# encoded the column no longer exists and get_dummies would raise a KeyError.
if 'Gender' in df.columns:
    df = pd.get_dummies(df, columns=['Gender'], drop_first=True, dtype=int)
print("After Encoding:")
print(df)


In [None]:

# Standardize the numeric features (zero mean, unit variance per column).
# NOTE: the scaler is fitted on the full dataset here, before the train/test
# split in the next cell. In a real project, fit it on the training rows only
# and reuse the fitted scaler to transform the test rows.
cols_to_scale = ['Age', 'Salary']
scaler = StandardScaler()
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])
print("After Scaling:", df, sep="\n")


In [None]:

# Separate the target column from the features, then hold out 20% of the rows
# for testing. random_state is fixed so the split is reproducible.
target_col = 'Purchased'
y = df[target_col]
X = df.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for heading, features in (
    ("Training Features:", X_train),
    ("Testing Features:", X_test),
):
    print(heading)
    print(features)



---

### Notes
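- **Missing values:** `SimpleImputer` fills missing entries with a per-column statistic (the mean here; `median`, `most_frequent`, and `constant` are also available).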
- **Encoding:** Use one-hot encoding for non-ordinal categories.
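- **Scaling:** Standardization (zero mean, unit variance) puts features on a comparable scale, which helps gradient-based and distance-based algorithms converge faster and behave more reliably.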
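- **Train-test split:** Always evaluate on unseen data. In a real project, split first and fit the imputer and scaler on the training set only; this demo fits them on the full toy dataset for simplicity, which would leak test information in practice.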

**End of Day 11.**
