# 🛠️ Data Preprocessing Template 🛠️

In [1]:
# 🛠️ Data Preprocessing Template (Python Version) 🛠️
# This script includes essential steps for preparing data in Python before applying machine learning models.

## Importing the libraries

In [2]:
# ------------------------------------------
# 📌 1. Importing the necessary libraries
# ------------------------------------------
import numpy as np  # 🔢 NumPy: Used for numerical operations and handling arrays
import pandas as pd  # 📑 Pandas: Used for data manipulation and analysis
from sklearn.model_selection import train_test_split  # 🔀 For splitting dataset into training and test sets
from sklearn.preprocessing import LabelEncoder, StandardScaler  # 🔄 For encoding categorical variables and feature scaling

## Importing the dataset

In [5]:
# ------------------------------------------
# 📌 2. Loading the dataset
# ------------------------------------------
# Read the dataset from a CSV file
# `pd.read_csv("filename.csv")` loads tabular data from a CSV file into a DataFrame
dataset = pd.read_csv("Data.csv")

# Display basic information about the dataset
dataset.info()  # 📊 Shows dataset structure, column types, and missing values
print(dataset.head())  # 🔍 Displays the first few rows of the dataset

# 🔹 X (features) - All columns except the last one (independent variables)
X = dataset.iloc[:, :-1].values

# 🔹 y (target) - Only the last column (dependent variable)
y = dataset.iloc[:, -1].values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes


In [9]:
print(X)  # 📃 Print feature matrix

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
print(y)  # 🎯 Print target variable

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
# ------------------------------------------
# 📌 Handling Missing Data
# ------------------------------------------
# In real-world datasets, missing values (NaN) are common and can negatively impact model performance.
# Instead of removing entire rows or columns with missing values (which can lead to data loss),
# we replace them with an appropriate statistic such as the mean.

# ------------------------------------------
# 📌 1. Importing the required library
# ------------------------------------------
from sklearn.impute import SimpleImputer  # 🛠 Import SimpleImputer to handle missing values

# ------------------------------------------
# 📌 2. Creating an Imputer instance
# ------------------------------------------
# The imputer is responsible for filling missing values in numerical columns.
# - `missing_values=np.nan` → Specifies that we are handling NaN (Not a Number) values.
# - `strategy='mean'` → Fills NaN values with the **mean** of the respective column.
#
# 🔹 Other strategies available:
#   - `median` → Fills NaN values with the **median** of the column.
#   - `most_frequent` → Replaces NaN with the most frequently occurring value in the column (useful for categorical data).
#   - `constant` → Replaces NaN with a specified constant value (e.g., 0).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# ------------------------------------------
# 📌 3. Applying the imputer to numerical features
# ------------------------------------------
# We assume that missing values exist only in numerical columns (e.g., Age, Salary).
# - `fit_transform()` first calculates the mean for each column and replaces NaN values.
# - `X[:, 1:3]` selects the **second and third columns** (indexing starts from 0).
#
# 🚨 Important:
#   - Ensure these columns contain only numerical values before applying the imputer.
#   - If categorical variables exist in this range, encoding should be done first.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# ------------------------------------------
# 📌 4. Checking the results
# ------------------------------------------
# Let's print `X` to verify that missing values have been successfully replaced.
print(X)  # 🖨 Displays the feature matrix after imputation

In [23]:
# Let's print `X` to verify that missing values have been successfully replaced.
print(X)  # 🖨 Displays the feature matrix after imputation

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 nan]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 nan 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [25]:
# ------------------------------------------
# 📌 Encoding Categorical Data (Independent Variable)
# ------------------------------------------
# Machine learning models cannot work directly with categorical (text) data.
# Instead, we need to transform categorical values into numerical representations.
#
# There are two common encoding techniques:
# 1️⃣ **Label Encoding**: Assigns a unique number to each category (e.g., France → 0, Spain → 1, Germany → 2).
#    - Suitable for ordinal data (where order matters).
# 2️⃣ **One-Hot Encoding** (used here): Converts categorical values into binary vectors.
#    - Each unique category gets its own column.
#    - Example: ['France', 'Spain', 'Germany'] → [1 0 0], [0 1 0], [0 0 1]
#    - Suitable for non-ordinal data (where order **does not** matter).

# ------------------------------------------
# 📌 1. Importing Required Libraries
# ------------------------------------------
from sklearn.compose import ColumnTransformer  # 🛠 Helps apply transformations to specific columns
from sklearn.preprocessing import OneHotEncoder  # 🔄 One-Hot Encoding transformation

# ------------------------------------------
# 📌 2. Applying One-Hot Encoding to the 'Country' column
# ------------------------------------------
# - `transformers=[('encoder', OneHotEncoder(), [0])]` → Applies One-Hot Encoding to the first column (Country).
# - `remainder='passthrough'` → Keeps the rest of the dataset unchanged.
# - This ensures that only the categorical feature is transformed, while numerical features remain intact.

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

# ------------------------------------------
# 📌 3. Transforming the Dataset
# ------------------------------------------
# - `fit_transform(X)` applies the transformation.
# - `np.array(...)` ensures that the transformed dataset is stored as a NumPy array.
X = np.array(ct.fit_transform(X))

In [28]:
#✅ Checking the Transformed Dataset
# ------------------------------------------
# Printing `X` to verify that the categorical feature is now represented as binary vectors.
print(X)  # 🖨 Displays the dataset after encoding

[[0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 40.0 nan]
 [0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 1.0 nan 52000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [29]:
# ------------------------------------------
# 🏷️ Encoding the Dependent Variable (Target)
# ------------------------------------------
# 📌 Why do we need to encode the target variable?
# Machine learning models work with numerical data, but our target variable (`y`) contains categorical values.
# Example: "Yes" and "No" → These need to be converted into numerical labels (e.g., 1 and 0).
#
# 🔹 Solution: We use **Label Encoding**, which assigns each category a unique integer.
# - Example transformation:
#   ["No", "Yes", "No", "Yes"] → [0, 1, 0, 1]
# - This works well for **binary classification** problems.

# ------------------------------------------
# 📌 1. Importing Required Library
# ------------------------------------------
from sklearn.preprocessing import LabelEncoder  # 🛠 Import LabelEncoder to encode categorical target variables

# ------------------------------------------
# 📌 2. Creating and Applying Label Encoding
# ------------------------------------------
# 🔹 Label Encoding converts categorical labels into numeric values:
# Each unique category in `y` gets a distinct integer.
# This is useful for **binary target variables** (like Yes/No, Purchased/Not Purchased).

le = LabelEncoder()  # 🛠 Creating an instance of LabelEncoder

# 🚀 Applying the transformation
y = le.fit_transform(y)
# ------------------------------------------
# 📌 3. Checking the Encoded Target Variable
# ------------------------------------------
# 🔍 Let's print `y` to verify that categorical values have been successfully converted into numerical labels.
print("Encoded Target Variable:")
print(y)  # 🖨 Displays the transformed target variable

# ------------------------------------------
# 🔥 Alternative Approach: One-Hot Encoding for Multi-Class Target
# ------------------------------------------
# If the target variable had **more than two categories**, we could use One-Hot Encoding instead.
# Example: ["Low", "Medium", "High"] → [[1,0,0], [0,1,0], [0,0,1]]
# However, for **binary classification**, Label Encoding is the best choice.

# ✅ Now, the target variable (`y`) is encoded and ready for machine learning models!

Encoded Target Variable:
[0 1 0 0 1 1 0 1 0 1]


In [14]:
# 📌  Checking the Encoded Target Variable
# ------------------------------------------
# Let's print `y` to verify that categorical values have been converted to numerical labels.
print(y)  # 🖨 Displays the transformed target variable


[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [30]:
# ------------------------------------------
# 🎯 Splitting the Dataset into Training and Test Sets
# ------------------------------------------
# 📌 Why do we split the dataset?
# - Machine learning models learn from **training data** (X_train, y_train).
# - The model's performance is **evaluated on unseen test data** (X_test, y_test).
# - Typically, we use **80%** of the data for training and **20%** for testing.

# ------------------------------------------
# 📌 1. Importing Required Library
# ------------------------------------------
from sklearn.model_selection import train_test_split  # 🔀 Import function for splitting data

# ------------------------------------------
# 📌 2. Splitting the Data
# ------------------------------------------
# - `test_size=0.2` → Allocates 20% of the dataset for testing.
# - `random_state=1` → Ensures **reproducibility** (so we get the same split every time).
# - `train_test_split` automatically shuffles and splits the dataset.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ------------------------------------------
# 📌 3. Understanding the Split
# ------------------------------------------
# Now, we have:
# ✅ `X_train`, `y_train` → Training set (80% of data)
# ✅ `X_test`, `y_test` → Test set (20% of data)

In [33]:
# 🧐 Checking the training set
print("Training Features (X_train):")
print(X_train)  # 🖨 Print training features

Training Features (X_train):
[[1.0 0.0 0.0 1.0 nan 52000.0]
 [1.0 0.0 1.0 0.0 40.0 nan]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]


In [34]:
# 🧐 Checking the test set
print("\nTest Features (X_test):")
print(X_test)  # 🖨 Print test features


Test Features (X_test):
[[1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


In [35]:
print("\nTraining Target Labels (y_train):")
print(y_train)  # 🖨 Print training target labels


Training Target Labels (y_train):
[0 1 0 0 1 1 0 1]


In [40]:
print("\nTest Target Labels (y_test):")
print(y_test)  # 🖨 Print test target labels


Test Target Labels (y_test):
[0 1]


## Feature Scaling

In [37]:
# ------------------------------------------
# ⚙️ Feature Scaling - Normalizing numerical data to improve performance
# ------------------------------------------
# 📌 Why apply feature scaling?
# - Some ML models (e.g., SVM, KNN, Logistic Regression) perform **better with scaled data**.
# - Prevents **features with large values** (like Salary) from dominating smaller ones (like Age).
# - Improves **convergence speed** for gradient-based models (e.g., Neural Networks).

# ------------------------------------------
# 📌 1. Importing Required Library
# ------------------------------------------
from sklearn.preprocessing import StandardScaler  # 📏 Import StandardScaler

# ------------------------------------------
# 📌 2. Creating an Instance of StandardScaler
# ------------------------------------------
# Standardization formula:
#   X_scaled = (X - mean) / standard deviation
# This ensures that each feature has:
# - Mean = 0
# - Standard Deviation = 1
sc = StandardScaler()  # 🛠 Create an instance of the scaler

# ------------------------------------------
# 📌 3. Applying Scaling to the Numerical Features
# ------------------------------------------
# 🚨 Note:
# - We **do NOT** scale categorical variables (already encoded).
# - We apply scaling **only to numerical columns**.

# Apply scaling to numerical features in the training set
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# Apply the same transformation to the test set
X_test[:, 3:] = sc.transform(X_test[:, 3:])  # 🚨 Use `transform()` (not `fit_transform()`)

In [38]:
# 🔍 Print Scaled Training Set
print("📊 Scaled Training Features (X_train):")
print(X_train)  # 🖨 Display transformed training data

📊 Scaled Training Features (X_train):
[[1.0 0.0 0.0 1.2909944487358056 nan -1.0182239953527132]
 [1.0 0.0 1.0 -0.7745966692414834 -0.038910211282047996 nan]
 [0.0 1.0 0.0 -0.7745966692414834 0.5058327466666259 0.583476671494251]
 [1.0 0.0 0.0 1.2909944487358056 -0.3112816902563849 -0.2974586952715793]
 [1.0 0.0 0.0 1.2909944487358056 -1.809324824615238 -1.3385641287221062]
 [0.0 1.0 0.0 -0.7745966692414834 1.0505757046152997 1.1440719048906884]
 [1.0 0.0 1.0 -0.7745966692414834 1.3229471835896367 1.4644120382600814]
 [0.0 1.0 0.0 -0.7745966692414834 -0.7198389087178904 -0.537713795298624]]


In [39]:
# 🔍 Print Scaled Test Set
print("\n📊 Scaled Test Features (X_test):")
print(X_test)  # 🖨 Display transformed test data


📊 Scaled Test Features (X_test):
[[1.0 0.0 1.0 -0.7745966692414834 -1.4007676061537326 -0.8580539286680168]
 [0.0 1.0 0.0 -0.7745966692414834 -0.4474674297435534 0.18305150478250995]]
