# Feature Encoding
Feature encoding is the process of converting categorical data into numerical format so that machine learning algorithms can process it. This notebook demonstrates various feature encoding techniques.

In [1]:
# Import required libraries
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder


# Load Dataset
For this example, we will use a sample dataset.

In [2]:
# Small in-memory example: five rows, three categorical (string) columns
data = dict(
    city=['Karachi', 'Lahore', 'Islamabad', 'Karachi', 'Lahore'],
    gender=['Male', 'Female', 'Female', 'Male', 'Female'],
    education=['Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters'],
)

# Each dict key becomes a column; each list supplies that column's values
df = pd.DataFrame.from_dict(data)
df.head()

# Label Encoding
Label encoding assigns a unique integer to each category.

In [3]:
# Create the encoder and learn the distinct 'gender' values.
# The learned categories are kept in `label_encoder.classes_` (sorted),
# and each value is encoded as its position in that array.
label_encoder = LabelEncoder().fit(df['gender'])

# Store the integer codes in a new column next to the original one
df['gender_encoded'] = label_encoder.transform(df['gender'])
df.head()

# One-Hot Encoding
One-hot encoding creates binary columns for each category, indicating the presence or absence of the category.

In [4]:
# Initialize OneHotEncoder.
# `sparse_output=False` makes the encoder return a dense NumPy array.
# (The older `sparse=` keyword was renamed in scikit-learn 1.2 and removed in 1.4.)
onehot_encoder = OneHotEncoder(sparse_output=False)

# Apply OneHotEncoder to the 'city' column (double brackets: the encoder expects 2-D input)
city_encoded = onehot_encoder.fit_transform(df[['city']])

# Create a DataFrame with the encoded columns.
# Reusing df's index guarantees the rows line up when concatenating below.
city_encoded_df = pd.DataFrame(
    city_encoded,
    columns=onehot_encoder.get_feature_names_out(['city']),
    index=df.index,
)

# Concatenate the original DataFrame with the encoded DataFrame.
# Any previously added one-hot columns are dropped first so that re-running
# this cell does not append duplicate 'city_*' columns.
df = pd.concat(
    [df.drop(columns=city_encoded_df.columns, errors='ignore'), city_encoded_df],
    axis=1,
)
df.head()

# ColumnTransformer and Pipeline
Use a ColumnTransformer to apply a different encoder to each column in a single step. The resulting transformer can also be used as a step in a Pipeline.

In [5]:
# Define the ColumnTransformer.
# OrdinalEncoder is used for 'gender' instead of LabelEncoder: LabelEncoder is a
# target (y) encoder whose fit_transform accepts a single 1-D argument, so it
# raises a TypeError when ColumnTransformer calls fit_transform(X, y).
column_transformer = ColumnTransformer(
    transformers=[
        ('city_onehot', OneHotEncoder(), ['city']),
        ('gender_ordinal', OrdinalEncoder(), ['gender'])
    ],
    remainder='passthrough',  # keep 'education' unchanged
    sparse_threshold=0        # always return a dense array (the passthrough column holds strings)
)

# Apply the ColumnTransformer to the raw categorical columns only, so that
# encoded columns added to `df` by earlier cells are not passed through again.
raw_features = df[['city', 'gender', 'education']]
transformed_data = column_transformer.fit_transform(raw_features)

# Convert the transformed data to a DataFrame.
# Column names are prefixed with the transformer name, e.g. 'city_onehot__city_Karachi'.
transformed_df = pd.DataFrame(
    transformed_data,
    columns=column_transformer.get_feature_names_out(),
    index=raw_features.index,
)
transformed_df.head()