# Load Libraries

In [1]:
import pandas as pd

# Load Data

In [2]:
# Read the raw CSV and key the rows by the "Instances" column in one chained
# expression (no in-place mutation), then display the resulting frame.
data = pd.read_csv("data_transformation2.csv").set_index("Instances")
data

Unnamed: 0_level_0,number siblings,household income,HS grade,approve
Instances,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,0.8,3.0,yes
2,4,3.0,2.0,no
3,5,2.0,3.7,yes
4,2,1.5,3.0,no
5,0,1.2,2.0,no
6,1,3.5,3.0,no
7,2,1.6,3.0,yes
8,1,0.5,3.5,yes
9,2,0.8,2.9,no
10,5,2.3,3.8,yes


# Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw `data` frame is left untouched.
transformed_data = data.copy()

# Columns holding string categories that must be turned into integer codes.
categorical_columns = ["approve"]

# Keep one fitted encoder per column. A single shared encoder would be refit on
# every iteration, so only the last column's mapping would survive and earlier
# columns could no longer be decoded with inverse_transform.
label_encoders = {}
for column in categorical_columns:
  label_encoders[column] = LabelEncoder()
  # For "approve" this yields no -> 0, yes -> 1 (see the displayed output).
  transformed_data[column] = label_encoders[column].fit_transform(data[column])

# The inverse-transform cell refers to `label_encoder`; keep that name bound to
# the encoder fitted on "approve".
label_encoder = label_encoders["approve"]
transformed_data

Unnamed: 0_level_0,number siblings,household income,HS grade,approve
Instances,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,0.8,3.0,1
2,4,3.0,2.0,0
3,5,2.0,3.7,1
4,2,1.5,3.0,0
5,0,1.2,2.0,0
6,1,3.5,3.0,0
7,2,1.6,3.0,1
8,1,0.5,3.5,1
9,2,0.8,2.9,0
10,5,2.3,3.8,1


In [4]:
# Decode the integer labels in "approve" back into their original string values
# using the encoder fitted in the label-encoding cell.
approve_decoded = label_encoder.inverse_transform(transformed_data["approve"])
approve_decoded

array(['yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'yes', 'no', 'yes'],
      dtype=object)

# Data Scaling

In [5]:
# Scalers compared side by side in this cell:
#   StandardScaler - z-score: centres on the mean and divides by the standard deviation
#   MinMaxScaler   - min-max: rescales each feature into the 0-1 range
#   RobustScaler   - centres on the median and divides by the IQR (suited to data with outliers)
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

scaling_columns = ["number siblings", "household income", "HS grade"]

# Keyed by class name; the insertion order here fixes the order of the new columns.
scalers = {
  scaler_class.__name__: scaler_class()
  for scaler_class in (StandardScaler, MinMaxScaler, RobustScaler)
}

# Start from the label-encoded frame and append one column per (feature, scaler) pair.
scaled_data = transformed_data.copy()

for column in scaling_columns:
  # Single-column frame: each scaler is refit on this feature alone, so every
  # feature is scaled independently of the others.
  feature = data[[column]]
  for scaler_name, scaler in scalers.items():
    scaled_data[f"{column} ({scaler_name})"] = scaler.fit_transform(feature)

scaled_data

Unnamed: 0_level_0,number siblings,household income,HS grade,approve,number siblings (StandardScaler),number siblings (MinMaxScaler),number siblings (RobustScaler),household income (StandardScaler),household income (MinMaxScaler),household income (RobustScaler),HS grade (StandardScaler),HS grade (MinMaxScaler),HS grade (RobustScaler)
Instances,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3,0.8,3.0,1,0.307148,0.6,0.4,-0.984309,0.1,-0.566038,0.017178,0.555556,0.0
2,4,3.0,2.0,0,0.921443,0.8,0.8,1.369474,0.833333,1.09434,-1.700589,0.0,-2.222222
3,5,2.0,3.7,1,1.535738,1.0,1.2,0.299572,0.5,0.339623,1.219614,0.944444,1.555556
4,2,1.5,3.0,0,-0.307148,0.4,0.0,-0.235378,0.333333,-0.037736,0.017178,0.555556,0.0
5,0,1.2,2.0,0,-1.535738,0.0,-0.8,-0.556349,0.233333,-0.264151,-1.700589,0.0,-2.222222
6,1,3.5,3.0,0,-0.921443,0.2,-0.4,1.904424,1.0,1.471698,0.017178,0.555556,0.0
7,2,1.6,3.0,1,-0.307148,0.4,0.0,-0.128388,0.366667,0.037736,0.017178,0.555556,0.0
8,1,0.5,3.5,1,-0.921443,0.2,-0.4,-1.30528,0.0,-0.792453,0.876061,0.833333,1.111111
9,2,0.8,2.9,0,-0.307148,0.4,0.0,-0.984309,0.1,-0.566038,-0.154599,0.5,-0.222222
10,5,2.3,3.8,1,1.535738,1.0,1.2,0.620543,0.6,0.566038,1.391391,1.0,1.777778


In [None]:
# Apply only the z-score scaler, adding one standardized column per feature.
# This rebinds `scaled_data`, starting again from the label-encoded frame.
# NOTE(review): the column suffix below contains quotes ("('StandardScaler')"),
# unlike the "(StandardScaler)" suffix used in the scaler-comparison cell -
# confirm which naming is intended before relying on these column names.
scaled_data = transformed_data.copy()

scaler = StandardScaler()
for column in scaling_columns:
  # Refit on each single-column frame so every feature gets its own mean and variance.
  scaled_data[f"{column} ('StandardScaler')"] = scaler.fit_transform(data[[column]])

scaled_data