# Data Normalization

In [1]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

diabetes = load_diabetes()
df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df_diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [5]:
z_scaler = StandardScaler()
df_diabetes['age'] = z_scaler.fit_transform(df_diabetes[['age']])

In [3]:
df_diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.800500,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.039567,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,1.793307,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-1.872441,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.113172,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.876870,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.115937,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.876870,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.956004,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [6]:
mean_age = df_diabetes['age'].mean()
std_age = df_diabetes['age'].std()

print("Mean of age:", mean_age)
print("Standard Deviation of age:", std_age)


Mean of age: 1.607562750588462e-17
Standard Deviation of age: 1.0011331448394598


In [7]:
#perform min-max scaling on the bmi column 
mm_scaler = MinMaxScaler()
df_diabetes['bmi_mm'] = mm_scaler.fit_transform(df_diabetes[['bmi']])
# Print the normalized and scaled data
print(df_diabetes[['age', 'age_z', 'bmi', 'bmi_mm']].head())

        age     age_z       bmi    bmi_mm
0  0.800500  0.800500  0.061696  0.582645
1 -0.039567 -0.039567 -0.051474  0.148760
2  1.793307  1.793307  0.044451  0.516529
3 -1.872441 -1.872441 -0.011595  0.301653
4  0.113172  0.113172 -0.036385  0.206612


# Data Encoding

In [8]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from scipy.sparse import SparseEfficiencyWarning
warnings.simplefilter('ignore',SparseEfficiencyWarning)
df_penguins = sns.load_dataset("penguins")

df_penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [9]:
# label encoder
label_encoder = LabelEncoder()
df_penguins['sex_encoded'] = label_encoder.fit_transform(df_penguins['sex'])
df_penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_encoded
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
3,Adelie,Torgersen,,,,,,2
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0


In [10]:
# one hot encoding
one_hot_encoder = OneHotEncoder()
island_encoded  = one_hot_encoder.fit_transform(df_penguins[['island']])
island_encoded_df = pd.DataFrame(island_encoded.toarray(), columns=
                                [f"island_{i}" for i in range(
                                    island_encoded.shape[1])])
df_penguins = pd.concat([df_penguins,island_encoded_df], axis=1)

In [11]:
island_encoded_df.head()

Unnamed: 0,island_0,island_1,island_2
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [12]:
## Oversampling:

In [13]:
import numpy as np

# Example data
class_A_instances = np.random.rand(100, 2)  # Assuming a 2D feature space
class_B_instances = np.random.rand(20, 2)

# Oversampling Class B to match the number of instances in Class A
oversampled_class_B_indices = np.random.choice(len(class_B_instances), size=80, replace=True)
oversampled_class_B_instances = class_B_instances[oversampled_class_B_indices]

# Combining the original Class A instances with the oversampled Class B instances
balanced_dataset = np.concatenate((class_A_instances, oversampled_class_B_instances), axis=0)

# Check the number of instances in each class
print("Number of instances in Class A:", len(class_A_instances))
print("Number of instances in Class B before oversampling:", len(class_B_instances))
print("Number of instances in Class B after oversampling:", len(oversampled_class_B_instances))

# The balanced dataset now contains 100 instances for each class
print("Number of instances in the balanced dataset:", len(balanced_dataset))


Number of instances in Class A: 100
Number of instances in Class B before oversampling: 20
Number of instances in Class B after oversampling: 80
Number of instances in the balanced dataset: 180


In [14]:
# undersampling
import numpy as np

# Example data
class_A_instances = np.random.rand(100, 2)  # Assuming a 2D feature space
class_B_instances = np.random.rand(20, 2)

# Undersampling Class A to match the number of instances in Class B
undersampled_class_A_indices = np.random.choice(len(class_A_instances), size=20, replace=False)
undersampled_class_A_instances = class_A_instances[undersampled_class_A_indices]

# The undersampled Class A now contains 20 instances
print("Number of instances in Class A after undersampling:", len(undersampled_class_A_instances))

# The number of instances in Class B remains the same
print("Number of instances in Class B:", len(class_B_instances))


Number of instances in Class A after undersampling: 20
Number of instances in Class B: 20


In [15]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Example data
class_A_instances = np.random.rand(100, 2)  # Assuming a 2D feature space
class_B_instances = np.random.rand(20, 2)

# Assigning labels to instances
class_A_labels = np.zeros(len(class_A_instances))  # Label for Class A: 0
class_B_labels = np.ones(len(class_B_instances))  # Label for Class B: 1

# Concatenating instances and labels for both classes
X = np.concatenate((class_A_instances, class_B_instances), axis=0)
y = np.concatenate((class_A_labels, class_B_labels), axis=0)

# Perform synthetic sampling using SMOTE
smote = SMOTE()
synthetic_instances, synthetic_labels = smote.fit_resample(X, y)

# The number of synthetic instances generated
num_synthetic_instances = len(synthetic_instances) - len(X)
print("Number of synthetic instances generated for Class B:", num_synthetic_instances)

# After synthetic sampling, both Class A and Class B will have an equal number of instances
print("Number of instances in Class A:", len(class_A_instances))
print("Number of instances in Class B after synthetic sampling:", len(class_B_instances) + num_synthetic_instances)


Number of synthetic instances generated for Class B: 80
Number of instances in Class A: 100
Number of instances in Class B after synthetic sampling: 100
