<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/Feature_Scaling_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feature Scaling Techniques
- MinMaxScaler
- StandardScaler
- Normalizer
- Binarizer

## Preparing Data

In [1]:
import pandas as pd
import numpy as np

In [35]:
diabetes = pd.read_excel('diabetes.xlsx', sheet_name='Diabetes_Classification')
diabetes.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Gender,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes
0,1,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes
1,2,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,No diabetes
2,3,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes
3,4,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes
4,5,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes


In [3]:
diabetes.shape

(390, 16)

In [4]:
diabetes.isna().sum()

Patient number     0
Cholesterol        0
Glucose            0
HDL Chol           0
Chol/HDL ratio     0
Age                0
Gender             0
Height             0
Weight             0
BMI                0
Systolic BP        0
Diastolic BP       0
waist              0
hip                0
Waist/hip ratio    0
Diabetes           0
dtype: int64

In [6]:
diabetes.Diabetes.value_counts(normalize=True)

No diabetes    0.846154
Diabetes       0.153846
Name: Diabetes, dtype: float64

In [7]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Patient number   390 non-null    int64  
 1   Cholesterol      390 non-null    int64  
 2   Glucose          390 non-null    int64  
 3   HDL Chol         390 non-null    int64  
 4   Chol/HDL ratio   390 non-null    float64
 5   Age              390 non-null    int64  
 6   Gender           390 non-null    object 
 7   Height           390 non-null    int64  
 8   Weight           390 non-null    int64  
 9   BMI              390 non-null    float64
 10  Systolic BP      390 non-null    int64  
 11  Diastolic BP     390 non-null    int64  
 12  waist            390 non-null    int64  
 13  hip              390 non-null    int64  
 14  Waist/hip ratio  390 non-null    float64
 15  Diabetes         390 non-null    object 
dtypes: float64(3), int64(11), object(2)
memory usage: 48.9+ KB


In [9]:
print(diabetes.describe().T)
print('-*-' * 10)
print(diabetes.select_dtypes(include='object').describe().T)

                 count        mean         std    min     25%     50%  \
Patient number   390.0  195.500000  112.727548   1.00   98.25  195.50   
Cholesterol      390.0  207.230769   44.666005  78.00  179.00  203.00   
Glucose          390.0  107.338462   53.798188  48.00   81.00   90.00   
HDL Chol         390.0   50.266667   17.279069  12.00   38.00   46.00   
Chol/HDL ratio   390.0    4.524615    1.736634   1.50    3.20    4.20   
Age              390.0   46.774359   16.435911  19.00   34.00   44.50   
Height           390.0   65.951282    3.918867  52.00   63.00   66.00   
Weight           390.0  177.407692   40.407824  99.00  150.25  173.00   
BMI              390.0   28.775641    6.600915  15.20   24.10   27.80   
Systolic BP      390.0  137.133333   22.859528  90.00  122.00  136.00   
Diastolic BP     390.0   83.289744   13.498192  48.00   75.00   82.00   
waist            390.0   37.869231    5.760947  26.00   33.00   37.00   
hip              390.0   42.992308    5.664342  30.

## 1- MinMaxScaler
## $ X_{sc} = \frac{X - X_{min}}{X_{max} - X_{min}}$

In [13]:
num_features = diabetes.select_dtypes(include='number').columns.tolist()
cat_features = diabetes.select_dtypes(include='object').columns.tolist()
num_features = num_features[1:]
cat_features = cat_features[:-1]
print(num_features)
print(cat_features)

['Cholesterol', 'Glucose', 'HDL Chol', 'Chol/HDL ratio', 'Age', 'Height', 'Weight', 'BMI', 'Systolic BP', 'Diastolic BP', 'waist', 'hip', 'Waist/hip ratio']
['Gender']


### Encoding and Scaling Features

In [16]:
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler,
)

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
scaler = MinMaxScaler()

encoder.fit(diabetes[cat_features])
encoded_cols = list(encoder.get_feature_names_out(cat_features))
diabetes[encoded_cols] = encoder.transform(diabetes[cat_features])

diabetes[num_features] = scaler.fit_transform(diabetes[num_features])

diabetes.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Gender,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Gender_female,Gender_male
0,1,0.315068,0.086053,0.342593,0.134831,0.0,female,0.375,0.088496,0.179803,0.175,0.289474,0.2,0.235294,0.347826,No diabetes,1.0,0.0
1,2,0.186301,0.091988,0.268519,0.117978,0.0,female,0.333333,0.159292,0.275862,0.1125,0.131579,0.233333,0.294118,0.326087,No diabetes,1.0,0.0
2,3,0.380822,0.080119,0.388889,0.140449,0.013699,female,0.625,0.389381,0.347291,0.125,0.315789,0.466667,0.441176,0.456522,No diabetes,1.0,0.0
3,4,0.405479,0.145401,0.537037,0.095506,0.013699,female,0.5,0.066372,0.108374,0.2,0.210526,0.166667,0.264706,0.23913,No diabetes,1.0,0.0
4,5,0.235616,0.127596,0.509259,0.050562,0.013699,female,0.75,0.185841,0.123153,0.2,0.5,0.2,0.264706,0.304348,No diabetes,1.0,0.0


In [17]:
diabetes.drop('Gender', axis=1, inplace=True)
diabetes.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Gender_female,Gender_male
0,1,0.315068,0.086053,0.342593,0.134831,0.0,0.375,0.088496,0.179803,0.175,0.289474,0.2,0.235294,0.347826,No diabetes,1.0,0.0
1,2,0.186301,0.091988,0.268519,0.117978,0.0,0.333333,0.159292,0.275862,0.1125,0.131579,0.233333,0.294118,0.326087,No diabetes,1.0,0.0
2,3,0.380822,0.080119,0.388889,0.140449,0.013699,0.625,0.389381,0.347291,0.125,0.315789,0.466667,0.441176,0.456522,No diabetes,1.0,0.0
3,4,0.405479,0.145401,0.537037,0.095506,0.013699,0.5,0.066372,0.108374,0.2,0.210526,0.166667,0.264706,0.23913,No diabetes,1.0,0.0
4,5,0.235616,0.127596,0.509259,0.050562,0.013699,0.75,0.185841,0.123153,0.2,0.5,0.2,0.264706,0.304348,No diabetes,1.0,0.0


## Model

In [26]:
data = diabetes.drop(columns=['Patient number', 'Diabetes'])
target = diabetes.Diabetes.map({'No diabetes':0, 'Diabetes':1})

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=.2, 
                                                    random_state=0, 
                                                    stratify=target)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((312, 15), (312,), (78, 15), (78,))

In [28]:
model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression()

In [29]:
preds = model.predict(X_test)
preds

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [30]:
from sklearn.metrics import roc_auc_score

score = roc_auc_score(y_test, preds)
score

0.6666666666666666

## Standard Scaler

## $X_{sc} = \frac{X - μ}{σ}$

In [32]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
scaler = StandardScaler()

encoder.fit(diabetes[cat_features])
encoded_cols = list(encoder.get_feature_names_out(cat_features))
diabetes[encoded_cols] = encoder.transform(diabetes[cat_features])

diabetes[num_features] = scaler.fit_transform(diabetes[num_features])
diabetes.drop('Gender', axis=1, inplace=True)
diabetes.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Gender_female,Gender_male
0,1,-0.319013,-0.564655,-0.073401,-0.360132,-1.692029,-1.26507,-1.447312,-0.951944,-0.838071,-0.985822,-1.020105,-0.882489,-0.565995,No diabetes,1.0,0.0
1,2,-1.372619,-0.527432,-0.536983,-0.533102,-1.692029,-1.520574,-1.05084,-0.360358,-1.276087,-1.875972,-0.846299,-0.52895,-0.70276,No diabetes,1.0,0.0
2,3,0.218998,-0.601879,0.216339,-0.302476,-1.631108,0.267951,0.237692,0.079539,-1.188484,-0.837464,0.370339,0.354899,0.117828,No diabetes,1.0,0.0
3,4,0.420753,-0.192418,1.143504,-0.763729,-1.631108,-0.49856,-1.571209,-1.391841,-0.662865,-1.430897,-1.19391,-0.705719,-1.249818,No diabetes,1.0,0.0
4,5,-0.969111,-0.304089,0.96966,-1.224982,-1.631108,1.034462,-0.902163,-1.300828,-0.662865,0.201045,-1.020105,-0.705719,-0.839524,No diabetes,1.0,0.0


In [33]:
X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=.2, 
                                                    random_state=0, 
                                                    stratify=target)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((312, 15), (312,), (78, 15), (78,))

In [34]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
score = roc_auc_score(y_test, preds)
score

0.6666666666666666

## Normalize

## $ \frac{x_i}{\sqrt{x_i^2 + y_i² + z_i²}}$

In [36]:
from sklearn.preprocessing import Normalizer

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
scaler = Normalizer()

encoder.fit(diabetes[cat_features])
encoded_cols = list(encoder.get_feature_names_out(cat_features))
diabetes[encoded_cols] = encoder.transform(diabetes[cat_features])

diabetes[num_features] = scaler.fit_transform(diabetes[num_features])
diabetes.drop('Gender', axis=1, inplace=True)
diabetes.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Gender_female,Gender_male
0,1,0.659521,0.263125,0.167443,0.013327,0.064927,0.20845,0.406648,0.076887,0.403231,0.239205,0.109351,0.129854,0.00287,No diabetes,1.0,0.0
1,2,0.552414,0.298909,0.15513,0.013621,0.071889,0.227019,0.510794,0.099889,0.408635,0.219452,0.124861,0.151346,0.00314,No diabetes,1.0,0.0
2,3,0.633668,0.21901,0.157687,0.011681,0.058403,0.195649,0.546064,0.08556,0.321214,0.210249,0.116805,0.131406,0.002599,No diabetes,1.0,0.0
3,4,0.698336,0.299728,0.216299,0.009888,0.0618,0.197759,0.352258,0.060564,0.376978,0.197759,0.095789,0.120509,0.002441,No diabetes,1.0,0.0
4,5,0.546404,0.303188,0.223226,0.007996,0.066635,0.233221,0.469774,0.067301,0.406471,0.286529,0.106615,0.129938,0.002732,No diabetes,1.0,0.0


In [37]:
X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=.2, 
                                                    random_state=0, 
                                                    stratify=target)

model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
score = roc_auc_score(y_test, preds)
score

0.6666666666666666