# Data Preprocessing

## 1. Import Libraries and Load Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 

# Make the project-local helper module importable before importing from it.
# NOTE(review): '../Scripts/' is a relative entry, so it is resolved against
# the kernel's current working directory; presumably that is this notebook's
# folder and ../Scripts/ holds data_preprocessing.py — confirm.
import sys
sys.path.append('../Scripts/')
# NOTE(review): the wildcard import hides where the helpers called below
# (load_data, handle_outliers, split_train_test, ...) come from; consider
# importing them by name once the module's public names are confirmed.
from data_preprocessing import *

In [2]:
# Load the raw heart-disease CSV through the load_data helper and preview the
# first rows to sanity-check the columns.
# NOTE(review): the path is relative, so it depends on the kernel's working
# directory.
file_path = '../data/Raw_Data/heart_disease.csv'
raw_data = load_data(file_path)
raw_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## 2. Data Cleaning

We have to do the following steps:   
   * Handle missing or null values
   * Handle duplicate records
   * Handle outliers

### - Handle missing or null values

In [3]:
# First cleaning step: the result is bound to a new name, cleaned_data.
# NOTE(review): how handle_null_values treats missing values (drop vs. impute)
# and whether it modifies raw_data in place is not visible from this notebook;
# confirm in data_preprocessing.
cleaned_data = handle_null_values(raw_data)

### - Handle duplicate records

In [4]:
# Second cleaning step, applied to the output of the null-handling cell.
# cleaned_data is rebound to its own transformed value, so this cell expects
# the previous cell to have run immediately before it.
# NOTE(review): which rows count as duplicates is decided inside
# handle_duplicates and is not visible here.
cleaned_data = handle_duplicates(cleaned_data)

### - Handle outliers

Handling outliers is not always necessary for every column in a dataset. Here are some guidelines to help us decide whether and how to handle outliers for each column:

   * `Continuous Numerical Columns`: For columns containing continuous numerical data (e.g., age, income, temperature), you may want to consider handling outliers if they are likely to be data errors or measurement anomalies. Common techniques include capping/flooring, winsorizing, or transformations.

   * `Categorical Columns`: Outliers are not typically relevant for categorical columns.

   * `Ordinal Columns`: For columns with ordinal data (e.g., ratings on a scale from 1 to 5), the concept of outliers may not apply.

   * `Binary Columns`: Outliers are not relevant for binary columns.

   * `Datetime Columns`: For columns with datetime data, the concept of outliers may not be applicable.

   * `Textual Columns`: Outliers are not relevant for textual columns.

>**According to the above description, handling outliers is only necessary for continuous columns, so we must first find out the names of these columns.**

In [5]:
# Collect the continuous columns; the result is passed to the outlier-handling
# and scaling cells further down.
# NOTE(review): threshold=8 is presumably the unique-value count that separates
# continuous from discrete columns — TODO confirm against
# extract_continuous_variables.
continuous_variables = extract_continuous_variables(cleaned_data, threshold=8)

**There are several common techniques to deal with outliers in continuous variables:**
* Capping/Flooring
* Winsorizing
* Transformation
* Imputation
* Removing Outliers

> **I used the first technique (`Capping/Flooring`) for this data.**

In [6]:
# Apply outlier handling to the continuous columns only. A copy is passed in,
# so the helper receives its own frame rather than the object currently bound
# to cleaned_data; the name is then rebound to the helper's return value.
# NOTE(review): the bare `cleaned_data` on the last line renders the whole
# frame; `.head()` would keep the output short.
cleaned_data = handle_outliers(cleaned_data.copy(), continuous_variables)
cleaned_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233.0,1,0,150.0,0,2.3,0,0,1,1
1,37,1,2,130,250.0,0,1,187.0,0,3.5,0,0,2,1
2,41,0,1,130,204.0,0,0,172.0,0,1.4,2,0,2,1
3,56,1,1,120,236.0,0,1,178.0,0,0.8,2,0,2,1
4,57,0,0,120,354.0,0,1,163.0,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241.0,0,1,123.0,1,0.2,1,0,3,0
299,45,1,3,110,264.0,0,1,132.0,0,1.2,1,0,3,0
300,68,1,0,144,193.0,1,1,141.0,0,3.4,1,2,3,0
301,57,1,0,130,131.0,0,1,115.0,1,1.2,1,1,3,0


## 3. Data Transformation

* Feature Scaling
* Encoding Categorical Variables

### - Feature Scaling

In [7]:
# Rescale the continuous columns; a copy is passed in and cleaned_data is
# rebound to the result.
# NOTE(review): despite the name standardize_numerical, the values displayed
# below lie in [0, 1], which looks like min-max scaling rather than z-score
# standardization — confirm in data_preprocessing.
# NOTE(review): scaling is applied to the full dataset before the train/test
# split in section 6, so test rows may influence the scaling parameters;
# consider fitting the scaler on the training split only.
cleaned_data = standardize_numerical(cleaned_data.copy(), continuous_variables)
cleaned_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1,3,0.671053,0.437852,1,0,0.558855,0,0.575,0,0,1,1
1,0.166667,1,2,0.473684,0.507417,0,1,0.872747,0,0.875,0,0,2,1
2,0.250000,0,1,0.473684,0.319182,0,0,0.745493,0,0.350,2,0,2,1
3,0.562500,1,1,0.342105,0.450128,0,1,0.796394,0,0.200,2,0,2,1
4,0.583333,0,0,0.342105,0.932992,0,1,0.669141,1,0.150,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0,0,0.605263,0.470588,0,1,0.329799,1,0.050,1,0,3,0
299,0.333333,1,3,0.210526,0.564706,0,1,0.406151,0,0.300,1,0,3,0
300,0.812500,1,0,0.657895,0.274169,1,1,0.482503,0,0.850,1,2,3,0
301,0.583333,1,0,0.473684,0.020460,0,1,0.261930,1,0.300,1,1,3,0


### - Encoding Categorical Variables

In [8]:
# Pick out the categorical columns, then expand them into indicator columns.
# The encoded frame gets a new name (df_encoded); cleaned_data is not rebound.
categorical_variables = extract_categorical_variables(cleaned_data)
# One-hot encoding. In the output below cp, restecg, slope and thal are
# replaced by *_1, *_2, ... columns with no *_0 column, so the first level of
# each variable appears to be dropped — confirm in encode_categorical.
df_encoded = encode_categorical(cleaned_data, categorical_variables)
df_encoded

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3
0,0.708333,1,0.671053,0.437852,1,0.558855,0,0.575,0,1,0,0,1,0,0,0,0,1,0,0
1,0.166667,1,0.473684,0.507417,0,0.872747,0,0.875,0,1,0,1,0,1,0,0,0,0,1,0
2,0.250000,0,0.473684,0.319182,0,0.745493,0,0.350,0,1,1,0,0,0,0,0,1,0,1,0
3,0.562500,1,0.342105,0.450128,0,0.796394,0,0.200,0,1,1,0,0,1,0,0,1,0,1,0
4,0.583333,0,0.342105,0.932992,0,0.669141,1,0.150,0,1,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0,0.605263,0.470588,0,0.329799,1,0.050,0,0,0,0,0,1,0,1,0,0,0,1
299,0.333333,1,0.210526,0.564706,0,0.406151,0,0.300,0,0,0,0,1,1,0,1,0,0,0,1
300,0.812500,1,0.657895,0.274169,1,0.482503,0,0.850,2,0,0,0,0,1,0,1,0,0,0,1
301,0.583333,1,0.473684,0.020460,0,0.261930,1,0.300,1,0,0,0,0,1,0,1,0,0,0,1


## 4. Feature Engineering

In [9]:
# TODO: feature engineering is not implemented yet; df_encoded is passed to
# the splitting step unchanged.

## 5. Handling Data Imbalance (if applicable)

In [10]:
# TODO: class-imbalance handling is not implemented yet; check the balance of
# the `target` column before deciding whether resampling is needed.

## 6. Data Splitting

In [11]:
# Separate the feature columns from the 'target' column.
X,y = split_features_target(df_encoded,'target')

# Split into train and test parts with a fixed random_state.
# NOTE(review): the unpacking order here is (X_train, y_train, X_test, y_test),
# which differs from scikit-learn's train_test_split order
# (X_train, X_test, y_train, y_test). Confirm that split_train_test really
# returns its values in this order; otherwise y_train and X_test are swapped.
# NOTE(review): the test-set proportion is whatever split_train_test defaults
# to; it is not visible here.
X_train, y_train, X_test, y_test = split_train_test(X, y, random_state=42)
# NOTE(review): displaying the whole tuple dumps all four objects; showing
# only their shapes would keep the output short.
X_train, y_train, X_test, y_test

(          age  sex  trestbps      chol  fbs   thalach  exang  oldpeak  ca  \
 132  0.270833    1  0.342105  0.691560    0  0.660657      0    0.000   0   
 203  0.812500    1  1.000000  0.605627    1  0.558855      1    0.400   0   
 197  0.791667    1  0.407895  0.523785    1  0.669141      0    0.050   2   
 75   0.541667    0  0.539474  0.507417    0  0.652174      0    0.350   0   
 177  0.729167    1  0.605263  0.855243    0  0.626723      0    0.000   0   
 ..        ...  ...       ...       ...  ...       ...    ...      ...  ..   
 189  0.250000    1  0.210526  0.188235    0  0.626723      0    0.000   0   
 71   0.458333    1  0.000000  0.413299    0  0.592789      1    0.000   1   
 106  0.833333    1  0.868421  0.441944    1  0.397667      0    0.025   1   
 271  0.666667    1  0.526316  0.441944    0  0.516437      0    0.650   2   
 102  0.708333    0  0.605263  0.282353    0  0.804878      0    0.000   2   
 
      cp_1  cp_2  cp_3  restecg_1  restecg_2  slope_1  slope_2

## 7. Save Preprocessed Data

In [12]:
# Persist the four splits under ../data/Processed_Data (relative to the
# kernel's working directory).
# NOTE(review): file names and format are chosen inside save_processed_data,
# and whether it creates the directory or overwrites existing files is not
# visible here — confirm before re-running.
save_processed_data(X_train, y_train, X_test, y_test, output_path='../data/Processed_Data')