# Data Preprocessing

## 1. Import libraries and load Data

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 

In [3]:
# Location of the raw CSV, relative to this notebook.
RAW_DATA_PATH = '../data/Raw_Data/heart_disease.csv'

# Read the raw records into a DataFrame and preview the first rows.
raw_data = pd.read_csv(RAW_DATA_PATH)
raw_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## 2. Data Cleaning

We have to do the following steps:   
   * Handle missing or null values
   * Handle duplicate records
   * Handle outliers

### - Handle missing or null values

In [4]:
# Count the missing entries in every column (isnull is the pandas alias of isna).
raw_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

`Fortunately we have no missing data so we don't have to do anything.`

### - Handle duplicate records

In [5]:
# True when at least one row is flagged by DataFrame.duplicated(),
# i.e. it repeats an earlier row in every column.
raw_data.duplicated().any()

True

In [6]:
# Boolean mask marking every row that repeats an earlier one.
is_repeated = raw_data.duplicated()

# Keep only the flagged rows so they can be inspected before removal.
duplicate_rows = raw_data.loc[is_repeated]
print(duplicate_rows)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
164   38    1   2       138   175    0        1      173      0      0.0   

     slope  ca  thal  target  
164      2   4     2       1  


In [7]:
# Remove the repeated rows directly on raw_data (no new frame is created);
# with the default settings the first occurrence of each row is kept.
raw_data.drop_duplicates(inplace=True)

### - Handle outliers

Handling outliers is not always necessary for every column in a dataset. Here are some guidelines to help us decide whether and how to handle outliers for each column:

   * `Continuous Numerical Columns`: For columns containing continuous numerical data (e.g., age, income, temperature), you may want to consider handling outliers if they are likely to be data errors or measurement anomalies. Common techniques include capping/flooring, winsorizing, or transformations.

   * `Categorical Columns`: Outliers are not typically relevant for categorical columns.

   * `Ordinal Columns`: For columns with ordinal data (e.g., ratings on a scale from 1 to 5), the concept of outliers may not apply.

   * `Binary Columns`: Outliers are not relevant for binary columns.

   * `Datetime Columns`: For columns with datetime data, the concept of outliers may not be applicable

   * `Textual Columns`: Outliers are not relevant for textual columns.

>**According to the above description, handling outliers is only necessary for continuous columns, so we must first find out the names of these columns**

In [12]:
# Candidate columns: everything stored as a 64-bit float or integer.
numeric_columns = raw_data.select_dtypes(include=['float64', 'int64']).columns

# Define the threshold to distinguish continuous from discrete variables:
# a numeric column with fewer distinct values than this is treated as
# binary/categorical rather than continuous.
threshold = 8

# Excluding binary and categorical variables
# (the comparison operator was missing here, which made the cell a SyntaxError).
non_continuous_columns = [
    column
    for column in numeric_columns
    if raw_data[column].nunique() < threshold
]

# Whatever is left is considered continuous.
# Note: Index.difference returns the remaining names in sorted order.
continuous_variables = numeric_columns.difference(non_continuous_columns)
print(continuous_variables)


['age', 'trestbps', 'chol', 'thalach', 'oldpeak']


**There are several common techniques to deal with outliers in continuous variables**
* Capping/Flooring
* Winsorizing
* Transformation
* Imputation
* Removing Outliers

> **I used the first technique `(Capping/Flooring)` for this data**

In [75]:
# Cap each continuous column at 1.5 * IQR beyond its quartiles:
# values outside the fences are replaced by the nearest fence value.
for column_name in continuous_variables:
    values = raw_data[column_name]

    # Quartiles of the column as it currently stands.
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)

    # Distance from each quartile to its fence.
    whisker = 1.5 * (upper_quartile - lower_quartile)

    # Overwrite the column with its capped version.
    raw_data[column_name] = values.clip(lower_quartile - whisker, upper_quartile + whisker)

## 3. Data Transformation

* Feature Scaling
* Encoding Categorical Variables

### - Feature Scaling

In [76]:
# Min-max scale the continuous columns and write the result back into raw_data.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test rows influence the scaling parameters. Consider
# fitting on the training portion only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(raw_data[continuous_variables])
raw_data[continuous_variables] = scaled_values

In [77]:
# Display the frame after capping and scaling to inspect the transformed columns.
raw_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1,3,0.671053,0.437852,1,0,0.558855,0,0.575,0,0,1,1
1,0.166667,1,2,0.473684,0.507417,0,1,0.872747,0,0.875,0,0,2,1
2,0.250000,0,1,0.473684,0.319182,0,0,0.745493,0,0.350,2,0,2,1
3,0.562500,1,1,0.342105,0.450128,0,1,0.796394,0,0.200,2,0,2,1
4,0.583333,0,0,0.342105,0.932992,0,1,0.669141,1,0.150,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0,0,0.605263,0.470588,0,1,0.329799,1,0.050,1,0,3,0
299,0.333333,1,3,0.210526,0.564706,0,1,0.406151,0,0.300,1,0,3,0
300,0.812500,1,0,0.657895,0.274169,1,1,0.482503,0,0.850,1,2,3,0
301,0.583333,1,0,0.473684,0.020460,0,1,0.261930,1,0.300,1,1,3,0


### - Encoding Categorical Variables

In [92]:
# Number of distinct values held by each column.
unique_value_counts = raw_data.nunique()

# Filter the columns with more than two (non-binary categorical columns) and less than five unique values
categorical_variables = unique_value_counts[(unique_value_counts > 2) & (unique_value_counts < 5)].index.tolist()

# One-Hot Encoding
# drop_first=True omits the first level of every encoded column.
# dtype=int keeps the indicator columns as 0/1 integers; pandas >= 2.0 would
# otherwise produce booleans, which are written to CSV as True/False.
df_encoded = pd.get_dummies(raw_data, columns=categorical_variables, drop_first=True, dtype=int)
df_encoded

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3
0,0.708333,1,0.671053,0.437852,1,0.558855,0,0.575,0,1,0,0,1,0,0,0,0,1,0,0
1,0.166667,1,0.473684,0.507417,0,0.872747,0,0.875,0,1,0,1,0,1,0,0,0,0,1,0
2,0.250000,0,0.473684,0.319182,0,0.745493,0,0.350,0,1,1,0,0,0,0,0,1,0,1,0
3,0.562500,1,0.342105,0.450128,0,0.796394,0,0.200,0,1,1,0,0,1,0,0,1,0,1,0
4,0.583333,0,0.342105,0.932992,0,0.669141,1,0.150,0,1,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0,0.605263,0.470588,0,0.329799,1,0.050,0,0,0,0,0,1,0,1,0,0,0,1
299,0.333333,1,0.210526,0.564706,0,0.406151,0,0.300,0,0,0,0,1,1,0,1,0,0,0,1
300,0.812500,1,0.657895,0.274169,1,0.482503,0,0.850,2,0,0,0,0,1,0,1,0,0,0,1
301,0.583333,1,0.473684,0.020460,0,0.261930,1,0.300,1,0,0,0,0,1,0,1,0,0,0,1


## 4. Feature Engineering

In [None]:
# todo

## 5. Handling Data Imbalance (if applicable)

In [93]:
# todo

## 6. Data Splitting

In [96]:
# Seed NumPy's global random generator (kept for any later code that relies on it).
np.random.seed(42)

# Separate the predictors from the label column.
X = df_encoded.drop('target', axis=1)
y = df_encoded['target']

# Hold out 20% of the rows for testing. The seed is passed explicitly so the
# split is reproducible even if something consumes the global random state
# between seeding and splitting (e.g. cells run out of order).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 7. Save Preprocessed Data

In [97]:
# Destination file for each split; written without the index column.
processed_outputs = {
    '../data/Processed_Data/X_train.csv': X_train,
    '../data/Processed_Data/X_test.csv': X_test,
    '../data/Processed_Data/y_train.csv': y_train,
    '../data/Processed_Data/y_test.csv': y_test,
}

for output_path, split in processed_outputs.items():
    split.to_csv(output_path, index=False)