### DATA TRANSFORMATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading the dataset:
df = pd.read_csv("../../data/processed/heart_2022_cleaned.csv")

In [3]:
# Making a copy from the cleaned dataframe:
df_transformed = df.copy()

### Numerical Variables

##### Transforming Ratio Variables

In [4]:
# Feature Scaling:
numeric_cols = ["PhysicalHealthDays", "MentalHealthDays", "SleepHours", "BMI"]
scaler = StandardScaler()
df_transformed[numeric_cols] = scaler.fit_transform(df_transformed[numeric_cols])

#

### Categorial Variables

##### Encoding Binary Variables

In [5]:
# Encoding "Sex" with 0 and 1's.:
df_transformed["Sex"] = df_transformed["Sex"].replace({"Female": 0, "Male": 1})
df_transformed["Sex"].value_counts()

  df_transformed["Sex"] = df_transformed["Sex"].replace({"Female": 0, "Male": 1})


Sex
0    161516
1    152655
Name: count, dtype: int64

In [6]:
# Encoding "HadDiabetes" with 0 and 1's, but first we replace the strings with No and Yes:
df_transformed["HadDiabetes"] = df_transformed["HadDiabetes"].replace(
    {
        "No, pre-diabetes or borderline diabetes": "No",
        "Yes, but only during pregnancy (female)": "Yes",
    }
)
df_transformed["HadDiabetes"] = df_transformed["HadDiabetes"].replace(
    {"No": 0, "Yes": 1}
)
df_transformed["HadDiabetes"].value_counts()

  df_transformed["HadDiabetes"] = df_transformed["HadDiabetes"].replace(


HadDiabetes
0    269710
1     44461
Name: count, dtype: int64

##### Encoding Nominal Variables

In [7]:
# One-hot Encoding "Race" into 6 groups, which will help us identify an individual's race:
df_transformed["RaceEthnicityCategory"].unique()
df_race = pd.get_dummies(df_transformed["RaceEthnicityCategory"]).astype(int)
df_transformed = pd.concat([df_transformed, df_race], axis=1)
df_transformed.drop(columns=["RaceEthnicityCategory"], inplace=True)
df_transformed

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,PhysicalActivities,SleepHours,HadAsthma,HadDepressiveDisorder,HadKidneyDisease,HadDiabetes,...,AgeCategory,BMI,AlcoholDrinkers,HIVTesting,HadHeartDisease,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic"
0,0,Excellent,-0.492507,-0.524722,No,-0.706793,No,No,No,0,...,Age 80 or older,-0.308076,No,No,0,0,0,0,0,1
1,0,Very good,-0.255564,-0.160389,Yes,-1.399460,No,No,No,0,...,Age 55 to 59,-0.455420,No,No,0,0,0,0,0,1
2,0,Fair,-0.255564,-0.524722,Yes,1.371209,No,No,No,0,...,Age 40 to 44,-1.044798,Yes,No,0,0,0,0,0,1
3,1,Poor,-0.374035,-0.524722,No,-0.014126,No,No,No,1,...,Age 80 or older,-0.383283,No,No,1,0,0,0,0,1
4,0,Very good,-0.492507,-0.524722,Yes,-0.014126,No,No,No,0,...,Age 80 or older,-0.862152,Yes,No,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314166,0,Fair,-0.492507,0.325388,Yes,-0.014126,No,Yes,No,0,...,Age 25 to 29,-0.650345,No,No,0,1,0,0,0,0
314167,1,Good,-0.492507,1.296943,Yes,-0.014126,No,No,No,1,...,Age 65 to 69,0.196885,Yes,Yes,1,0,0,1,0,0
314168,0,Excellent,-0.255564,-0.281833,Yes,-0.014126,No,No,No,0,...,Age 50 to 54,0.012705,No,Yes,0,1,0,0,0,0
314169,1,Very good,-0.492507,-0.524722,No,-1.399460,Yes,No,No,0,...,Age 70 to 74,0.609757,No,Yes,1,1,0,0,0,0


##### Encoding Ordinal Variables

In [8]:
# Categorizing and Encoding "AgeCategory" into 13 groups:
age_ranges = df_transformed["AgeCategory"].unique()
age_codes, _ = pd.factorize(age_ranges, sort=True)
age_range_to_code = dict(zip(age_ranges, age_codes))
df_transformed["AgeCategory"] = df_transformed["AgeCategory"].replace(age_range_to_code)
df_transformed["AgeCategory"].value_counts().sort_index()

  df_transformed["AgeCategory"] = df_transformed["AgeCategory"].replace(age_range_to_code)


AgeCategory
0     19736
1     16021
2     18423
3     20650
4     21915
5     20876
6     24502
7     26940
8     32363
9     34407
10    31508
11    23025
12    23805
Name: count, dtype: int64

In [9]:
# Categorizing and Encoding "GenHGeneralHealthealth" into 5 distinct groups:
df_transformed["GeneralHealth"].unique()
df_transformed["GeneralHealth"] = df_transformed["GeneralHealth"].replace(
    {"Poor": 0, "Fair": 1, "Good": 2, "Very good": 3, "Excellent": 4}
)
df_transformed["GeneralHealth"].value_counts().sort_index()

  df_transformed["GeneralHealth"] = df_transformed["GeneralHealth"].replace(


GeneralHealth
0     12210
1     39583
2     99551
3    110462
4     52365
Name: count, dtype: int64

In [10]:
# Categorizing and Encoding "SmokerStatus" into 4 distinct groups:
df_transformed["SmokerStatus"].unique()
df_transformed["SmokerStatus"] = df_transformed["SmokerStatus"].replace(
    {
        "Never smoked": 0,
        "Former smoker": 1,
        "Current smoker - now smokes some days": 2,
        "Current smoker - now smokes every day": 3,
    }
)
df_transformed["SmokerStatus"].value_counts().sort_index()

  df_transformed["SmokerStatus"] = df_transformed["SmokerStatus"].replace(


SmokerStatus
0    189861
1     86754
2     10342
3     27214
Name: count, dtype: int64

In [11]:
# Categorizing and Encoding "ECigaretteUsage" into 4 distinct groups:
df_transformed["ECigaretteUsage"].unique()
df_transformed["ECigaretteUsage"] = df_transformed["ECigaretteUsage"].replace(
    {
        "Never used e-cigarettes in my entire life": 0,
        "Not at all (right now)": 1,
        "Use them some days": 2,
        "Use them every day": 3,
    }
)
df_transformed["ECigaretteUsage"].value_counts().sort_index()

  df_transformed["ECigaretteUsage"] = df_transformed["ECigaretteUsage"].replace(


ECigaretteUsage
0    240458
1     56504
2      9051
3      8158
Name: count, dtype: int64

> Overall, this transformed data will help us in the next phase, as it ensures standardized inputs, and simplifies the categorical variables into numerical formats. The encoding techniques applied makes the dataset more accessible and manageable for us in the next phase, which is data modeling.