# Import Libraries  
We start by importing the necessary libraries for data analysis and processing.  


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Dataset  
Here, we load the dataset from a CSV file into a Pandas DataFrame.  


In [None]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='heart_2022_no_nans.csv')

# Explore the Dataset  
Check the general information about the dataset, descriptive statistics, and the first five rows.  


In [None]:
# Overview of the loaded frame: structure, numeric summary statistics, sample rows.
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

print("\nDescriptive Statistics for Numerical Columns:")
print(data.describe())

print("\nFirst Five Rows of the Dataset:")
print(data.head())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinC

# Check for Missing Values  
Ensure there are no missing values in the dataset.  


In [None]:
# Per-column tally of null entries.
print("\nMissing Values Count in Each Column:")
null_counts = data.isna().sum()  # isna() is the pandas alias of isnull()
print(null_counts)


Missing Values Count in Each Column:
State                        0
Sex                          0
GeneralHealth                0
PhysicalHealthDays           0
MentalHealthDays             0
LastCheckupTime              0
PhysicalActivities           0
SleepHours                   0
RemovedTeeth                 0
HadHeartAttack               0
HadAngina                    0
HadStroke                    0
HadAsthma                    0
HadSkinCancer                0
HadCOPD                      0
HadDepressiveDisorder        0
HadKidneyDisease             0
HadArthritis                 0
HadDiabetes                  0
DeafOrHardOfHearing          0
BlindOrVisionDifficulty      0
DifficultyConcentrating      0
DifficultyWalking            0
DifficultyDressingBathing    0
DifficultyErrands            0
SmokerStatus                 0
ECigaretteUsage              0
ChestScan                    0
RaceEthnicityCategory        0
AgeCategory                  0
HeightInMeters               0
W

# Identify Categorical Columns  
List all categorical (object-dtype) columns so that they can be converted to numerical values in a later step.  


In [None]:
# Labels of every object-dtype column; the encoding cell iterates over this Index.
categorical_columns = data.select_dtypes(include='object').columns
print("\nCategorical Columns:", categorical_columns)


Categorical Columns: Index(['State', 'Sex', 'GeneralHealth', 'LastCheckupTime',
       'PhysicalActivities', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')


# Encode Categorical Columns  
Transform categorical columns into numerical values using LabelEncoder.  


In [None]:
# Integer-encode every categorical column of `data` in place.
#
# The single encoder is refit for each column, which overwrites its previous
# label-to-integer mapping. To keep the encoding reversible, each column's
# fitted labels are recorded in `category_labels`:
#   category_labels[col][i] is the original label that was encoded as integer i.
encoder = LabelEncoder()
category_labels = {}
for col in categorical_columns:
    data[col] = encoder.fit_transform(data[col])
    # Snapshot the labels now, before the next fit replaces encoder.classes_.
    category_labels[col] = list(encoder.classes_)

# NOTE(review): LabelEncoder numbers labels by their sorted order, not by any
# domain order — confirm this is acceptable for ordered columns such as
# GeneralHealth or AgeCategory.

print("\nFirst Five Rows After Encoding Categorical Columns:")
print(data.head())


First Five Rows After Encoding Categorical Columns:
   State  Sex  GeneralHealth  PhysicalHealthDays  MentalHealthDays  \
0      0    0              4                 4.0               0.0   
1      0    1              4                 0.0               0.0   
2      0    1              4                 0.0               0.0   
3      0    0              1                 5.0               0.0   
4      0    0              2                 3.0              15.0   

   LastCheckupTime  PhysicalActivities  SleepHours  RemovedTeeth  \
0                3                   1         9.0             3   
1                3                   1         6.0             3   
2                3                   0         8.0             1   
3                3                   1         9.0             3   
4                3                   1         5.0             0   

   HadHeartAttack  ...  HeightInMeters  WeightInKilograms    BMI  \
0               0  ...            1.60           

# Split Features and Target  
Separate the input features (X) and the target variable (y).  


In [None]:
# Target is the HadHeartAttack column; the features are every remaining column.
y = data['HadHeartAttack']
X = data.drop(columns='HadHeartAttack')

# Split Data into Training and Testing Sets  
Divide the data into training and testing sets to train and evaluate the model later.  


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition.
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Shape of X_train: (196817, 39)
Shape of X_test: (49205, 39)
Shape of y_train: (196817,)
Shape of y_test: (49205,)


# Save Split Data  
Save the split datasets as separate CSV files for future use.  


In [None]:
# Write each partition to its own CSV in the working directory. index=False
# leaves the row index out of the files.
for filename, part in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    part.to_csv(filename, index=False)
print("Data saved successfully")


Data saved successfully!
