In [107]:
from scipy import stats
import pandas as pd
import numpy as np

###### Start of code #####

# Read the heart-disease records into the working DataFrame. The path is
# relative, so it is resolved against the notebook's working directory.
hd_df = pd.read_csv(filepath_or_buffer='heart disease.csv')

<b>Show dataset attributes and descriptive statistics</b>

In [108]:
# Dataset makeup: first rows, then column dtypes and non-null counts
print("Dataset")
print(hd_df.head())
print("\nDataset information")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
hd_df.info()

# Descriptive statistics
print("\nDescriptive Statistics")
print(hd_df.describe())

# Duplicates? Lists every row that exactly repeats an earlier row.
print("\nDuplicate rows:\n", hd_df[hd_df.duplicated()])

Dataset
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  

Dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          -------

In [109]:
# Report how many rows hold a literal 0 in each of the checked columns.
# count_zeros_col keeps the count of the last column checked (Cholesterol).
for zero_check_col in ('RestingBP', 'Age', 'Cholesterol'):
    count_zeros_col = hd_df[zero_check_col].eq(0).sum()
    print(f"Number of zeros in {zero_check_col}: {count_zeros_col}")

Number of zeros in RestingBP: 1
Number of zeros in Age: 0
Number of zeros in Cholesterol: 172


<b>Drop rows where RestingBP is 0.</b>
The dropped rows make up fewer than 1% of the data set. Rows with a Cholesterol value of 0 are kept; those values are imputed in a later step.

In [110]:
# Keep only the records whose RestingBP is non-zero, then renumber the rows
# from 0 so the index has no gaps after the removal.
hd_df = hd_df.loc[hd_df['RestingBP'].ne(0)].reset_index(drop=True)

<b>Show unique values of categorical columns for creating dummy variables</b>

In [111]:
# Columns treated as categorical in the steps that follow
cat_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'FastingBS']

# Collect the distinct values of every categorical column first ...
unique_values = dict(
    (col, hd_df[col].unique().tolist()) for col in cat_columns
)

# ... then report them one column per line.
for col in unique_values:
    values = unique_values[col]
    print(f"Unique values in '{col}': {values}")

Unique values in 'Sex': ['M', 'F']
Unique values in 'ChestPainType': ['ATA', 'NAP', 'ASY', 'TA']
Unique values in 'RestingECG': ['Normal', 'ST', 'LVH']
Unique values in 'ExerciseAngina': ['N', 'Y']
Unique values in 'ST_Slope': ['Up', 'Flat', 'Down']
Unique values in 'FastingBS': [0, 1]


<b>Transform coded values (single characters and 0/1) into more descriptive labels</b>

In [112]:
# Replace terse codes with readable labels so that the dummy-variable column
# headers generated from these columns make more sense:
#   Sex            M/F -> Male/Female
#   ExerciseAngina N/Y -> No/Yes
#   FastingBS      0/1 -> No/Yes
value_labels = {
    'Sex': {'M': 'Male', 'F': 'Female'},
    'ExerciseAngina': {'N': 'No', 'Y': 'Yes'},
    'FastingBS': {0: 'No', 1: 'Yes'},
}

for coded_col, labels in value_labels.items():
    hd_df[coded_col] = hd_df[coded_col].replace(labels)


<b>Create binary dummy variables and map ordinal values to integers</b>

In [113]:
# Include dummy variables for columns containing binary values.
# With drop_first=True each two-valued column yields a single 0/1 indicator.
dummy_df1 = pd.get_dummies(hd_df['Sex'], prefix='Sex', drop_first=True, dtype=int)
dummy_df2 = pd.get_dummies(hd_df['ExerciseAngina'], prefix='Ex_Ang', drop_first=True, dtype=int)
dummy_df3 = pd.get_dummies(hd_df['FastingBS'], prefix='FastingBS', drop_first=True, dtype=int)
dummy_frames = [dummy_df1, dummy_df2, dummy_df3]

# pd.concat appends columns without de-duplicating labels, so re-running this
# cell would leave hd_df with repeated indicator columns. Drop any indicators
# left behind by an earlier run before attaching the fresh ones; on a first
# run nothing is dropped and the result is unchanged.
dummy_cols = [name for frame in dummy_frames for name in frame.columns]
hd_df = pd.concat(
    [hd_df.drop(columns=dummy_cols, errors='ignore'), *dummy_frames],
    axis=1,
)

# Create new integer-coded columns corresponding to the ordinal columns.
# Series.map leaves NaN for any value that is missing from the mapping.
mapping = {'ASY': 0, 'NAP': 1, 'ATA': 2, 'TA': 3}
hd_df['ChestPainType_num'] = hd_df['ChestPainType'].map(mapping)

mapping = {'Normal': 0, 'ST': 1, 'LVH': 2}
hd_df['RestingECG_num'] = hd_df['RestingECG'].map(mapping)

mapping = {'Up': 0, 'Flat': 1, 'Down': 2}
hd_df['ST_Slope_num'] = hd_df['ST_Slope'].map(mapping)

<b>Impute missing cholesterol values using predictive mean matching (PMM)</b>

In [122]:
# Initialize MICEData with the dataset
from statsmodels.imputation.mice import MICEData
import statsmodels.formula.api as smf

# The imputation is stochastic. Seed NumPy's global random state so that a
# fresh "Restart & Run All" reproduces the same imputed values.
# NOTE(review): this assumes statsmodels' MICEData draws from the global
# np.random state - confirm for the installed version. Seeding here also
# affects any later cell that uses the global generator.
np.random.seed(42)

# First entry is the column to impute; the rest are its predictors.
cols_to_include = ['Cholesterol', 'Age', 'RestingBP', 'MaxHR', 'Oldpeak', 'Sex_Male', 'Ex_Ang_Yes',
                   'FastingBS_Yes', 'ChestPainType_num', 'RestingECG_num', 'ST_Slope_num']

# Mark zero cholesterol readings as missing so they get imputed
hd_df['Cholesterol'] = hd_df['Cholesterol'].replace(0, np.nan)
mice_data_df = hd_df[cols_to_include].copy()

mice_data = MICEData(mice_data_df)

# Specify imputation for Cholesterol using specific columns as predictors.
# The right-hand-side formula is derived from cols_to_include so the list and
# the formula cannot drift apart.
predictor_cols = ' + '.join(col for col in cols_to_include if col != 'Cholesterol')
mice_data.set_imputer('Cholesterol', formula=predictor_cols)

# Perform imputation using other columns as predictors
mice_data.update_all(n_iter=10)  # Run MICE for 10 iterations

# Get the imputed dataset
imputed_data = mice_data.data

In [130]:
# Store the imputed Cholesterol values in their own column so they can be
# checked against the original readings. The assignment aligns on row index.
hd_df['Imputed_Cholesterol'] = imputed_data['Cholesterol']

original_chol = hd_df['Cholesterol']
imputed_chol = hd_df['Imputed_Cholesterol']

# How many values are missing in the original column
print("\nNumber of rows where Cholestoral = NaN: ", original_chol.isna().sum())

# How many zeros remain in the imputed column
print("\nNumber of rows where Imputed Cholestoral = 0: ", imputed_chol.eq(0).sum())

# How many values are still missing in the imputed column
print("\nNumber of rows where Imputed Cholestoral = NaN: ", imputed_chol.isna().sum())

# Side-by-side comparison of the rows where the two columns differ. NaN never
# compares equal, so rows that were missing in the original are included.
differs = imputed_chol != original_chol
print("\n", hd_df.loc[differs, ['Cholesterol', 'Imputed_Cholesterol']])




Number of rows where Cholestoral = NaN:  171

Number of rows where Imputed Cholestoral = 0:  0

Number of rows where Imputed Cholestoral = NaN:  0

      Cholesterol  Imputed_Cholesterol
293          NaN                246.0
294          NaN                159.0
295          NaN                260.0
296          NaN                182.0
297          NaN                179.0
..           ...                  ...
513          NaN                246.0
514          NaN                212.0
517          NaN                126.0
534          NaN                248.0
535          NaN                243.0

[171 rows x 2 columns]


<b>Display results after preparation</b>

In [131]:
# Check results: first rows, then column dtypes and non-null counts
print("Dataset")
print(hd_df.head())
print("\nDataset information")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
hd_df.info()
# Descriptive statistics
print("\nDescriptive Statistics")
print(hd_df.describe())

Dataset
   Age     Sex ChestPainType  RestingBP  Cholesterol FastingBS RestingECG  \
0   40    Male           ATA        140        289.0        No     Normal   
1   49  Female           NAP        160        180.0        No     Normal   
2   37    Male           ATA        130        283.0        No         ST   
3   48  Female           ASY        138        214.0        No     Normal   
4   54    Male           NAP        150        195.0        No     Normal   

   MaxHR ExerciseAngina  Oldpeak ST_Slope  HeartDisease  Sex_Male  Ex_Ang_Yes  \
0    172             No      0.0       Up             0         1           0   
1    156             No      1.0     Flat             1         0           0   
2     98             No      0.0       Up             0         1           0   
3    108            Yes      1.5     Flat             1         0           1   
4    122             No      0.0       Up             0         1           0   

   FastingBS_Yes  ChestPainType_num  Resti