In [3]:
# Put the Kaggle API token under ~/.kaggle so the `kaggle` command used in the
# download cell can authenticate (presumably the CLI's default lookup location).
# NOTE(review): assumes kaggle.json was uploaded to /content beforehand; because
# the file is moved rather than copied, re-running this cell fails on the `mv`.
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets download johnsmith88/heart-disease-dataset

Dataset URL: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset
License(s): unknown
Downloading heart-disease-dataset.zip to /content
  0% 0.00/6.18k [00:00<?, ?B/s]
100% 6.18k/6.18k [00:00<00:00, 14.8MB/s]


In [26]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer

# Load the raw dataset extracted from the Kaggle archive.
df = pd.read_csv('/content/heart.csv')

# Drop incomplete rows. `df` itself is left untouched so the raw data stays
# available and this cell can be re-run safely.
df_cleaned = df.dropna()

# Report the distinct values of every column; this is what tells the
# categorical features apart from the continuous ones.
for column in df_cleaned.columns:
    unique_values = df_cleaned[column].unique()
    print(f"Column: {column}")
    print(f"Unique values: {unique_values}")
    print(f"Number of unique values: {len(unique_values)}\n")

# 1. One-hot encode the categorical variables (one 0/1 column per category).
categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']
encoder = OneHotEncoder(sparse_output=False)  # dense output (`sparse=False` in older scikit-learn)
encoded_categorical = encoder.fit_transform(df_cleaned[categorical_columns])

# Build the encoded frame on df_cleaned's own index. dropna() keeps the
# original row labels, so a default 0..n-1 index here would misalign rows in
# the axis=1 concat below (and reintroduce NaNs) whenever a row was dropped.
encoded_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df_cleaned.index,
)

# Replace the original categorical columns with their one-hot encoding.
df_cleaned = df_cleaned.drop(columns=categorical_columns)
df_cleaned = pd.concat([df_cleaned, encoded_df], axis=1)

# 2. Discretise the continuous variables into 2 equal-width bins, encoded as
#    the ordinal bin index (0.0 or 1.0), so each feature becomes binary.
continuous_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
binning = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
binned_data = binning.fit_transform(df_cleaned[continuous_columns])

# Same index handling as for the one-hot columns above.
binned_df = pd.DataFrame(
    binned_data,
    columns=[f'{col}_bin' for col in continuous_columns],
    index=df_cleaned.index,
)

# Replace the original continuous columns with their binned versions.
df_cleaned = df_cleaned.drop(columns=continuous_columns)
df_cleaned = pd.concat([df_cleaned, binned_df], axis=1)

# Sanity check: the transforms must neither add rows nor leave missing values.
assert len(df_cleaned) == len(encoded_df) == len(binned_df), "row count changed during concat"
assert not df_cleaned.isna().any().any(), "NaNs introduced by misaligned concat"

# Resulting DataFrame with one-hot encoded and binned data
df_cleaned.head()


Column: age
Unique values: [52 53 70 61 62 58 55 46 54 71 43 34 51 50 60 67 45 63 42 44 56 57 59 64
 65 41 66 38 49 48 29 37 47 68 76 40 39 77 69 35 74]
Number of unique values: 41

Column: sex
Unique values: [1 0]
Number of unique values: 2

Column: cp
Unique values: [0 1 2 3]
Number of unique values: 4

Column: trestbps
Unique values: [125 140 145 148 138 100 114 160 120 122 112 132 118 128 124 106 104 135
 130 136 180 129 150 178 146 117 152 154 170 134 174 144 108 123 110 142
 126 192 115  94 200 165 102 105 155 172 164 156 101]
Number of unique values: 49

Column: chol
Unique values: [212 203 174 294 248 318 289 249 286 149 341 210 298 204 308 266 244 211
 185 223 208 252 209 307 233 319 256 327 169 131 269 196 231 213 271 263
 229 360 258 330 342 226 228 278 230 283 241 175 188 217 193 245 232 299
 288 197 315 215 164 326 207 177 257 255 187 201 220 268 267 236 303 282
 126 309 186 275 281 206 335 218 254 295 417 260 240 302 192 225 325 235
 274 234 182 167 172 321 300 199 564 15

Unnamed: 0,sex,fbs,exang,target,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,...,ca_4,thal_0,thal_1,thal_2,thal_3,age_bin,trestbps_bin,chol_bin,thalach_bin,oldpeak_bin
0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,1,1,0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
2,1,0,1,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,1,0,0,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
4,0,1,0,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
# Summarise the transformed frame: for every column show its distinct values
# and how many there are (each should now be a two-valued 0/1 feature).
for column, series in df_cleaned.items():
    unique_values = series.unique()
    print(
        f"Column: {column}",
        f"Unique values: {unique_values}",
        f"Number of unique values: {len(unique_values)}\n",
        sep="\n",
    )

Column: sex
Unique values: [1 0]
Number of unique values: 2

Column: fbs
Unique values: [0 1]
Number of unique values: 2

Column: exang
Unique values: [0 1]
Number of unique values: 2

Column: target
Unique values: [0 1]
Number of unique values: 2

Column: cp_0
Unique values: [1. 0.]
Number of unique values: 2

Column: cp_1
Unique values: [0. 1.]
Number of unique values: 2

Column: cp_2
Unique values: [0. 1.]
Number of unique values: 2

Column: cp_3
Unique values: [0. 1.]
Number of unique values: 2

Column: restecg_0
Unique values: [0. 1.]
Number of unique values: 2

Column: restecg_1
Unique values: [1. 0.]
Number of unique values: 2

Column: restecg_2
Unique values: [0. 1.]
Number of unique values: 2

Column: slope_0
Unique values: [0. 1.]
Number of unique values: 2

Column: slope_1
Unique values: [0. 1.]
Number of unique values: 2

Column: slope_2
Unique values: [1. 0.]
Number of unique values: 2

Column: ca_0
Unique values: [0. 1.]
Number of unique values: 2

Column: ca_1
Unique val


# How the binarisation was achieved.

We binarize the dataset in two steps. First, we apply One-Hot Encoding to the categorical variables, which creates one binary column for each category. Second, we use the **KBinsDiscretizer** to transform the continuous variables (such as age, blood pressure, and cholesterol) into discrete bins. With the `uniform` strategy, the **KBinsDiscretizer** divides each continuous feature into a specified number of equal-width intervals, based on the observed range (minimum to maximum) of the data. For example, if a feature ranges from 0 to 200 and we specify 2 bins, the discretizer creates two bins: one for values from 0 up to (but not including) 100, and the other for values from 100 to 200. It then replaces each value with the ordinal index of the bin it falls into (0 for the first bin, 1 for the second, etc.), effectively transforming the continuous data into a categorical representation. Because we use 2 bins here, every continuous feature becomes a binary 0/1 column. This approach converts continuous variables into a fixed number of categories, making them more suitable for certain types of machine learning models. The original categorical and continuous columns are then dropped from the cleaned dataset and replaced by the encoded and binned columns.