In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv("/content/hypertension_dataset 2.csv")

In [7]:
print("Initial shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

Initial shape: (1985, 11)

First 5 rows:
   age  salt_intake  stress_score       bp_history  sleep_duration   bmi  \
0   69          8.0             9           Normal             6.4  25.8   
1   32         11.7            10           Normal             5.4  23.4   
2   78          9.5             3           Normal             7.1  18.7   
3   38         10.0            10     Hypertension             4.2  22.1   
4   41          9.8             1  Prehypertension             5.8  16.2   

      medication family_history exercise_level smoking_status has_hypertension  
0            NaN            Yes            Low     Non-Smoker              Yes  
1            NaN             No            Low     Non-Smoker               No  
2            NaN             No       Moderate     Non-Smoker               No  
3  ACE Inhibitor             No            Low     Non-Smoker              Yes  
4          Other             No       Moderate     Non-Smoker               No  


In [8]:
df.columns = (
    df.columns.str.strip()
             .str.lower()
             .str.replace(" ", "_")
             .str.replace("-", "_")
)

print("\nCleaned columns:", df.columns.tolist())


Cleaned columns: ['age', 'salt_intake', 'stress_score', 'bp_history', 'sleep_duration', 'bmi', 'medication', 'family_history', 'exercise_level', 'smoking_status', 'has_hypertension']


In [9]:
print("\nMissing values per column:\n", df.isnull().sum())


Missing values per column:
 age                   0
salt_intake           0
stress_score          0
bp_history            0
sleep_duration        0
bmi                   0
medication          799
family_history        0
exercise_level        0
smoking_status        0
has_hypertension      0
dtype: int64


In [10]:
for col in df.columns:
    if df[col].dtype in ["int64", "float64"]:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

print("\nMissing values after cleaning:\n", df.isnull().sum())


Missing values after cleaning:
 age                 0
salt_intake         0
stress_score        0
bp_history          0
sleep_duration      0
bmi                 0
medication          0
family_history      0
exercise_level      0
smoking_status      0
has_hypertension    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [11]:
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
print("\nCategorical columns:", categorical_cols.tolist())


Categorical columns: ['bp_history', 'medication', 'family_history', 'exercise_level', 'smoking_status', 'has_hypertension']


In [12]:
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [13]:
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns

In [14]:
for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df[col] = np.where(df[col] < lower, lower,
              np.where(df[col] > upper, upper, df[col]))

In [15]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\nFinal shape:", df.shape)
print("\nDataset sample after cleaning:\n", df.head())


Final shape: (1985, 11)

Dataset sample after cleaning:
         age  salt_intake  stress_score  bp_history  sleep_duration       bmi  \
0  0.959963    -0.267932      1.279847   -0.033215       -0.034347 -0.047975   
1 -0.943609     1.601295      1.598165   -0.033215       -0.685527 -0.583232   
2  1.422994     0.489863     -0.630061   -0.033215        0.421479 -1.631443   
3 -0.634921     0.742461      1.598165   -1.326008       -1.466942 -0.873163   
4 -0.480578     0.641422     -1.266698    1.259577       -0.425055 -2.189002   

   medication  family_history  exercise_level  smoking_status  \
0         0.0        1.007585       -0.188282       -0.633125   
1         0.0       -0.992472       -0.188282       -0.633125   
2         0.0       -0.992472        1.211490       -0.633125   
3         0.0       -0.992472       -0.188282       -0.633125   
4         0.0       -0.992472        1.211490       -0.633125   

   has_hypertension  
0          0.960963  
1         -1.040623  
2   