In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
df = pd.read_csv('/Data_cardiovascular_risk.csv')
print(df.head())  # show the first few rows
print(df.info())  # show dataset information
print(df.describe())  # show summary statistics

   id  age  education sex is_smoking  cigsPerDay  BPMeds  prevalentStroke  \
0   0   64        2.0   F        YES         3.0     0.0                0   
1   1   36        4.0   M         NO         0.0     0.0                0   
2   2   46        1.0   F        YES        10.0     0.0                0   
3   3   50        1.0   M        YES        20.0     0.0                0   
4   4   64        1.0   F        YES        30.0     0.0                0   

   prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  glucose  \
0             0         0    221.0  148.0   85.0    NaN       90.0     80.0   
1             1         0    212.0  168.0   98.0  29.77       72.0     75.0   
2             0         0    250.0  116.0   71.0  20.35       88.0     94.0   
3             1         0    233.0  158.0   88.0  28.26       68.0     94.0   
4             0         0    241.0  136.5   85.0  26.42       70.0     77.0   

   TenYearCHD  
0           1  
1           0  
2           0 

In [None]:
# Handle missing values
for col in df.columns:
  if df[col].dtype in ['int64', 'float64']:  # Check if column is numeric
      df[col].fillna(df[col].mean(), inplace=True)  # Fill missing values with mean

  else:
      df[col].fillna(df[col].mode()[0], inplace=True)  # Fill missing values with mode

      print(df.isnull().sum())

id                 0
age                0
education          0
sex                0
is_smoking         0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64
id                 0
age                0
education          0
sex                0
is_smoking         0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64


In [None]:
categorical_cols = ['sex', 'is_smoking', 'prevalentStroke', 'prevalentHyp', 'diabetes']  # Correct column names
df = pd.get_dummies(df, columns=categorical_cols)
print(df.head())

   id  age  education  cigsPerDay  BPMeds  totChol  sysBP  diaBP        BMI  \
0   0   64        2.0         3.0     0.0    221.0  148.0   85.0  25.794964   
1   1   36        4.0         0.0     0.0    212.0  168.0   98.0  29.770000   
2   2   46        1.0        10.0     0.0    250.0  116.0   71.0  20.350000   
3   3   50        1.0        20.0     0.0    233.0  158.0   88.0  28.260000   
4   4   64        1.0        30.0     0.0    241.0  136.5   85.0  26.420000   

   heartRate  ...  sex_F  sex_M  is_smoking_NO  is_smoking_YES  \
0       90.0  ...   True  False          False            True   
1       72.0  ...  False   True           True           False   
2       88.0  ...   True  False          False            True   
3       68.0  ...  False   True          False            True   
4       70.0  ...   True  False          False            True   

   prevalentStroke_0  prevalentStroke_1  prevalentHyp_0  prevalentHyp_1  \
0               True              False            Tr

In [None]:
# Correct the column names based on the output of df.head() after one-hot encoding
scaler = StandardScaler()
# Adjust the column names below to match the actual names in your DataFrame
df[['age', 'education', 'cigsPerDay', 'BPMeds', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']] = scaler.fit_transform(df[['age', 'education', 'cigsPerDay', 'BPMeds', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']])
print(df.head())

   id       age  education  cigsPerDay   BPMeds   totChol     sysBP     diaBP  \
0   0  1.682783   0.028898   -0.512681 -0.17667 -0.357314  0.690879  0.176093   
1   1 -1.576210   2.017431   -0.766087 -0.17667 -0.557375  1.588193  1.257462   
2   2 -0.412284  -0.965369    0.078600 -0.17667  0.287325 -0.744824 -0.988457   
3   3  0.053287  -0.965369    0.923287 -0.17667 -0.090567  1.139536  0.425640   
4   4  1.682783  -0.965369    1.767973 -0.17667  0.087264  0.174923  0.176093   

            BMI  heartRate  ...  sex_F  sex_M  is_smoking_NO  is_smoking_YES  \
0 -8.651790e-16   1.171652  ...   True  False          False            True   
1  9.680255e-01  -0.332317  ...  False   True           True           False   
2 -1.325992e+00   1.004544  ...   True  False          False            True   
3  6.003008e-01  -0.666532  ...  False   True          False            True   
4  1.522126e-01  -0.499424  ...   True  False          False            True   

   prevalentStroke_0  prevalentS

In [None]:
X = df.drop('TenYearCHD', axis=1)  # features
y = df['TenYearCHD']  # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # split data into training and testing sets


In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8687315634218289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8687315634218289
Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       581
           1       0.83      0.10      0.18        97

    accuracy                           0.87       678
   macro avg       0.85      0.55      0.56       678
weighted avg       0.86      0.87      0.82       678

Confusion Matrix:
[[579   2]
 [ 87  10]]


In [None]:
# Correct the column names in the new_patient DataFrame
new_patient = pd.DataFrame({
    'age': [45], 'education': [12], 'sex': ['male'], 'is_smoking': [1],
    'cigsPerDay': [10], 'BPMeds': [1], 'prevalentStroke': [0],
    'prevalentHyp': [1], 'diabetes': [0], 'totChol': [200],
    'sysBP': [120], 'diaBP': [80], 'BMI': [25], 'heartRate': [70],
    'glucose': [100]
                    })

# One-hot encode the categorical variables in the new patient DataFrame
new_patient = pd.get_dummies(new_patient, columns=['sex', 'is_smoking']) # One-hot encode categorical features

# Ensure all columns used during training are present in the new patient data
for col in X_train.columns:
     if col not in new_patient.columns:
            new_patient[col] = 0  # Add missing columns and fill with 0

# Reorder columns to match the training data
new_patient = new_patient[X_train.columns]

# Fit the scaler to the TRAINING data and then transform BOTH training and new data
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train) # Fit and transform the training data
new_patient_transformed = scaler.transform(new_patient) # Transform the new patient data using the fitted scaler

# Retrain the model with the scaled training data
og_reg = LogisticRegression()
log_reg.fit(X_train_transformed, y_train)

# Make the prediction
prediction = log_reg.predict(new_patient_transformed)
print("Predicted risk of heart disease:", prediction[0])

Predicted risk of heart disease: 1
