In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# End-to-end script: load the UCI heart-disease CSV, impute and encode the
# features, train a random forest on a binary "disease / no disease" target,
# report hold-out metrics, and score one example patient.

# Load the dataset
data = pd.read_csv('heart_disease_uci.csv')  # Update with the correct path to your dataset

# Display the first few rows of the dataset
print(data.head())

# Check for missing values in the entire dataset
print(data.isna().sum())

# Separate features and target variable.
# 'id' and 'dataset' are dropped together with the target so that only the
# remaining columns are used as model inputs.
X = data.drop(['num', 'id', 'dataset'], axis=1)  # Features
# Binary target: 1 when 'num' is greater than 0, otherwise 0.
y = (data['num'] > 0).astype(int)

# Check for missing values in X and y
print(X.isna().sum())
print(y.isna().sum())

# Handle missing values in X
# NOTE(review): the imputation statistics below are computed on the full
# dataset, i.e. before the train/test split, so the test rows influence them.
# Consider fitting them on the training rows only (e.g. with a Pipeline).

# For numerical columns, use mean imputation.
# 'number' selects every numeric dtype, not only float64/int64.
numerical_columns = X.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='mean')
X[numerical_columns] = imputer.fit_transform(X[numerical_columns])

# For categorical columns, use most frequent value imputation
categorical_columns = X.select_dtypes(include=['object']).columns
X[categorical_columns] = X[categorical_columns].fillna(X[categorical_columns].mode().iloc[0])

# Verify no missing values are left
print(X.isna().sum())

# Convert categorical columns to numerical values.
# The fitted encoders are kept per column so that new samples can be encoded
# with exactly the same label -> integer mapping used for training.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Check for missing values in X and y after imputation
print(X.isna().sum())
print(y.isna().sum())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict risk level for a new patient based on sample data.
# The patient is described with the dataset's own column names and raw
# category labels, then encoded with the fitted encoders. Hand-written integer
# codes are avoided because LabelEncoder numbers the classes in sorted order,
# which does not have to match any externally defined coding.
# NOTE(review): the labels below are a reading of the previous integer example
# (cp=2, restecg=2, slope=2, thal=3) as classic UCI codes - confirm that this
# is the patient that was intended.
new_patient_raw = {
    'age': 70,
    'sex': 'Male',
    'cp': 'atypical angina',
    'trestbps': 160,
    'chol': 300,
    'fbs': True,
    'restecg': 'lv hypertrophy',
    'thalch': 130,
    'exang': True,
    'oldpeak': 3.5,
    'slope': 'flat',
    'ca': 2,
    'thal': 'normal',
}

# Encode only the columns that were label-encoded during training; an unknown
# label makes LabelEncoder.transform raise instead of silently mispredicting.
new_patient_encoded = {
    col: (
        label_encoders[col].transform(pd.Series([value], dtype=object))[0]
        if col in label_encoders
        else value
    )
    for col, value in new_patient_raw.items()
}

# One-row frame in the training column order, so the model receives the
# feature names it was fitted with (a missing column raises KeyError here).
new_patient = pd.DataFrame([new_patient_encoded])[X.columns.tolist()]
predicted_risk = model.predict(new_patient)
print("Predicted Risk Level:", "Risk" if predicted_risk[0] == 1 else "No Risk")


   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  
id      

