In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [39]:
data = {
    'age': [45, 50, 30, 40, 60, 25, 35, np.nan, 50, 65],
    'gender': ['M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M'],
    'symptom_1': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],  # 1 for present, 0 for absent
    'symptom_2': [0, 1, 1, 1, 0, 1, 0, 1, 0, 1],
    'medical_test_result': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],  # 1 for positive, 0 for negative
    'lifestyle_factor': [1, 1, 0, 0, 1, 1, 0, 0, 1, 0],  # 1 for healthy, 0 for unhealthy
    'disease': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 for disease, 0 for no disease
}

In [41]:
df = pd.DataFrame(data)

In [43]:
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

In [45]:
df['gender'] = df['gender'].map({'M': 0, 'F': 1})

In [47]:
X = df.drop('disease', axis=1)
y = df['disease']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [53]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

In [55]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

In [57]:
svm = SVC()
svm.fit(X_train_scaled, y_train)

In [59]:
log_reg_pred = log_reg.predict(X_test_scaled)
decision_tree_pred = decision_tree.predict(X_test)
svm_pred = svm.predict(X_test_scaled)

In [61]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print("\n")

In [63]:
evaluate_model(y_test, log_reg_pred, "Logistic Regression")
evaluate_model(y_test, decision_tree_pred, "Decision Tree")
evaluate_model(y_test, svm_pred, "SVM")

Evaluation for Logistic Regression:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


Evaluation for Decision Tree:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


Evaluation for SVM:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000




In [65]:
log_reg_balanced = LogisticRegression(class_weight='balanced')
log_reg_balanced.fit(X_train_scaled, y_train)

In [67]:
log_reg_balanced_pred = log_reg_balanced.predict(X_test_scaled)
evaluate_model(y_test, log_reg_balanced_pred, "Logistic Regression (Balanced)")

Evaluation for Logistic Regression (Balanced):
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000




In [71]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(log_reg, X, y, cv=5).mean()
print(f"Cross-validation score for Logistic Regression: {cv_score:.4f}")

Cross-validation score for Logistic Regression: 0.8000
