In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier

In [2]:
# Creating the baseline model: load the data, then build a stratified
# train/test split that every model below is trained and evaluated on.
df = pd.read_csv("novagen_dataset.csv")

# Missing values per column. A bare expression in the middle of a cell is
# evaluated and then discarded, so the counts are printed explicitly.
print(df.isnull().sum())

# Split features and target
X = df.drop("Target", axis=1)
y = df["Target"]

# What is stratify in train_test_split?
# stratify=y means: split the data so that the class proportions in the
# train and test sets stay the same as in the original dataset.
# Example - if the dataset has:
#   70% Class 0
#   30% Class 1
# then after splitting:
#   Train set -> 70% Class 0, 30% Class 1
#   Test set  -> 70% Class 0, 30% Class 1
# So the split itself does not introduce any class imbalance.

# Train/test split: 20% held out for testing, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [3]:
#Algorithm Wise Rule:

# | Algorithm           | Scale X? | Scale y? |
# | ------------------- | -------- | -------- |
# | Logistic Regression | ✅ Yes    | ❌ No  |
# | KNN                 | ✅ Yes    | ❌ No  |
# | SVM                 | ✅ Yes    | ❌ No  |
# | Decision Tree       | ❌ No     | ❌ No  |
# | Random Forest       | ❌ No     | ❌ No  |
# | Linear Regression   | Optional |  ❌ No   |
# | SVR                 | ✅ Yes    | ✅ Yes |


In [4]:
# Feature scaling. KNN is distance based, so features with large ranges would
# otherwise dominate the distance; regularized logistic regression is not
# distance based, but its penalty treats all coefficients alike and therefore
# also benefits from features on a common scale.
# Only X (the input features) is scaled; y holds class labels and is left as is.
# The scaler learns its statistics from the training split only and the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression (with regularization)

In [5]:
# L2-penalized logistic regression, trained on the standardized features.
log_reg = LogisticRegression(penalty="l2", solver="liblinear", max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predicted class labels for the held-out test split.
y_pred_log_r = log_reg.predict(X_test_scaled)

# Model evaluation: recall matters more than accuracy here, because
# missing a high-risk patient is dangerous.
log_reg_accuracy = accuracy_score(y_test, y_pred_log_r)
log_reg_recall = recall_score(y_test, y_pred_log_r)
log_reg_report = classification_report(y_test, y_pred_log_r)

print("Logisitic regression Accuracy:", log_reg_accuracy)
print("Logisitic regression recall:", log_reg_recall)
print("Logisitic regression classification report:\n", log_reg_report)

Logisitic regression Accuracy: 0.8141361256544503
Logisitic regression recall: 0.8283132530120482
Logisitic regression classification report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.80       914
           1       0.82      0.83      0.82       996

    accuracy                           0.81      1910
   macro avg       0.81      0.81      0.81      1910
weighted avg       0.81      0.81      0.81      1910



# Model 2 KNN

In [6]:
# K-nearest neighbours on the standardized features: each test sample is
# classified from its 5 closest training samples.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric="euclidean",  # straight-line distance between samples
)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNN Recall:", recall_score(y_test, y_pred_knn))
# The ":\n" starts the report on its own line; without it the report's header
# row is glued to the label and no longer lines up with the columns below.
print("KNN classification Report:\n", classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.8832460732984293
KNN Recall: 0.8835341365461847
KNN classification Report               precision    recall  f1-score   support

           0       0.87      0.88      0.88       914
           1       0.89      0.88      0.89       996

    accuracy                           0.88      1910
   macro avg       0.88      0.88      0.88      1910
weighted avg       0.88      0.88      0.88      1910



# Random Forest(Ensemble learning Bagging)

In [7]:
# Random forest of 200 trees with no depth limit, seeded for reproducibility.
# It is trained on the unscaled features.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf.fit(X_train, y_train)

# Predicted class labels for the held-out test split.
y_pred_rf = rf.predict(X_test)

# Accuracy, recall, then the full per-class report.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Recall:", rf_recall)
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9382198952879581
Random Forest Recall: 0.9588353413654619
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       914
           1       0.93      0.96      0.94       996

    accuracy                           0.94      1910
   macro avg       0.94      0.94      0.94      1910
weighted avg       0.94      0.94      0.94      1910



# Gradient Boosting (Ensemble Learning: Boosting, Homogeneous)

In [8]:
# Gradient boosting: 150 shallow trees (depth 3) with learning rate 0.1,
# seeded for reproducibility. It is trained on the unscaled features.
gb = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42
)
gb.fit(X_train, y_train)

# Predicted class labels for the held-out test split.
y_pred_gb = gb.predict(X_test)

# Accuracy, recall, then the full per-class report.
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_recall = recall_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Recall:", gb_recall)
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.9303664921465968
Gradient Boosting Recall: 0.9497991967871486
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       914
           1       0.92      0.95      0.93       996

    accuracy                           0.93      1910
   macro avg       0.93      0.93      0.93      1910
weighted avg       0.93      0.93      0.93      1910



# Voting Classifier (Ensemble Learning: Heterogeneous)

In [9]:
# Heterogeneous ensemble: three different model families vote on each sample.
# voting="soft" combines the members' predicted class probabilities rather
# than their hard class labels.
base_estimators = [
    ("lr", LogisticRegression(max_iter=1000, solver="liblinear")),
    ("knn", KNeighborsClassifier(n_neighbors=5)),
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting="soft")

# Every member receives the same standardized feature matrix.
voting_clf.fit(X_train_scaled, y_train)
y_pred_vote = voting_clf.predict(X_test_scaled)

# Accuracy, recall, then the full per-class report.
vote_accuracy = accuracy_score(y_test, y_pred_vote)
vote_recall = recall_score(y_test, y_pred_vote)
print("Voting Classifier Accuracy:", vote_accuracy)
print("Voting Classifier Recall:", vote_recall)
print(classification_report(y_test, y_pred_vote))

Voting Classifier Accuracy: 0.9157068062827225
Voting Classifier Recall: 0.929718875502008
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       914
           1       0.91      0.93      0.92       996

    accuracy                           0.92      1910
   macro avg       0.92      0.92      0.92      1910
weighted avg       0.92      0.92      0.92      1910



## Results

| Model                | Recall |
|----------------------|:------:|
| Logistic Regression  | 82.8%  |
| KNN                  | 88.3%  |
| Random Forest        | 95.8%  |
| Gradient Boosting    | 94.9%  |
| Voting Classifier    | 93.0%  |

### Best classifier for NovaGen (based on recall): Random Forest, which has the highest recall and an accuracy of 93.8%

# Why Recall is More Important Than Accuracy?
# This is the most important part of the project.
#  Medical Context Reality
# Positive class (1) = High-risk patient
# False Negative = Model says “low risk” but patient is actually high-risk 
#  This is dangerous.
