## **Preprocessing the dataset**

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('blood_cancer_dataset.csv')

In [3]:
df.drop(['Patient_ID', 'Date_of_Test'], axis=1, inplace=True)

In [4]:
df.drop(['Cancer_Type', 'Stage'], axis=1, inplace=True)

In [5]:
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

In [6]:
df['Diagnosis_Result'] = df['Diagnosis_Result'].map({'Negative': 0, 'Positive': 1})

In [7]:
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
symptom_features = vectorizer.fit_transform(df['Symptoms'])

In [8]:
symptom_df = pd.DataFrame(symptom_features.toarray(), columns=vectorizer.get_feature_names())
df = pd.concat([df.drop('Symptoms', axis=1), symptom_df], axis=1)

In [9]:
numeric_cols = ['Age', 'White_Blood_Cell_Count', 'Red_Blood_Cell_Count', 'Hemoglobin_Level', 'Platelet_Count']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [10]:
print("Preprocessed dataset shape:", df.shape)
print(df.head())

Preprocessed dataset shape: (5000, 21)
        Age  Gender  White_Blood_Cell_Count  Red_Blood_Cell_Count  \
0  0.467172       1               -0.764495             -0.259511   
1  0.004473       1               -0.352799              1.495621   
2 -0.998042       0               -0.793786             -0.002215   
3  1.508244       2                0.451880             -1.132483   
4 -0.226877       2               -0.614787             -0.204376   

   Hemoglobin_Level  Platelet_Count  Diagnosis_Result  abdominal pain  \
0         -0.417811       -0.519299                 0               0   
1          0.959971       -0.355628                 0               0   
2          0.871181       -0.569135                 0               0   
3         -1.703741        1.417057                 1               0   
4          0.592563       -0.995465                 0               1   

   bleeding gums  bone pain  ...  fatigue  fever  frequent infections  \
0              0          0  ...  

## **Modeling data**

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

In [12]:
X = df.drop('Diagnosis_Result', axis=1)
y = df['Diagnosis_Result']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
kmeans = KMeans(n_clusters=3, random_state=42)
train_clusters = kmeans.fit_predict(X_train_scaled).reshape(-1, 1)
test_clusters = kmeans.predict(X_test_scaled).reshape(-1, 1)

In [16]:
X_train_final = np.hstack((X_train_scaled, train_clusters))
X_test_final = np.hstack((X_test_scaled, test_clusters))

In [17]:
# Define models and hyperparameter grids
param_grids = {
    'lr': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'max_iter': [1000]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    },
    'knn': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean']
    }
}

In [18]:
lr = LogisticRegression(random_state=42)
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier()

In [19]:
def grid_search_fit(model, params, X, y):
    grid = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X, y)
    print(f"Best params for {model.__class__.__name__}: {grid.best_params_}")
    print(f"Best CV accuracy: {grid.best_score_ * 100:.2f}%")
    return grid.best_estimator_

In [20]:
print("Tuning Logistic Regression...")
best_lr = grid_search_fit(lr, param_grids['lr'], X_train_final, y_train)

Tuning Logistic Regression...
Best params for LogisticRegression: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV accuracy: 100.00%


In [21]:
print("\nTuning SVM...")
best_svm = grid_search_fit(svm, param_grids['svm'], X_train_final, y_train)


Tuning SVM...
Best params for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best CV accuracy: 100.00%


In [22]:
print("\nTuning KNN...")
best_knn = grid_search_fit(knn, param_grids['knn'], X_train_final, y_train)


Tuning KNN...
Best params for KNeighborsClassifier: {'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'distance'}
Best CV accuracy: 99.42%


In [23]:
ensemble = VotingClassifier(
    estimators=[('lr', best_lr), ('svm', best_svm), ('knn', best_knn)],
    voting='soft'
)

In [24]:
ensemble.fit(X_train_final, y_train)
y_pred_ensemble = ensemble.predict(X_test_final)

In [25]:
acc = accuracy_score(y_test, y_pred_ensemble)

In [26]:
print(f"\nEnsemble Test Accuracy: {acc * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble))


Ensemble Test Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       250

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

