In [7]:
!pip install pandas scikit-learn xgboost lightgbm catboost imbalanced-learn --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

**Loading the dataset**

In [9]:
# Read the patient records from a CSV file located relative to the current
# working directory; every later cell works on (and reassigns) `df`.
df = pd.read_csv("eye_cancer_patients.csv")

In [10]:
# Row-level cleaning: remove exact duplicate records, then discard any record
# that has no Outcome_Status label (such rows cannot be used for training).
df = (
    df
    .drop_duplicates()
    .dropna(subset=['Outcome_Status'])
)

In [11]:
# Give every column a consistent name: trim surrounding whitespace and
# replace inner spaces with underscores.
tidy_names = [column.strip().replace(" ", "_") for column in df.columns]
df.columns = tidy_names

In [12]:
# Derive calendar features from the diagnosis date. Values that cannot be
# parsed become NaT (errors='coerce'), so their year/month come out missing.
diagnosis_dates = pd.to_datetime(df['Date_of_Diagnosis'], errors='coerce')

# Append the two derived columns, then drop the identifier and the raw date,
# neither of which is used as a model feature.
df = (
    df
    .assign(
        Diagnosis_Year=diagnosis_dates.dt.year,
        Diagnosis_Month=diagnosis_dates.dt.month,
    )
    .drop(columns=['Patient_ID', 'Date_of_Diagnosis'])
)

In [19]:
# Defining features and target.
# X holds every column except the label.
X = df.drop(columns='Outcome_Status')

# y is the label encoded as integer class codes. The labels are cast to str
# before encoding, and the fitted encoder `le` is kept so the codes can be
# mapped back to the original label text (le.classes_ / le.inverse_transform).
# LabelEncoder is imported in the import cell at the top of the notebook.
le = LabelEncoder()
y = le.fit_transform(df['Outcome_Status'].astype(str))

In [20]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the same class proportions. The split happens before encoding
# and SMOTE, so both are fitted on training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [21]:
# Identifying columns for transformation.
# Columns to be one-hot encoded:
categorical_cols = [
    'Gender',
    'Cancer_Type',
    'Laterality',
    'Stage_at_Diagnosis',
    'Treatment_Type',
    'Surgery_Status',
    'Radiation_Therapy',
    'Chemotherapy',
    'Genetic_Markers',
    'Family_History',
    'Country',
]

# Columns treated as numeric.
# NOTE(review): this list is not referenced by the cells visible here; the
# preprocessor passes through every non-categorical column instead.
numerical_cols = [
    'Age',
    'Survival_Time_Months',
    'Diagnosis_Year',
    'Diagnosis_Month',
]

In [22]:
# One-hot encode the categorical columns and pass every other column through
# unchanged. Categories first seen at transform time encode as all zeros
# (handle_unknown='ignore'); the encoder output is a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_cols,
        ),
    ],
    remainder='passthrough',
)

# Fit the encoder on the training split only, then reuse the fitted encoder
# on the test split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Handle class imbalance by oversampling the training split only; the test
# split is left as-is.
# NOTE(review): SMOTE is applied to the one-hot encoded matrix here, so
# synthetic rows may presumably carry fractional indicator values — confirm
# this is acceptable for the analysis.
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(
    X_train_encoded,
    y_train,
)

In [23]:
# Candidate models, keyed by the display name used when reporting results.
# The estimators are only constructed here; fitting happens in the
# evaluation cell.
models = {
    # `use_label_encoder` is no longer accepted by XGBoost (it only triggers a
    # "Parameters ... are not used" warning), so it is not passed.
    'XGBoost': XGBClassifier(eval_metric='mlogloss'),
    # verbose=-1 silences LightGBM's per-fit info log, matching CatBoost below.
    'LightGBM': LGBMClassifier(verbose=-1),
    'CatBoost': CatBoostClassifier(verbose=0)
}

In [24]:
# Evaluate each model: fit on the SMOTE-balanced training data, predict on the
# encoded test split, then print accuracy, the per-class report and the
# confusion matrix in that order.
metric_reports = (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
)

for name, model in models.items():
    print(f"\n🧠 Model: {name}")
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test_encoded)
    for heading, metric in metric_reports:
        print(heading, metric(y_test, y_pred))


🧠 Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.331
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.29      0.31       323
           1       0.34      0.35      0.34       342
           2       0.33      0.35      0.34       335

    accuracy                           0.33      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.33      0.33      0.33      1000

Confusion Matrix:
 [[ 94 117 112]
 [ 93 119 130]
 [100 117 118]]

🧠 Model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1011
[LightGBM] [Info] Number of data points in the train set: 4104, number of used features: 128
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training f



Accuracy: 0.323
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.31      0.32       323
           1       0.32      0.33      0.33       342
           2       0.32      0.33      0.32       335

    accuracy                           0.32      1000
   macro avg       0.32      0.32      0.32      1000
weighted avg       0.32      0.32      0.32      1000

Confusion Matrix:
 [[101 117 105]
 [100 113 129]
 [105 121 109]]

🧠 Model: CatBoost
Accuracy: 0.334
Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.34      0.35       323
           1       0.33      0.32      0.32       342
           2       0.32      0.34      0.33       335

    accuracy                           0.33      1000
   macro avg       0.34      0.33      0.33      1000
weighted avg       0.33      0.33      0.33      1000

Confusion Matrix:
 [[109 106 108]
 [ 94 110 138]
 [100 120 115]]
