In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# Load dataset
df = pd.read_csv("dataset_med.csv")

In [3]:
# Convert dates into numeric duration
df["diagnosis_date"] = pd.to_datetime(df["diagnosis_date"])
df["end_treatment_date"] = pd.to_datetime(df["end_treatment_date"])
df["treatment_duration"] = (df["end_treatment_date"] - df["diagnosis_date"]).dt.days
df = df.drop(["diagnosis_date", "end_treatment_date"], axis=1)
print("Data loaded and treatment duration calculated.")

Data loaded and treatment duration calculated.


In [4]:
# Features and target
X = df.drop(["survived"], axis=1)
y = df["survived"]

In [5]:
# Identify categorical & numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

In [6]:
# Preprocessing: impute + one-hot encode
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)
X_preprocessed = preprocessor.fit_transform(X)
print("Data preprocessed and features transformed.")

Data preprocessed and features transformed.


In [7]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42
)

In [8]:
print("Training the RandomForest model...")

Training the RandomForest model...


In [9]:
model = RandomForestClassifier(
    class_weight="balanced", 
    random_state=42,
    n_estimators=150,    # INCREASED from 30
    max_depth=10,        # INCREASED from 5
    n_jobs=-1
)

In [10]:
# Train on the original (unresampled) training set
model.fit(X_train, y_train)
print("Training complete.")

Training complete.


In [11]:
# Prediction on test set
y_pred = model.predict(X_test)
print("\n--- Model Evaluation --")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


--- Model Evaluation --
Accuracy: 0.5459887640449438
              precision    recall  f1-score   support

           0       0.78      0.58      0.67    138639
           1       0.22      0.43      0.29     39361

    accuracy                           0.55    178000
   macro avg       0.50      0.50      0.48    178000
weighted avg       0.66      0.55      0.58    178000



In [12]:
# Predict new patient
new_patient = pd.DataFrame([{
    "id": 22474,
    "age": 65,
    "asthma":0,
    "cancer_stage": "Stage I",
    "gender": "Male",
    "country": "Sweden",
    "stage": "Stage I",
    "family_history": "Yes",
    "smoking_status": "Passive Smoker",
    "bmi": 29.4,
    "cholesterol_level": 199,
    "bp_high": 0,
    "bp_low": 0,
    "diabetes": 1,
    "cirrhosis": 1,
    "other_cancer":1,
    "hypertension": 0,
    "treatment_type": "Chemotherapy",
    "diagnosis_date": "2016-04-05",
    "end_treatment_date": "2017-09-10"
}])

In [13]:
# Process new patient dates
new_patient["diagnosis_date"] = pd.to_datetime(new_patient["diagnosis_date"])
new_patient["end_treatment_date"] = pd.to_datetime(new_patient["end_treatment_date"])
new_patient["treatment_duration"] = (new_patient["end_treatment_date"] - new_patient["diagnosis_date"]).dt.days
new_patient = new_patient.drop(["diagnosis_date", "end_treatment_date"], axis=1)

In [14]:
# Transform new patient data using the preprocessor
new_patient_preprocessed = preprocessor.transform(new_patient)

In [15]:
# Predict survival
pred = model.predict(new_patient_preprocessed)
print("\n--- New Patient Prediction ---")
print("Prediction:", "Survived" if pred[0] == 0 else "Not Survived")


--- New Patient Prediction ---
Prediction: Survived
