In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
df = pd.read_csv("dataset_med.csv")

In [3]:
df.head(7)

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64,Male,Sweden,4/5/2016,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,9/10/2017,0
1,2,50,Female,Netherlands,4/20/2023,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,6/17/2024,1
2,3,65,Female,Hungary,4/5/2023,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,4/9/2024,0
3,4,51,Female,Belgium,2/5/2016,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,4/23/2017,0
4,5,37,Male,Luxembourg,11/29/2023,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,1/8/2025,0
5,6,50,Male,Italy,1/2/2023,Stage I,No,Never Smoked,37.6,274,1,0,0,0,Radiation,12/27/2024,0
6,7,49,Female,Croatia,5/21/2018,Stage III,Yes,Passive Smoker,43.1,259,0,0,0,0,Radiation,5/6/2019,1


In [4]:
# Convert date columns to datetime
df["diagnosis_date"] = pd.to_datetime(df["diagnosis_date"], errors='coerce')
df["end_treatment_date"] = pd.to_datetime(df["end_treatment_date"], errors='coerce')

In [5]:
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [6]:
# Drop unnecessary columns
df.drop(columns=["id", "country", "diagnosis_date", "end_treatment_date"], inplace=True)

In [7]:
# Encode categorical variables
label_encoders = {}
categorical_cols = ["gender", "cancer_stage", "family_history", "smoking_status", "treatment_type"]

In [8]:
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [9]:
# Convert integer columns to smaller types
int_cols = ["age", "hypertension", "asthma", "cirrhosis", "other_cancer", "survived"]
df[int_cols] = df[int_cols].astype("int8")

In [10]:
df[categorical_cols] = df[categorical_cols].astype("int8")

In [11]:
# Convert float columns to float32
scale_cols = ["bmi", "cholesterol_level"]
df[scale_cols] = df[scale_cols].astype("float32")

In [12]:
# Standardize numerical features
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

In [13]:
# # Split data into training and testing sets
# X = df.drop(columns=["survived"])
# y = df["survived"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)


X = df.drop(columns=["survived"])
y = df["survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [14]:
# Train Logistic Regression model
log_model = LogisticRegression()
log_model.fit(X_train_balanced, y_train_balanced)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Predict and evaluate Logistic Regression
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

Logistic Regression Accuracy: 0.7797696629213483
              precision    recall  f1-score   support

           0       0.78      1.00      0.88    138799
           1       0.00      0.00      0.00     39201

    accuracy                           0.78    178000
   macro avg       0.39      0.50      0.44    178000
weighted avg       0.61      0.78      0.68    178000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Train Neural Network model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_balanced.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_balanced, y_train_balanced, epochs=2, batch_size=32, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/2
[1m26025/26025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3ms/step - accuracy: 0.6638 - loss: 0.6441 
Epoch 2/2
[1m26025/26025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3ms/step - accuracy: 0.6659 - loss: 0.6358 


<keras.src.callbacks.history.History at 0x287a93f1390>

In [17]:
# Evaluate Neural Network
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype("int32")
print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print(classification_report(y_test, y_pred_nn))

[1m5563/5563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step
Neural Network Accuracy: 0.7797696629213483
              precision    recall  f1-score   support

           0       0.78      1.00      0.88    138799
           1       0.00      0.00      0.00     39201

    accuracy                           0.78    178000
   macro avg       0.39      0.50      0.44    178000
weighted avg       0.61      0.78      0.68    178000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Save models and scaler
joblib.dump(log_model, "logistic_model.pkl")
nn_model.save("neural_network_model.h5")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")



['label_encoders.pkl']

In [19]:
from tensorflow.keras.models import load_model

In [20]:
def predict_survival(input_data, model_type='logistic', threshold=0.3):  # Lowered threshold
    # Load models and encoders
    scaler = joblib.load("scaler.pkl")
    label_encoders = joblib.load("label_encoders.pkl")
    
    if model_type == 'logistic':
        model = joblib.load("logistic_model.pkl")
    elif model_type == 'neural':
        model = load_model("neural_network_model.h5")
    else:
        raise ValueError("Invalid model type. Choose 'logistic' or 'neural'")
    
    # Default values for missing features
    default_values = {col: X_train[col].mode()[0] for col in X_train.columns}
    
    # Fill missing values with defaults
    for key in default_values:
        if key not in input_data:
            input_data[key] = default_values[key]
    
    # Convert input to DataFrame with correct order
    input_df = pd.DataFrame([input_data])[X_train.columns]
    
    # Encode categorical values safely
    for col in label_encoders:
        if col in input_df:
            if input_df[col][0] in label_encoders[col].classes_:
                input_df[col] = label_encoders[col].transform([input_df[col][0]])
            else:
                input_df[col] = label_encoders[col].transform([label_encoders[col].classes_[0]])  # Assign most common value
    
    # Standardize numerical values
    input_df[scale_cols] = scaler.transform(input_df[scale_cols])
    
    # Make prediction
    if model_type == 'logistic':
        prediction = model.predict_proba(input_df)[:, 1]  # Use probability instead of direct prediction
    else:
        prediction = model.predict(input_df).flatten()  # Convert to 1D array
    
    return "Survived" if prediction[0] > threshold else "Did not survive"


In [26]:
input_data = {
    "age": 80,  # Older age increases risk
    "gender": 1,  # Male (typically higher risk)
    "cancer_stage": 0,  # Advanced cancer stage
    "family_history": 0,  # Family history of cancer
    "smoking_status": 0,  # Heavy smoker
    "bmi": 35.0,  # Obese BMI (higher risk)
    "cholesterol_level": 180,  # High cholesterol
    "hypertension": 1,  # Has high blood pressure
    "asthma": 0,  # Has asthma
    "cirrhosis": 0,  # Has liver disease
    "other_cancer": 0,  # Has other types of cancer
    "treatment_type": 0  # No treatment
}

prediction = predict_survival(input_data, model_type='logistic')  # or 'neural'
print("Prediction:", prediction)


Prediction: Survived
