In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from lifelines import CoxPHFitter

In [2]:
# Load data
raw_train = pd.read_csv("../data/train.csv", index_col="ID")
raw_test = pd.read_csv("../data/test.csv", index_col="ID")

# Separate target columns from training data and cast event_os as int
raw_targets = raw_train[["event_os", "interval_os"]].astype({"event_os": int})
raw_train.drop(["event_os", "interval_os"], axis=1, inplace=True)

In [3]:
# Create training and testing datasets from the raw training data
X_train, X_test, Y_train, Y_test = train_test_split(raw_train, raw_targets)

In [4]:
cat_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
num_cols = [col for col in X_train.columns if X_train[col].dtype in ("int64", "float64")]

In [5]:
num_transformer = SimpleImputer(strategy="mean")
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

In [6]:
fitted = preprocessor.fit(X_train)
transformed = fitted.transform(X_train)
results = pd.DataFrame(transformed, columns=fitted.get_feature_names_out(), index=Y_train.index)

In [7]:
# results = pd.DataFrame(preprocessor.fit_transform(X_train), index=Y_train.index)
cph = CoxPHFitter()
# results.index = Y_train.index
cph.fit(pd.concat([results, Y_train], axis=1), duration_col="interval_os", event_col="event_os")


>>> events = df['event_os'].astype(bool)
>>> print(df.loc[events, 'cat__donor_abo_type_AB'].var())
>>> print(df.loc[~events, 'cat__donor_abo_type_AB'].var())

A very low variance means that the column cat__donor_abo_type_AB completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.