In [1]:
import pandas as pd

In [2]:
# Small four-row example frame; `stage` is the categorical column that the
# one-hot encoding cells below operate on.
patient_features = {
    'ascites': [0, 1, 0, 1],
    'edema': [0.5, 0, 1, 0.5],
    'stage': [3, 4, 3, 4],
    'cholesterol': [200.5, 180.2, 190.5, 210.3],
}
df = pd.DataFrame(patient_features)
df

Unnamed: 0,ascites,edema,stage,cholesterol
0,0,0.5,3,200.5
1,1,0.0,4,180.2
2,0,1.0,3,190.5
3,1,0.5,4,210.3


### Multi-Collinearity of one-hot encoded features

In [3]:
# One-hot encode `stage`; the remaining columns are carried over unchanged.
df_stage = pd.get_dummies(df, columns=['stage'])
# Show only the indicator columns that were generated from `stage`.
df_stage.loc[:, ['stage_3', 'stage_4']]

Unnamed: 0,stage_3,stage_4
0,1,0
1,0,1
2,1,0
3,0,1


In [5]:
# Full encoded frame: `stage` has been replaced by the `stage_3` / `stage_4` indicator columns.
df_stage

Unnamed: 0,ascites,edema,cholesterol,stage_3,stage_4
0,0,0.5,200.5,1,0
1,1,0.0,180.2,0,1
2,0,1.0,190.5,1,0
3,1,0.5,210.3,0,1


In [6]:
# Drop one indicator level at encoding time. With every level kept, the
# indicator columns of a feature always sum to 1, so one of them is redundant.
# `drop_first=True` removes the first level (here `stage_3`) without
# hard-coding its column name, so it keeps working if more stages appear.
df_stage_drop_first = pd.get_dummies(df, columns=['stage'], drop_first=True)
df_stage_drop_first

Unnamed: 0,ascites,edema,cholesterol,stage_4
0,0,0.5,200.5,0
1,1,0.0,180.2,1
2,0,1.0,190.5,0
3,1,0.5,210.3,1


In [7]:
import numpy as np

In [11]:
# Re-encode with the default indicator dtype, for comparison with the
# explicit float64 encoding in the following cell.
df_stage = pd.get_dummies(df, columns=['stage'])
df_stage.loc[:, ['stage_4']]

Unnamed: 0,stage_4
0,0
1,1
2,0
3,1


In [12]:
# Same encoding, but request float64 indicator columns instead of the default dtype.
df_stage_float64 = pd.get_dummies(df, columns=['stage'], dtype=np.float64)
df_stage_float64.loc[:, ['stage_4']]

Unnamed: 0,stage_4
0,0.0
1,1.0
2,0.0
3,1.0


## Hazard Function
Let's say we fit the hazard function
$$
\lambda(t, x) = \lambda_0(t)e^{\theta^T X_i}
$$

So that we have the coefficients $\theta$ for the features in $X_i$

If you have a new patient, let's predict their hazard $\lambda(t,x)$

In [13]:
# Baseline hazard (a constant in this example) and example coefficients theta,
# one entry per feature column.
lambda_0 = 1
coef = np.asarray([0.5, 2.0])
coef

array([0.5, 2. ])

In [14]:
# Three example patients, one row each, with one column per coefficient.
ages = [20, 30, 40]
cholesterol_levels = [180, 220, 170]
X = pd.DataFrame({'age': ages, 'cholesterol': cholesterol_levels})
X

Unnamed: 0,age,cholesterol
0,20,180
1,30,220
2,40,170


In [15]:
# The coefficient vector is one-dimensional: a single axis of length 2.
coef.shape

(2,)

In [16]:
# The feature frame is two-dimensional: 3 rows (patients) by 2 columns (features).
X.shape

(3, 2)

It looks like the coefficient is a 1D array, so transposing it won't do anything.  
- We can transpose the X so that we're multiplying a (2,) array by a (2,3) dataframe.

So the formula looks more like this (transposing $X_i$ instead of $\theta$):
$$
\lambda(t, x) = \lambda_0(t)e^{\theta X_i^T}
$$

- Let's multiply $\theta X_i^T$

In [17]:
# Linear predictor for every patient: a (2,) vector times the (2, 3)
# transposed feature matrix gives one score per patient.
coef @ X.to_numpy().T

array([370., 455., 360.])

In [18]:
# Hazard per patient: the baseline hazard scaled by exp of the linear predictor.
risk_exponent = coef @ X.to_numpy().T
lambdas = lambda_0 * np.exp(risk_exponent)
# assign() builds a new frame, so X itself is left unmodified.
patients_df = X.assign(hazards=lambdas)
patients_df

Unnamed: 0,age,cholesterol,hazards
0,20,180,4.886054e+160
1,30,220,4.0178090000000006e+197
2,40,170,2.218265e+156


### Permissible pairs with censoring and time

In [19]:
# Eight example patients for the permissible-pair checks. Note that this
# rebinds `df`, which previously held the feature frame.
# `event` is 1 when the event was observed and 0 when the patient was censored.
follow_up_time = [2, 4, 2, 4, 2, 4, 2, 4]
event_observed = [1, 1, 1, 1, 0, 1, 1, 0]
risk = [20, 40, 40, 20, 20, 40, 40, 20]
df = pd.DataFrame({
    'time': follow_up_time,
    'event': event_observed,
    'risk_score': risk,
})
df

Unnamed: 0,time,event,risk_score
0,2,1,20
1,4,1,40
2,2,1,40
3,4,1,20
4,2,0,20
5,4,1,40
6,2,1,40
7,4,0,20


In [20]:
# Show patients 0 and 1 side by side.
df.iloc[[0, 1]]


Unnamed: 0,time,event,risk_score
0,2,1,20
1,4,1,40


In [22]:
# A pair can only be permissible if at least one of the two patients had an observed event.
either_had_event = (df.loc[0, 'event'] == 1) or (df.loc[1, 'event'] == 1)
if either_had_event:
    print("May be permissible pair: 0 and 1")
else:
    print("Definitely not permissible pair: 0 and 1")

May be permissible pair: 0 and 1


In [24]:
# Show patients 4 and 7 side by side.
df.iloc[[4, 7]]

Unnamed: 0,time,event,risk_score
4,2,0,20
7,4,0,20


In [25]:
# If neither patient had an observed event, the pair cannot be permissible.
events = df['event']
if events.loc[4] != 1 and events.loc[7] != 1:
    print("Definitely not permissible pair: 4 and 7")
else:
    print("May be a permissible pair: 4 and 7")

Definitely not permissible pair: 4 and 7


In [26]:
# Show patients 0 and 1 again, this time for the both-events check.
df.iloc[[0, 1]]

Unnamed: 0,time,event,risk_score
0,2,1,20
1,4,1,40


In [27]:
# When both patients had an observed event, the pair is permissible.
both_had_event = all(df.loc[patient, 'event'] == 1 for patient in (0, 1))
message = (
    "Definitely a permissible pair: 0 and 1"
    if both_had_event
    else "May be a permissible pair: 0 and 1"
)
print(message)

Definitely a permissible pair: 0 and 1


In [28]:
# Show patients 6 (event observed) and 7 (censored) side by side.
df.iloc[[6, 7]]

Unnamed: 0,time,event,risk_score
6,2,1,40
7,4,0,20


In [29]:
# With one censored patient, the pair is permissible only when the censoring
# time is at least the other patient's event time.
censored_time = df.at[7, 'time']
event_time = df.at[6, 'time']
if censored_time < event_time:
    print("Not a permisible pair")
else:
    print("Permissible pair: Censored patient 7 lasted at least as long as uncensored patient 6")

Permissible pair: Censored patient 7 lasted at least as long as uncensored patient 6


In [30]:
# Show patients 4 (censored) and 5 (event observed) side by side.
df.iloc[[4, 5]]

Unnamed: 0,time,event,risk_score
4,2,0,20
5,4,1,40


In [31]:
# Patient 4 is the censored one here; compare the censoring time against
# patient 5's event time.
censored_time, event_time = df.loc[[4, 5], 'time']
outcome = (
    "Permissible pair"
    if censored_time >= event_time
    else "Not a permisible pair: censored patient 4 was censored before patient 5 had their event"
)
print(outcome)

Not a permisible pair: censored patient 4 was censored before patient 5 had their event
