In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import AFTSurvivalRegression

In [3]:
# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Survival Regression")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/11 00:11:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Logged Time To Failure Data

label = observed time, in some unit (say months), until the equipment failed or was censored
censor = 1 means the failure occurred (uncensored): label is the time to failure
censor = 0 means censored: the failure had not occurred by the time given in label, e.g. the equipment was taken out for maintenance first
features contains the feature columns, such as machine age and temperature. A further example dataset:

Heart Attack Study:
https://web.archive.org/web/20170517071528/http://www.umass.edu/statdata/statdata/data/whas500.txt


In [4]:
# Each row is (label, censor, features): the observed time, the event
# indicator (1.0 = failure observed, 0.0 = censored) and the feature vector.
training_rows = [
    (1.218, 1.0, Vectors.dense(1.560, -0.605)),
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)),
    (0.273, 1.0, Vectors.dense(0.520, 1.151)),
    (4.199, 0.0, Vectors.dense(0.795, -0.226)),
]
training = spark.createDataFrame(
    training_rows,
    schema=["label", "censor", "features"],
)

In [5]:
# Print the training rows as a text table to check the values and column names.
training.show()

                                                                                

+-----+------+--------------+
|label|censor|      features|
+-----+------+--------------+
|1.218|   1.0| [1.56,-0.605]|
|2.949|   0.0| [0.346,2.158]|
|3.627|   0.0|  [1.38,0.231]|
|0.273|   1.0|  [0.52,1.151]|
|4.199|   0.0|[0.795,-0.226]|
+-----+------+--------------+



### Predict two quantiles of the time to failure: at 30% and at 60% probability

In [6]:
# Probabilities at which the model reports time-to-failure quantiles;
# passed to AFTSurvivalRegression(quantileProbabilities=...) in the next cell.
quantileProbabilities = (0.3, 0.6)

### Train the model on the training data above with AFTSurvivalRegression

In [7]:
# Configure the AFT estimator: which columns hold the features, the observed
# time (label) and the censoring indicator, and which quantiles to emit into
# the "quantiles" output column. Then fit it on the training DataFrame.
aft = AFTSurvivalRegression(
    featuresCol="features",
    labelCol="label",
    censorCol="censor",
    quantileProbabilities=quantileProbabilities,
    quantilesCol="quantiles",
)
model = aft.fit(training)

23/05/11 00:11:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/11 00:11:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


### Print the coefficients, intercept and scale parameter for AFT survival regression

In [8]:
# Report the fitted parameters of the AFT model.
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")
print(f"Scale: {model.scale}")


Coefficients: [-0.4963068060199854,0.19844393975928598]
Intercept: 2.638090563156019
Scale: 1.5472326865488455


### Transform the data with the fitted model

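prediction = predicted time to failure, in the same time unit as label, computed as exp(coefficients · features + intercept)
the censor column is not used when predicting: rows with censor = 1 (uncensored) and censor = 0 (censored) both receive a predicted time to failure
1st element of quantiles = time by which failure is predicted to have occurred with 30% probability
2nd element of quantiles = time by which failure is predicted to have occurred with 60% probability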

In [9]:
# Add the prediction and quantiles columns to the training rows, then print
# them without truncating the long floating-point values.
predictions = model.transform(training)
predictions.show(truncate=False)

+-----+------+--------------+------------------+---------------------------------------+
|label|censor|features      |prediction        |quantiles                              |
+-----+------+--------------+------------------+---------------------------------------+
|1.218|1.0   |[1.56,-0.605] |5.7189965530298865|[1.1603295951029065,4.995471733719635] |
|2.949|0.0   |[0.346,2.158] |18.07645802858896 |[3.667540106156399,15.789559285491249] |
|3.627|0.0   |[1.38,0.231]  |7.381875365763499 |[1.4977117707333785,6.447975512763023] |
|0.273|1.0   |[0.52,1.151]  |13.577581299077902|[2.7547611307597735,11.859846908963423]|
|4.199|0.0   |[0.795,-0.226]|9.013093216625709 |[1.8286702406091497,7.872823838856861] |
+-----+------+--------------+------------------+---------------------------------------+

