In [5]:
# Attach the packages this notebook depends on. library() stops with an error
# when a package is not installed, whereas require() only returns FALSE with a
# warning and lets the notebook fail later with a less obvious message.
library(data.table)
library(ggplot2)
library(survival)

# Default width/height for plots rendered through the notebook's repr display
options(repr.plot.width = 6, repr.plot.height = 4)

In [10]:
# Directory of this analysis step; the relative data path below is resolved
# from here, so the working directory is switched before loading.
path <- '/Users/davide/Documents/universita/tesi/src/survival_process/'
setwd(dir = path)

# Restore the objects saved by the preprocessing step into the workspace
# (the following cells use `new_df`, presumably one of them).
load(file = '../../data/main_process_preprocessed_data.RData')

In [11]:
# Show the first six rows of the data as a quick sanity check
head(x = new_df, n = 6L)

id,sex,age_in,PC1,PC2,time_event,status
10000717,M,73,-10.299129,-3.34178201,705,0
10003004,F,75,-8.94404,-0.44160539,1658,1
10006065,M,69,7.011157,3.96387733,1950,0
10007000,F,84,-14.379666,0.06735727,238,1
10007024,M,71,-6.815942,0.15195553,1914,0
10007217,F,75,-13.031433,-0.58113572,1172,0


# Cross Validate

In [49]:
# k-fold cross validation of five nested Cox models.
# For every fold each model is fitted on the training rows and scored on the
# held-out rows with the concordance between observed survival and the
# model's prediction. Result: `scores`, a folds x models matrix.
set.seed(143)
n_folds <- 10
n_obs <- nrow(new_df)

# Randomly shuffle the observations so fold membership does not depend on
# the original row order
new_df <- new_df[sample.int(n_obs), ]
# Create n_folds (near) equally sized folds over the shuffled rows
folds <- cut(seq_len(n_obs), breaks = n_folds, labels = FALSE)

# Perform the cross validation; per-fold score vectors are collected in a
# preallocated list and bound once at the end instead of growing a matrix
fold_results <- vector("list", n_folds)
pb <- txtProgressBar(min = 0, max = n_folds, style = 3)
for (i in seq_len(n_folds)) {
    setTxtProgressBar(pb, i)
    # Segment data by fold
    validIndexes <- which(folds == i)
    valid <- new_df[validIndexes, ]
    train <- new_df[-validIndexes, ]

    # fit
    models <- list(
        model0 = coxph(Surv(time_event, status) ~ age_in, data = train),
        model1 = coxph(Surv(time_event, status) ~ age_in + sex, data = train),
        model2 = coxph(Surv(time_event, status) ~ age_in + sex + PC1, data = train),
        model3 = coxph(Surv(time_event, status) ~ age_in + sex + PC2, data = train),
        model4 = coxph(Surv(time_event, status) ~ age_in + sex + PC1 + PC2, data = train)
    )

    # predict, evaluate
    fold_scores <- numeric(length(models))
    for (j in seq_along(models)) {
        prediction <- predict(models[[j]], newdata = valid)
        # evaluate concordance probability
        # NOTE(review): survConcordance() is reported as deprecated in recent
        # survival releases in favour of concordance() -- confirm against the
        # installed version before upgrading the package.
        fold_scores[j] <- survConcordance(Surv(time_event, status) ~ prediction,
                                          data = valid)$concordance[[1]]
    }
    fold_results[[i]] <- fold_scores
}
# Release the progress bar (also terminates its output line)
close(pb)

scores <- do.call(rbind, fold_results)
rownames(scores) <- seq_len(n_folds)
colnames(scores) <- names(models)



In [50]:
# print cv scores
# Average each model's concordance over the folds (columns of `scores`) and
# report it as a percentage rounded to one decimal place.
mean_scores <- colMeans(scores)
model_names <- colnames(scores)
print('Cross validation concordance probability')
# Iterate over the columns actually present in `scores` rather than over the
# `models` list left behind by the last fold; seq_along() is also safe when
# there are no columns.
for (i in seq_along(model_names)) {
    print(paste(model_names[i], ':', round(mean_scores[i] * 100, 1), '%'))
}

[1] "Cross validation concordance probability"
[1] "model0 : 67 %"
[1] "model1 : 67.1 %"
[1] "model2 : 67.2 %"
[1] "model3 : 67 %"
[1] "model4 : 67.1 %"


# Fit

In [51]:
# Fit the Cox model with all four covariates on the complete data set and
# display the coefficient table.
fit <- coxph(
    formula = Surv(time_event, status) ~ age_in + sex + PC1 + PC2,
    data = new_df
)
fit

Call:
coxph(formula = Surv(time_event, status) ~ age_in + sex + PC1 + 
    PC2, data = new_df)

            coef exp(coef)  se(coef)      z       p
age_in  0.065955  1.068179  0.004663 14.145 < 2e-16
sexM    0.215429  1.240394  0.077702  2.773 0.00556
PC1    -0.003765  0.996242  0.002153 -1.749 0.08035
PC2    -0.027000  0.973361  0.016231 -1.663 0.09622

Likelihood ratio test=277  on 4 df, p=< 2.2e-16
n= 2916, number of events= 718 