In [1]:
require(data.table)
require(ggplot2)
require(survival)

# Default size (in inches) of plots rendered inline by the notebook kernel.
options(
    repr.plot.height = 4,
    repr.plot.width = 6
)

Loading required package: data.table
Loading required package: ggplot2
Loading required package: survival


In [2]:
# Work from the survival_process source directory so that the relative
# data path below resolves.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
path <- '/Users/davide/Documents/universita/tesi/src/survival_process/'
setwd(path)

# Restore the objects saved in the preprocessed .RData file into the workspace.
# The following cells use `new_df`, which is presumably one of them.
preprocessed_file <- '../../data/main_process_preprocessed_data_2.RData'
load(preprocessed_file)

In [3]:
# Preview the first six rows of the loaded data.
head(new_df, n = 6L)

id,sex,age_in,PC1,PC2,time_event,status
10000243,M,70,-40.851181,1.0158516,51,1
10000471,F,80,-41.405426,0.8888934,1058,1
10000717,M,73,-10.854551,-9.4426737,705,0
10003004,F,75,4.612457,-2.4369352,1658,1
10006065,M,69,61.216558,6.4277908,1950,0
10007000,F,84,-2.63421,3.4520296,238,1


In [7]:
# Number of rows and columns in the data.
c(nrow(new_df), ncol(new_df))

# Cross Validate

In [23]:
# K-fold cross-validation of several candidate Cox proportional-hazards
# models. Each model is fitted on the training folds and scored on the
# held-out fold with the concordance returned by survConcordance().
#
# Produces:
#   scores - matrix with one row per fold and one column per model,
#            columns labelled with the right-hand side of the model formula
#   models - the models fitted on the last fold (reused later for their formulas)
set.seed(143)  # fixed seed so the shuffle, and therefore the folds, is reproducible
n_folds <- 10

# Randomly shuffle the observations
new_df <- new_df[sample(nrow(new_df)), ]
# Assign every (shuffled) row to one of n_folds roughly equal-sized folds
folds <- cut(seq_len(nrow(new_df)), breaks = n_folds, labels = FALSE)

# One score vector per fold; bound into a matrix after the loop instead of
# growing a matrix with rbind() on every iteration.
fold_results <- vector("list", n_folds)
pb <- txtProgressBar(min = 0, max = n_folds, style = 3)
for (i in seq_len(n_folds)) {
    # Segment data by fold
    validIndexes <- which(folds == i)
    valid <- new_df[validIndexes, ]
    train <- new_df[-validIndexes, ]

    # fit
    # The list is named so that the columns of `scores` can be labelled;
    # with an unnamed list names(models) is NULL and the labels are lost.
    # NOTE(review): "age_in + sex" appears twice; the second entry was
    # possibly meant to be a different formula. Kept as-is so the number
    # and order of models is unchanged.
    models <- list(
        "age_in" = coxph(Surv(time_event, status) ~ age_in, data = train),
        "age_in + sex" = coxph(Surv(time_event, status) ~ age_in + sex, data = train),
        "age_in + sex" = coxph(Surv(time_event, status) ~ age_in + sex, data = train),
        "age_in + PC1" = coxph(Surv(time_event, status) ~ age_in + PC1, data = train),
        "age_in + sex + PC1" = coxph(Surv(time_event, status) ~ age_in + sex + PC1, data = train),
        "age_in + sex + PC2" = coxph(Surv(time_event, status) ~ age_in + sex + PC2, data = train),
        "age_in + sex + PC1 + PC2" = coxph(Surv(time_event, status) ~ age_in + sex + PC1 + PC2, data = train)
    )

    # predict, evaluate
    fold_scores <- numeric(length(models))
    names(fold_scores) <- names(models)
    for (j in seq_along(models)) {
        model <- models[[j]]
        prediction <- predict(model, newdata = valid)
        # evaluate concordance probability on the held-out fold
        # NOTE(review): survConcordance() is reported as deprecated in recent
        # survival releases in favour of concordance(); confirm before upgrading.
        score <- survConcordance(Surv(time_event, status) ~ prediction, data = valid)$concordance[[1]]
        fold_scores[j] <- score
    }
    fold_results[[i]] <- fold_scores

    # Advance the bar once the fold has actually been processed
    setTxtProgressBar(pb, i)
}
close(pb)

scores <- do.call(rbind, fold_results)
rownames(scores) <- seq_len(n_folds)
colnames(scores) <- names(models)



In [25]:
# Average each model's concordance over the folds, then record the
# highest and the lowest of those averages.
mean_scores <- colMeans(scores)
score_range <- range(mean_scores)  # c(minimum, maximum)
worst_score <- score_range[1]
best_score <- score_range[2]

In [32]:
# Print the cross-validated concordance of every model as a text table,
# flagging the model(s) whose mean score equals the best / worst mean.
print('****************** Cross validation Concordance Probability index ******************')
cat('\n')
print('|------------------------|--------------------------------------|')
print('|        C-INDEX         |               FORMULA                |')
print('|------------------------|--------------------------------------|')
# seq_along() yields an empty sequence when there are no models, whereas
# 1:length(models) would iterate over c(1, 0).
for (i in seq_along(models)) {
    score <- mean_scores[[i]]
    # Score as a percentage, followed by the right-hand side of the formula
    table_row <- paste('|       ', round(score * 100, 1), '%          | ', models[[i]]$formula[3])
    # Exact equality is intended here: best_score and worst_score are taken
    # from mean_scores itself, so the matching entries compare equal.
    if (score == best_score) {
        table_row <- paste(table_row, '                 <------ BEST MODEL')
    } else if (score == worst_score) {
        table_row <- paste(table_row, '                 <------ WORST MODEL')
    }
    print(table_row)
}

[1] "****************** Cross validation Concordance Probability index ******************"

[1] "|------------------------|--------------------------------------|"
[1] "|        C-INDEX         |               FORMULA                |"
[1] "|------------------------|--------------------------------------|"
[1] "|        66.9 %          |  age_in                  <------ WORST MODEL"
[1] "|        67.1 %          |  age_in + sex"
[1] "|        67.1 %          |  age_in + sex"
[1] "|        67.2 %          |  age_in + PC1"
[1] "|        67.5 %          |  age_in + sex + PC1                  <------ BEST MODEL"
[1] "|        67.1 %          |  age_in + sex + PC2"
[1] "|        67.5 %          |  age_in + sex + PC1 + PC2"


# Fit

In [33]:
# Refit the Cox model with age, sex and PC1 on the full data set.
# The formula is written inline so that the printed Call shows it in full.
fit <- coxph(
    formula = Surv(time_event, status) ~ age_in + sex + PC1,
    data = new_df
)
# Display coefficients, hazard ratios and the likelihood ratio test
fit

Call:
coxph(formula = Surv(time_event, status) ~ age_in + sex + PC1, 
    data = new_df)

             coef  exp(coef)   se(coef)      z       p
age_in  0.0657175  1.0679249  0.0035578 18.471 < 2e-16
sexM    0.1886813  1.2076560  0.0596932  3.161 0.00157
PC1    -0.0029608  0.9970436  0.0007483 -3.957 7.6e-05

Likelihood ratio test=455  on 3 df, p=< 2.2e-16
n= 4541, number of events= 1200 