In [59]:
# ----------------------------------------------------------------
# LANGUAGE SELECTION
# Two-letter code of the language whose product1 XML file is loaded:
#   en = English    cs = Czech     de = German
#   es = Spanish    fr = French    it = Italian
#   nl = Dutch      pl = Polish    pt = Portuguese
# ----------------------------------------------------------------

language <- "fr"

In [60]:
# We launch the libraries
library(jsonlite)
library(rlist)
library(data.table)
library(dplyr)
library(XML)
library(methods)

In [61]:
# We open the "product1" XML file of the selected language (xml/<language>_product1.xml)
product1_path=paste0("xml/",language,"_product1.xml")
# Fail early with an explicit message when the file is missing, instead of
# relying on whatever error the XML parser raises for an unreadable path
if (!file.exists(product1_path))
  stop("XML file not found: ", product1_path)
product1=xmlParse(product1_path)
product1=xmlToList(product1)
# We keep the content of DisorderList and remove its last element,
# which is the numeric count of all elements of the list
product1=product1$DisorderList
product1=product1[-length(product1)]

### Our list for working
# The elements are copied one by one into an unnamed list.
# seq_along() iterates zero times on an empty list, whereas 1:length()
# would visit the invalid indices 1 and 0.
indice_list=1
product=list()
for (i in seq_along(product1))
{
  product[[indice_list]]=product1[[i]]
  indice_list=indice_list+1
}
# Each element in the list is a disorder

In [62]:
# We previously need to select the active clinical entities:
# Head of classification (flag=128), Historical entities (flag=512) & On-line (flag=1)
active_flag_values=c('1','128','512')
indice_list=1
all_active_clinical_entities=list()
# seq_along() gives an empty loop when `product` is empty
# (1:length() would try the invalid indices 1 and 0)
for (i in seq_along(product))
{
  # NOTE(review): `$DisorderFlag` selects only the first DisorderFlag of the
  # list -- confirm that the relevant flag can never sit in a later position.
  flag_value=product[[i]]$DisorderFlagList$DisorderFlag$Value
  # isTRUE() turns a missing flag value (zero-length result) into FALSE
  # instead of making `if` fail with "argument is of length zero"
  if (isTRUE(flag_value %in% active_flag_values))
  {
    all_active_clinical_entities[[indice_list]]=product[[i]]
    indice_list=indice_list+1
  }
}

In [63]:
### We collect the number of synonyms of every active clinical entity
# One numeric value per entity: the `.attrs` of its SynonymList.
# An entity without that attribute contributes no row (as.numeric(NULL) is empty).
synonym_counts=lapply(all_active_clinical_entities,
                      function(entity) as.numeric(entity$SynonymList$.attrs))
# The table is built in one step with its final column name.
# as.numeric(unlist()) keeps a zero-row numeric column when there is no entity,
# where rbindlist(list()) followed by colnames<- would fail.
synonyms_total=data.table('Number of synonyms'=as.numeric(unlist(synonym_counts)))

In [64]:
# Total number of synonyms over all active clinical entities
total_number_synonyms <- sum(synonyms_total[['Number of synonyms']])
total_number_synonyms

In [65]:
# Summary statistics of the number of synonyms per active clinical entity
# (for the language selected at the top of the notebook)
stats_total_synonyms <- summary(synonyms_total)
stats_total_synonyms

 Number of synonyms
 Min.   : 0.000    
 1st Qu.: 0.000    
 Median : 1.000    
 Mean   : 1.294    
 3rd Qu.: 2.000    
 Max.   :18.000    

In [66]:
### We collect the number of synonyms of the groups of disorders
# Groups of disorders are the active clinical entities whose DisorderGroup
# attribute is '36540'. `%in%` always returns a single TRUE/FALSE, so an entity
# without DisorderGroup is skipped instead of breaking an `if` test.
is_group=vapply(all_active_clinical_entities,
                function(entity) '36540' %in% entity$DisorderGroup$.attrs,
                logical(1))
# One numeric value per selected entity: the `.attrs` of its SynonymList
synonym_counts_group=lapply(all_active_clinical_entities[is_group],
                            function(entity) as.numeric(entity$SynonymList$.attrs))
# as.numeric(unlist()) keeps a zero-row numeric column when nothing is selected,
# where rbindlist(list()) followed by colnames<- would fail
synonyms_group=data.table('Number of synonyms'=as.numeric(unlist(synonym_counts_group)))

In [67]:
# Number of synonyms for groups of disorders (selected language)
number_synonyms_group <- sum(synonyms_group[['Number of synonyms']])
number_synonyms_group

In [68]:
# Share (in %) of all synonyms that belong to groups of disorders (selected language)
percentage_synonyms_group <- (number_synonyms_group / total_number_synonyms) * 100
percentage_synonyms_group

In [69]:
# Summary statistics of the number of synonyms per group of disorders (selected language)
stats_synonyms_group <- summary(synonyms_group)
stats_synonyms_group

 Number of synonyms
 Min.   :0.0000    
 1st Qu.:0.0000    
 Median :0.0000    
 Mean   :0.5757    
 3rd Qu.:1.0000    
 Max.   :7.0000    

In [70]:
### We collect the number of synonyms of the disorders
# Disorders are the active clinical entities whose DisorderGroup attribute is
# '36547'. `%in%` always returns a single TRUE/FALSE, so an entity without
# DisorderGroup is skipped instead of breaking an `if` test.
is_disorder=vapply(all_active_clinical_entities,
                   function(entity) '36547' %in% entity$DisorderGroup$.attrs,
                   logical(1))
# One numeric value per selected entity: the `.attrs` of its SynonymList
synonym_counts_disorder=lapply(all_active_clinical_entities[is_disorder],
                               function(entity) as.numeric(entity$SynonymList$.attrs))
# as.numeric(unlist()) keeps a zero-row numeric column when nothing is selected,
# where rbindlist(list()) followed by colnames<- would fail
synonyms_disorder=data.table('Number of synonyms'=as.numeric(unlist(synonym_counts_disorder)))

In [71]:
# Number of synonyms for disorders (selected language)
number_synonyms_disorder <- sum(synonyms_disorder[['Number of synonyms']])
number_synonyms_disorder

In [72]:
# Share (in %) of all synonyms that belong to disorders (selected language)
percentage_synonyms_disorder <- (number_synonyms_disorder / total_number_synonyms) * 100
percentage_synonyms_disorder

In [73]:
# Summary statistics of the number of synonyms per disorder (selected language)
stats_synonyms_disorder <- summary(synonyms_disorder)
stats_synonyms_disorder

 Number of synonyms
 Min.   : 0.000    
 1st Qu.: 0.000    
 Median : 1.000    
 Mean   : 1.476    
 3rd Qu.: 2.000    
 Max.   :18.000    

In [74]:
### We collect the number of synonyms of the subtypes of disorders
# Subtypes of disorders are the active clinical entities whose DisorderGroup
# attribute is '36554'. `%in%` always returns a single TRUE/FALSE, so an entity
# without DisorderGroup is skipped instead of breaking an `if` test.
is_subtype=vapply(all_active_clinical_entities,
                  function(entity) '36554' %in% entity$DisorderGroup$.attrs,
                  logical(1))
# One numeric value per selected entity: the `.attrs` of its SynonymList
synonym_counts_subtype=lapply(all_active_clinical_entities[is_subtype],
                              function(entity) as.numeric(entity$SynonymList$.attrs))
# as.numeric(unlist()) keeps a zero-row numeric column when nothing is selected,
# where rbindlist(list()) followed by colnames<- would fail
synonyms_subtype=data.table('Number of synonyms'=as.numeric(unlist(synonym_counts_subtype)))

In [75]:
# Number of synonyms for subtypes of disorders (selected language)
number_synonyms_subtype <- sum(synonyms_subtype[['Number of synonyms']])
number_synonyms_subtype

In [76]:
# Share (in %) of all synonyms that belong to subtypes of disorders (selected language)
percentage_synonyms_subtype <- (number_synonyms_subtype / total_number_synonyms) * 100
percentage_synonyms_subtype

In [77]:
# Summary statistics of the number of synonyms per subtype of disorder (selected language)
stats_synonyms_subtype <- summary(synonyms_subtype)
stats_synonyms_subtype

 Number of synonyms
 Min.   : 0.000    
 1st Qu.: 0.000    
 Median : 1.000    
 Mean   : 1.682    
 3rd Qu.: 2.000    
 Max.   :15.000    

In [78]:
###############################################################################################


##### Number of nomenclature terms (preferred terms + synonyms)


### We collect the preferred terms of the groups of disorders
# Groups of disorders are the active clinical entities whose DisorderGroup
# attribute is '36540'. `%in%` always returns a single TRUE/FALSE, so an entity
# without DisorderGroup is skipped instead of breaking an `if` test.
is_group=vapply(all_active_clinical_entities,
                function(entity) '36540' %in% entity$DisorderGroup$.attrs,
                logical(1))
# The preferred term of an entity is read from its Name$text
names_group=lapply(all_active_clinical_entities[is_group],
                   function(entity) entity$Name$text)
# as.character(unlist()) keeps a zero-row column when nothing is selected,
# where rbindlist(list()) followed by names<- would fail.
# unique() removes the duplicated terms.
preferred_terms_group=unique(data.table(Preferred_terms=as.character(unlist(names_group))))

In [79]:
# Number of distinct preferred terms for groups of disorders (selected language)
number_preferred_terms_group <- nrow(preferred_terms_group)
number_preferred_terms_group

In [80]:
# Nomenclature terms of the groups of disorders (selected language):
# their synonyms plus their preferred terms
number_nomenclature_terms_group <- number_synonyms_group + number_preferred_terms_group
number_nomenclature_terms_group

In [81]:
### We collect the preferred terms of the disorders
# Disorders are the active clinical entities whose DisorderGroup attribute is
# '36547'. `%in%` always returns a single TRUE/FALSE, so an entity without
# DisorderGroup is skipped instead of breaking an `if` test.
is_disorder=vapply(all_active_clinical_entities,
                   function(entity) '36547' %in% entity$DisorderGroup$.attrs,
                   logical(1))
# The preferred term of an entity is read from its Name$text
names_disorder=lapply(all_active_clinical_entities[is_disorder],
                      function(entity) entity$Name$text)
# as.character(unlist()) keeps a zero-row column when nothing is selected,
# where rbindlist(list()) followed by names<- would fail.
# unique() removes the duplicated terms.
preferred_terms_disorder=unique(data.table(Preferred_terms=as.character(unlist(names_disorder))))

In [82]:
# Number of distinct preferred terms for disorders (selected language)
number_preferred_terms_disorder <- nrow(preferred_terms_disorder)
number_preferred_terms_disorder

In [83]:
# Nomenclature terms of the disorders (selected language):
# their synonyms plus their preferred terms
number_nomenclature_terms_disorder <- number_synonyms_disorder + number_preferred_terms_disorder
number_nomenclature_terms_disorder

In [84]:
### We collect the preferred terms of the subtypes of disorders
# Subtypes of disorders are the active clinical entities whose DisorderGroup
# attribute is '36554'. `%in%` always returns a single TRUE/FALSE, so an entity
# without DisorderGroup is skipped instead of breaking an `if` test.
is_subtype=vapply(all_active_clinical_entities,
                  function(entity) '36554' %in% entity$DisorderGroup$.attrs,
                  logical(1))
# The preferred term of an entity is read from its Name$text
names_subtype=lapply(all_active_clinical_entities[is_subtype],
                     function(entity) entity$Name$text)
# as.character(unlist()) keeps a zero-row column when nothing is selected,
# where rbindlist(list()) followed by names<- would fail.
# unique() removes the duplicated terms.
preferred_terms_subtype=unique(data.table(Preferred_terms=as.character(unlist(names_subtype))))

In [85]:
# Number of distinct preferred terms for subtypes of disorders (selected language)
number_preferred_terms_subtype <- nrow(preferred_terms_subtype)
number_preferred_terms_subtype

In [86]:
# Nomenclature terms of the subtypes of disorders (selected language):
# their synonyms plus their preferred terms
number_nomenclature_terms_subtype <- number_synonyms_subtype + number_preferred_terms_subtype
number_nomenclature_terms_subtype

In [87]:
### Total number of nomenclature terms for active clinical entities (selected language):
### subtypes of disorders, then disorders, then groups of disorders
total_number_nomenclature_terms <- (number_nomenclature_terms_subtype + number_nomenclature_terms_disorder) + number_nomenclature_terms_group
total_number_nomenclature_terms