In [1]:
# install.packages(c("readr", "devtools"))
# devtools::install_github('PheWAS/PheWAS')


In [2]:
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(PheWAS)



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggplot2

Loading required package: parallel

Welcome to the new version of PheWAS. This version has many updates; please see https://github.com/PheWAS/PheWAS/tree/legacy for the legacy release if needed. Check ?PheWAS for more documentation



In [3]:
# ---- Input locations ----
icd_file <- "/mnt/project/notebooks/bmi/data/icd_raw.csv.gz"
icd_codes_file <- "/mnt/project/fields/data/phenotype_processing/coding19.tsv"
pheno_file <- "/mnt/project/notebooks/bmi/data/pheno.csv.gz"

# ---- ICD records in long format ----
# The `icd` column packs several codes into one "|"-separated string; split it
# and expand so that every code sits on its own row.
icd_df <- read_csv(
  icd_file,
  col_types = cols(sample_names = col_character())
) %>%
  mutate(icd = strsplit(icd, "|", fixed = TRUE)) %>%
  unnest(icd)

# ---- Coding dictionary ----
# Strip spaces from `coding` so it can be used as the join key, and take the
# first word of `meaning` as the ICD-10 code handed to the phecode mapping.
icd_codes_df <- read_tsv(
  icd_codes_file,
  col_types = cols(.default = col_character())
) %>%
  mutate(
    coding = str_remove_all(coding, " "),
    code = word(meaning, 1)
  )

# ---- Attach ICD-10 codes and shape the createPhenotypes() input ----
# Output columns, in order: id, vocabulary_id, code, index (constant 1 per row).
# Codes with no match in the dictionary keep their row with `code` set to NA.
icd_df <- icd_df %>%
  left_join(icd_codes_df, by = c("icd" = "coding")) %>%
  transmute(
    id = sample_names,
    vocabulary_id = "ICD10",
    code = as.character(code),
    index = 1
  )

# ---- Phenotype table and id/sex lookup ----
pheno_df <- read_csv(
  pheno_file,
  col_types = cols(sample_names = col_character())
)

# NOTE(review): every non-missing value other than "Male" is recoded to "F";
# confirm that `sex` only ever holds two labels.
id_sex <- pheno_df %>%
  transmute(id = sample_names, sex = ifelse(sex == "Male", "M", "F"))

# ---- Restrict ICD rows to samples present in the phenotype table ----
icd_df <- semi_join(icd_df, id_sex, by = "id")


In [4]:
# Map ICD10 codes to phecodes.
# `icd_df` supplies the (id, vocabulary_id, code, index) rows and `id_sex` the
# per-sample sex. Flags are spelled TRUE rather than T, since T is an ordinary
# variable that can be reassigned.
phe_df <- createPhenotypes(
  icd_df,
  min.code.count = 1,
  id.sex = id_sex,
  add.phecode.exclusions = TRUE,
  translate = TRUE,
  vocabulary.map = PheWAS::phecode_map_icd10
)


Mapping codes to phecodes...

Aggregating codes...

Mapping exclusions...

Coalescing exclusions and min.code.count as applicable...

Reshaping data...



In [5]:
# Report rows and columns of the phecode table. length() on a data frame only
# counts columns, so it cannot reveal row-level changes such as repeated ids.
dim(phe_df)

In [6]:
# Preview the first rows of the phecode table (an id column plus one logical
# column per phecode).
head(phe_df)

Unnamed: 0_level_0,id,008,008.5,008.51,008.52,008.6,008.7,010,031,031.1,⋯,985,986,987,988,989,990,994,994.1,994.2,994.21
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,1000010,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
2,1000028,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
3,1000034,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
4,1000052,False,False,False,False,False,False,,,,⋯,False,False,False,False,False,,True,,True,
5,1000069,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
6,1000076,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False


In [6]:
# Preview the first rows of the phecode table (an id column plus one logical
# column per phecode).
head(phe_df)

Unnamed: 0_level_0,id,008,008.5,008.51,008.52,008.6,008.7,010,031,031.1,⋯,985,986,987,988,989,990,994,994.1,994.2,994.21
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,1000010,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
2,1000028,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
3,1000034,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
4,1000052,False,False,False,False,False,False,,,,⋯,False,False,False,False,False,,True,,True,
5,1000069,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False
6,1000076,False,False,False,False,False,False,False,False,False,⋯,False,False,False,False,False,False,False,False,False,False


In [7]:
# Keep only the first row seen for each id; later rows with the same id are
# dropped.
phe_df <- phe_df[!duplicated(phe_df[["id"]]), , drop = FALSE]


In [8]:
# Row count after de-duplication. length() would return the column count,
# which dropping duplicate rows never changes.
nrow(phe_df)

In [9]:
# Write the phecode table (phe_df) to a gzip-compressed CSV in the working
# directory, without row names.
write.csv(x = phe_df, file = gzfile("phecodes.csv.gz"), row.names = FALSE)


In [10]:
# Upload the compressed phecode table with the `dx` command-line tool.
# system() returns the command's exit status; check it so that a failed upload
# stops the notebook instead of passing silently.
upload_cmd <- "dx upload phecodes.csv.gz --path /notebooks/bmi/data/downstream/phecodes/"
upload_status <- system(upload_cmd)
if (upload_status != 0L) {
  stop("dx upload failed with exit status ", upload_status, call. = FALSE)
}


In [11]:
# Dimensions (rows, columns) of the final phecode table.
dim(phe_df)


In [12]:
# Dimensions (rows, columns) of the id/sex lookup built from the phenotype file.
dim(id_sex)
