#### 1. import data

In [1]:
import hail as hl
# Load the VEP-annotated Rett WGS VCF (bgzip-compressed) as a Hail MatrixTable on GRCh38.
# The trailing comment keeps the one-off call that would persist the result in Hail's native .mt format.
rett = hl.import_vcf('/home/titan/Hail/rett/191102-rett-wgs.vep_tx_191206.vcf.bgz', reference_genome = 'GRCh38')  # .write('/home/titan/Hail/rett/191102-rett-wgs.vep_tx_191206.mt', overwrite = True)

Initializing Spark and Hail with default parameters...
Running on Apache Spark version 2.4.1
SparkUI available at http://163.152.180.157:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /home/titan/Hail/rett/hail-20200129-1349-0.2.30-2ae07d872f43.log


In [3]:
# Sample IDs have the form '<family>_<role>' (e.g. 'TWGS1_1'); split once and reuse both parts.
# Hail interprets the delimiter as a regex, in which '_' needs no escaping, so the plain
# string is used instead of '\_' (an invalid Python escape sequence that triggers a warning).
sample_id_parts = rett.s.split('_')
rett = rett.annotate_cols(fam = sample_id_parts[0],   # family ID (text before '_')
                          role = sample_id_parts[1])  # '1' = proband, '2' = mother, '3' = father (text after '_')

#### 2. variant filtering
> - `filter == PASS`인 variant만 남기기
> - multi allelic 확인 및 제외
> - ~chrX, chrY, chrM 제외~
> - LCR(low complexity region) 제외
> - gnomad filtering

In [4]:
## Keep only variants whose FILTER is PASS.
## A non-empty 'filters' set means the variant did not meet the GATK standard.
n_failed_filters = hl.len(rett.filters)  # size of the row field 'filters'
rett = rett.filter_rows(n_failed_filters == 0)

In [5]:
## Exclude multi-allelic sites.
# 'split_multi' adds a boolean row field 'was_split' that is True for rows that came
# from a multi-allelic site; keep only the rows that were never split.
rett_split = hl.split_multi(rett)
rett = rett_split.filter_rows(~rett_split.was_split)

In [6]:
## Excluding chrX, chrY and chrM is currently disabled:
#rett = rett.filter_rows(rett.locus.in_autosome())

## Exclude variants that fall in a low-complexity region (LCR).
lcr_bed = hl.import_bed('/home/titan/resources/LCR-hs38.bed', reference_genome = 'GRCh38')
in_lcr = hl.is_defined(lcr_bed[rett.locus])  # defined when the locus lies inside some LCR interval
rett = rett.filter_rows(~in_lcr)             # '~' is logical NOT

2020-01-29 13:49:58 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)
  Loading column 'f2' as type 'int32' (user-specified)



In [7]:
## gnomAD filtering   # CSQ = consequence annotation produced by VEP ('|'-delimited string per entry)
# Pull sub-field 70 (labelled gnomADg_AF in the CSQ header parsed later in this notebook) out of every CSQ string.
# The delimiter is a regex, so the pipe must be escaped; a raw string passes the backslash
# through explicitly instead of relying on Python's deprecated handling of the unknown escape '\|'.
rett = rett.annotate_rows(csq_gnomADg_AF = rett.info.CSQ.map(lambda x: x.split(r'\|')[70]))
# Allele frequency '0' or '1' means the opposite allele never appeared in the gnomAD database;
# '' means gnomAD has no record of a variant like this.
l = hl.array(['0','1',''])
# Keep a variant only if every CSQ entry reports one of those values.
rett = rett.filter_rows(rett.csq_gnomADg_AF.all(lambda x: l.contains(x)))

#### 3. (rare) de novo variants(unique to one family) : 부모에겐 없고, 자녀에게만 있는 heterozygous

In [8]:
# 'variant_qc' adds the row struct 'variant_qc' (dp_stats, gq_stats, n_het, ...).
rett = hl.variant_qc(rett)

## Rare and heterozygous
is_singleton = rett.info.AC[0] == 1                  # alternate allele count == 1 -> unique to one family
rett_rare = rett.filter_rows(is_singleton)
has_single_het = rett_rare.variant_qc.n_het == 1     # exactly one heterozygous sample
rett_rare_het = rett_rare.filter_rows(has_single_het)

In [9]:
# Candidate de novo calls: heterozygous genotypes carried by a proband (role '1').
# The proband test uses the 'role' column field annotated from the sample ID, like the
# later cells do, instead of re-splitting the ID with the invalid escape sequence '\_'.
is_proband_het = rett_rare_het.GT.is_het() & (rett_rare_het.role == "1")
DNV = rett_rare_het.filter_entries(is_proband_het)
DNV = DNV.key_cols_by().entries()  # entries table of the surviving proband calls

# Keep all samples' data at the sites that have such a proband call.
rett_DNV = rett_rare_het.semi_join_rows(DNV)

In [9]:
rett_DNV.count()

2020-01-24 14:20:28 Hail: INFO: Coerced sorted dataset
2020-01-24 14:20:41 Hail: INFO: Coerced sorted dataset
2020-01-24 14:21:08 Hail: INFO: Coerced sorted dataset
2020-01-24 14:21:21 Hail: INFO: Coerced sorted dataset
2020-01-24 14:21:34 Hail: INFO: Coerced sorted dataset
2020-01-24 14:22:01 Hail: INFO: Coerced sorted dataset


(7525, 27)

In [10]:
# SNP vs. indel is decided from the reference and alternate allele strings.
ref_allele = rett_DNV.alleles[0]
alt_allele = rett_DNV.alleles[1]
rett_DNV_SNP = rett_DNV.filter_rows(hl.is_snp(ref_allele, alt_allele))
rett_DNV_Indel = rett_DNV.filter_rows(hl.is_indel(ref_allele, alt_allele))

In [11]:
# Row-level total depth: the per-sample DP summed over the samples of rett_DNV_Indel.
# It is carried into the entries tables and used for the Total_DP range filter on indels.
rett_DNV_Indel = rett_DNV_Indel.annotate_rows(Total_DP = hl.agg.sum(rett_DNV_Indel.DP))

In [9]:
#rett_DNV_SNP.count_rows() 

2019-12-29 11:54:32 Hail: INFO: Coerced sorted dataset
2019-12-29 11:54:49 Hail: INFO: Coerced sorted dataset
2019-12-29 11:55:23 Hail: INFO: Coerced sorted dataset
2019-12-29 11:55:40 Hail: INFO: Coerced sorted dataset
2019-12-29 11:55:56 Hail: INFO: Coerced sorted dataset
2019-12-29 11:56:30 Hail: INFO: Coerced sorted dataset


4584

In [10]:
#rett_DNV_Indel.count_rows()

2019-12-29 11:59:05 Hail: INFO: Coerced sorted dataset
2019-12-29 11:59:21 Hail: INFO: Coerced sorted dataset
2019-12-29 11:59:55 Hail: INFO: Coerced sorted dataset
2019-12-29 12:00:11 Hail: INFO: Coerced sorted dataset
2019-12-29 12:00:27 Hail: INFO: Coerced sorted dataset
2019-12-29 12:01:02 Hail: INFO: Coerced sorted dataset


2941

#### 4. HQ variant filtering

#### (방법 1 ROCube)

> - **SNP**
    - Step 1 va_QUAL  214.900  0.99004  0.84787
    - Step 2 g_AB    0.227  0.99022  0.50365
    - Step 3 va_MQ   57.350  0.95745  1.00000

In [12]:
## SNP, ROCube thresholds: QUAL >= 214.9, MQ >= 57.35, 0.227 <= AB <= 0.773
snp1_mt = rett_DNV_SNP.filter_cols(rett_DNV_SNP.role == '1')  # probands only

# Site-level cut-offs.
snp1_site_ok = (snp1_mt.qual >= 214.900) & (snp1_mt.info.MQ >= 57.350)
snp1_mt = snp1_mt.filter_rows(snp1_site_ok)

# Allele balance: the smallest per-allele depth fraction AD[i] / DP of the genotype.
snp1_mt = snp1_mt.annotate_entries(AB = hl.min(snp1_mt.AD.map(lambda depth: depth / snp1_mt.DP)))
snp1_ab_ok = (snp1_mt.AB >= 0.227) & (snp1_mt.AB <= 0.773)
snp1_mt = snp1_mt.filter_entries(snp1_ab_ok)

# Per-sample QC (n_het, ...) computed on the entries that survived.
rett_SNP_HQdnv1 = hl.sample_qc(snp1_mt)

In [13]:
rett_SNP_HQdnv1.sample_qc.n_het.show(9)

2020-01-24 14:24:16 Hail: INFO: Coerced sorted dataset
2020-01-24 14:24:29 Hail: INFO: Coerced sorted dataset
2020-01-24 14:25:01 Hail: INFO: Coerced sorted dataset
2020-01-24 14:25:14 Hail: INFO: Coerced sorted dataset
2020-01-24 14:25:27 Hail: INFO: Coerced sorted dataset
2020-01-24 14:25:54 Hail: INFO: Coerced sorted dataset
2020-01-24 14:28:00 Hail: INFO: Coerced sorted dataset


s,Unnamed: 1_level_0
str,int64
"""TWGS1_1""",59
"""TWGS2_1""",52
"""TWGS3_1""",57
"""TWGS4_1""",75
"""TWGS5_1""",63
"""TWGS6_1""",57
"""TWGS7_1""",67
"""TWGS8_1""",56
"""TWGS9_1""",50


In [13]:
# Flatten to one row per (variant, sample) and key by variant + family so the
# proband rows can later be joined against per-family parental tags.
Tb_rett_SNP_HQdnv1 = rett_SNP_HQdnv1.entries().key_by('locus', 'alleles', 'fam')

2020-01-29 13:50:15 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'




> - **Indel**
    - Step 1 va_QUAL  65.780  0.99522  0.25581
    - Step 2 va_SOR  -2.964  0.99501  0.23958
    - Step 3 va_DP  101.000  0.99508  0.21190
    - Step 4 va_QD   2.330  0.99515  0.24651
    - Step 5 va_MQ  27.960  0.99512  0.24691
    - Step 6 g_AB   0.129  0.99510  0.18699
    - Step 7 va_GQM  25.000  0.99034  0.33
    - Step 8 va_AN  50.000  0.99351  0.23529
    - Step 9 g_GQ  69.000  0.99079  0.28333
    - Step 10 va_MQRankSum   -1.674  0.99071  0.26531
    - Step 11 g_DP    8.000  0.99580  0.27027
    - Step 12 va_FS -100.210  1.0000  0.03571
    - Step 13 va_BaseQRankSum -2.799  0.9999   0.0
    


In [14]:
## Indel, ROCube thresholds
indel1_mt = rett_DNV_Indel.filter_cols(rett_DNV_Indel.role == '1')  # probands only

# Site-level cut-offs (row fields); all of them must hold.
indel1_info = indel1_mt.info
indel1_site_ok = (
    (indel1_mt.qual >= 65.780)
    & (indel1_info.SOR <= 2.964)
    & (indel1_info.DP >= 101.000)
    & (indel1_info.QD >= 2.330)
    & (indel1_info.MQ >= 27.960)
    & (indel1_mt.variant_qc.gq_stats.mean >= 25.000)
    & (indel1_info.AN >= 50.000)
    & (indel1_info.MQRankSum >= -1.674)
    & (indel1_info.FS <= 100.210)
    & (indel1_info.BaseQRankSum >= -2.799)
)
indel1_mt = indel1_mt.filter_rows(indel1_site_ok)

# Genotype-level cut-offs: GQ, DP and allele balance (smallest AD[i] / DP fraction).
indel1_mt = indel1_mt.annotate_entries(AB = hl.min(indel1_mt.AD.map(lambda depth: depth / indel1_mt.DP)))
indel1_call_ok = (
    (indel1_mt.GQ >= 69)
    & (indel1_mt.DP >= 8)
    & (indel1_mt.AB >= 0.129)
    & (indel1_mt.AB <= 0.871)
)
indel1_mt = indel1_mt.filter_entries(indel1_call_ok)

# Per-sample QC (n_het, ...) computed on the entries that survived.
rett_Indel_HQdnv1 = hl.sample_qc(indel1_mt)

In [15]:
rett_Indel_HQdnv1.sample_qc.n_het.show(9)

2019-12-29 12:11:46 Hail: INFO: Coerced sorted dataset
2019-12-29 12:12:03 Hail: INFO: Coerced sorted dataset
2019-12-29 12:12:48 Hail: INFO: Coerced sorted dataset
2019-12-29 12:13:05 Hail: INFO: Coerced sorted dataset
2019-12-29 12:13:21 Hail: INFO: Coerced sorted dataset
2019-12-29 12:13:55 Hail: INFO: Coerced sorted dataset
2019-12-29 12:16:42 Hail: INFO: Coerced sorted dataset


s,Unnamed: 1_level_0
str,int64
"""TWGS1_1""",62
"""TWGS2_1""",33
"""TWGS3_1""",39
"""TWGS4_1""",44
"""TWGS5_1""",47
"""TWGS6_1""",62
"""TWGS7_1""",47
"""TWGS8_1""",40
"""TWGS9_1""",41


In [15]:
# Flatten to one row per (variant, sample), keyed by variant + family.
Tb_rett_Indel_HQdnv1 = rett_Indel_HQdnv1.entries().key_by('locus', 'alleles', 'fam')

#### (방법 2 슬라이드 기준)

> - **Heterozygous SNP**
    - Step 1; Genotype Quality (GQ);≥99;99.1%;74.3%
    - Step 2; Quality by depth (QD);≥3;98.8%;88.6%
    - Step 3;Strand Bias Odd Ratio (SOR);≤2.5;98.6%;90.8%
    - Step 4;Allele Depth (DP);≥10;98.4%;92.5%
    - Step 5;ReadPosRankSum;≥-1.4;98.2%;94.0%
    - Step 6;Mean genotype quality (GQ_Mean);≥50;97.3%;96.3%
    - Step 7;MQRankSum;≥-1.7;96.7%;96.8%
    - Step 8;Allele balance (AB);≥0.24; ≤0.76;95.0%;98.3%

In [16]:
## SNP, slide criteria
snp2_mt = rett_DNV_SNP.filter_cols(rett_DNV_SNP.role == '1')  # probands only

# Site-level cut-offs (row fields).
snp2_site_ok = (
    (snp2_mt.info.QD >= 3)
    & (snp2_mt.info.SOR <= 2.5)
    & (snp2_mt.info.ReadPosRankSum >= -1.4)
    & (snp2_mt.variant_qc.gq_stats.mean >= 50)
    & (snp2_mt.info.MQRankSum >= -1.7)
)
snp2_mt = snp2_mt.filter_rows(snp2_site_ok)

# Allele balance is the smallest AD[i] / DP fraction for het calls and 0 otherwise,
# so non-het calls fail the AB window below.
snp2_min_fraction = hl.min(snp2_mt.AD.map(lambda depth: depth / snp2_mt.DP))
snp2_mt = snp2_mt.annotate_entries(AB = hl.cond(snp2_mt.GT.is_het(), snp2_min_fraction, 0))

# Genotype-level cut-offs.
snp2_call_ok = (
    (snp2_mt.GQ >= 99)
    & (snp2_mt.DP >= 10)
    & (snp2_mt.AB >= 0.24)
    & (snp2_mt.AB <= 0.76)
)
snp2_mt = snp2_mt.filter_entries(snp2_call_ok)

rett_SNP_HQdnv2 = hl.sample_qc(snp2_mt)

In [21]:
rett_SNP_HQdnv2.sample_qc.n_het.show(9)

2019-12-26 10:01:06 Hail: INFO: Coerced sorted dataset
2019-12-26 10:01:23 Hail: INFO: Coerced sorted dataset
2019-12-26 10:02:07 Hail: INFO: Coerced sorted dataset
2019-12-26 10:02:23 Hail: INFO: Coerced sorted dataset
2019-12-26 10:02:40 Hail: INFO: Coerced sorted dataset
2019-12-26 10:03:15 Hail: INFO: Coerced sorted dataset
2019-12-26 10:06:05 Hail: INFO: Coerced sorted dataset


s,Unnamed: 1_level_0
str,int64
"""TWGS1_1""",63
"""TWGS2_1""",61
"""TWGS3_1""",58
"""TWGS4_1""",79
"""TWGS5_1""",67
"""TWGS6_1""",66
"""TWGS7_1""",72
"""TWGS8_1""",61
"""TWGS9_1""",62


In [17]:
# Flatten to one row per (variant, sample), keyed by variant + family.
Tb_rett_SNP_HQdnv2 = rett_SNP_HQdnv2.entries().key_by('locus', 'alleles', 'fam')

> - **Heterozygous Indel**
    - Step 1;Genotype Quality (GQ);≥99;98.5%;56.1%
    - Step 2;Quality by depth (QD);≥4;97.6%;73.3%
    - Step 3;Strand Bias Odd Ratio (SOR);≤3;97.3%;79.4%
    - Step 4;Allele Depth (DP);≥10;95.9%;84.3%
    - Step 5;ReadPosRankSum;≥-1.7;94.9%;90.9%
    - Step 6;Mean genotype quality (GQ_Mean);≥50;93.2%;94.0%
    - Step 7;Allele balance (AB);≥0.20; ≤0.80;85.2%;97.6%



In [18]:
## Indel, slide criteria
indel2_mt = rett_DNV_Indel.filter_cols(rett_DNV_Indel.role == '1')  # probands only

# Site-level cut-offs (row fields).
indel2_site_ok = (
    (indel2_mt.info.QD >= 4)
    & (indel2_mt.info.SOR <= 3)
    & (indel2_mt.info.ReadPosRankSum >= -1.7)
    & (indel2_mt.variant_qc.gq_stats.mean >= 50)
)
indel2_mt = indel2_mt.filter_rows(indel2_site_ok)

# Allele balance is the smallest AD[i] / DP fraction for het calls and 0 otherwise,
# so non-het calls fail the AB window below.
indel2_min_fraction = hl.min(indel2_mt.AD.map(lambda depth: depth / indel2_mt.DP))
indel2_mt = indel2_mt.annotate_entries(AB = hl.cond(indel2_mt.GT.is_het(), indel2_min_fraction, 0))

# Genotype-level cut-offs.
indel2_call_ok = (
    (indel2_mt.GQ >= 99)
    & (indel2_mt.DP >= 10)
    & (indel2_mt.AB >= 0.20)
    & (indel2_mt.AB <= 0.80)
)
indel2_mt = indel2_mt.filter_entries(indel2_call_ok)

rett_Indel_HQdnv2 = hl.sample_qc(indel2_mt)

In [22]:
rett_Indel_HQdnv2.sample_qc.n_het.show(9)

2019-12-26 10:06:22 Hail: INFO: Coerced sorted dataset
2019-12-26 10:06:39 Hail: INFO: Coerced sorted dataset
2019-12-26 10:07:24 Hail: INFO: Coerced sorted dataset
2019-12-26 10:07:41 Hail: INFO: Coerced sorted dataset
2019-12-26 10:07:58 Hail: INFO: Coerced sorted dataset
2019-12-26 10:08:32 Hail: INFO: Coerced sorted dataset
2019-12-26 10:11:22 Hail: INFO: Coerced sorted dataset


s,Unnamed: 1_level_0
str,int64
"""TWGS1_1""",11
"""TWGS2_1""",5
"""TWGS3_1""",18
"""TWGS4_1""",10
"""TWGS5_1""",15
"""TWGS6_1""",17
"""TWGS7_1""",17
"""TWGS8_1""",18
"""TWGS9_1""",12


In [19]:
# Flatten to one row per (variant, sample), keyed by variant + family.
Tb_rett_Indel_HQdnv2 = rett_Indel_HQdnv2.entries().key_by('locus', 'alleles', 'fam')

#### 5. 부모 quality metrics 추가 filtering

In [20]:
# Entries of every sample (no proband-only column filter) at the candidate SNP sites,
# keyed by variant + family.
Tb_rett_DNV_SNP = rett_DNV_SNP.entries().key_by('locus', 'alleles', 'fam')

In [21]:
# Entries of every sample (no proband-only column filter) at the candidate indel sites,
# keyed by variant + family.
Tb_rett_DNV_Indel = rett_DNV_Indel.entries().key_by('locus', 'alleles', 'fam')

In [22]:
## Tables that also include the parents:
## rows of the all-sample entries tables whose (locus, alleles, fam) key is present
## in the corresponding high-quality proband table.
Tb_SNP_1 = Tb_rett_DNV_SNP.semi_join(Tb_rett_SNP_HQdnv1)
Tb_SNP_2 = Tb_rett_DNV_SNP.semi_join(Tb_rett_SNP_HQdnv2)
Tb_Indel_1 = Tb_rett_DNV_Indel.semi_join(Tb_rett_Indel_HQdnv1)
Tb_Indel_2 = Tb_rett_DNV_Indel.semi_join(Tb_rett_Indel_HQdnv2)

---

In [23]:
## SNVs: parents need GQ >= 90 and AB <= 0.05 (the va_MQ >= 60 cut on the proband rows is applied by the caller)
def make_tag_SNP(table, min_parent_gq=90, max_parent_ab=0.05):
    """Tag (locus, alleles, fam) groups whose non-proband rows all pass the SNV parental checks.

    Parameters
    ----------
    table : hail Table
        Entries table that provides the fields ``role``, ``AD``, ``DP``, ``GQ``,
        ``locus``, ``alleles`` and ``fam``. Rows with ``role == '1'`` are ignored.
    min_parent_gq : numeric, default 90
        Minimum genotype quality required of every remaining row in a group.
    max_parent_ab : numeric, default 0.05
        Maximum allele balance (smallest ``AD[i] / DP`` fraction) allowed for
        every remaining row in a group.

    Returns
    -------
    hail Table
        Keyed by (locus, alleles, fam) with boolean fields ``t1`` (GQ check),
        ``t2`` (AB check) and ``t`` (both); only groups where ``t`` is true are kept.
    """
    parents = table.filter(table.role != '1')
    parents = parents.annotate(AB=hl.min(parents.AD.map(lambda depth: depth / parents.DP)))
    # NOTE(review): per the Hail docs hl.agg.all appears to skip missing values, so a
    # parent with a missing GQ or AB would not fail the check - confirm this is intended.
    tag = parents.group_by(parents.locus, parents.alleles, parents.fam).aggregate(
        t1=hl.agg.all(parents.GQ >= min_parent_gq),
        t2=hl.agg.all(parents.AB <= max_parent_ab))
    tag = tag.annotate(t=tag.t1 & tag.t2)
    return tag.filter(tag.t)

## ROCube set: keep the proband SNV rows whose (locus, alleles, fam) key passed the
## parental check, then require site-level MQ >= 60.
tag_SNP_1 = make_tag_SNP(Tb_SNP_1)
Tb_rett_SNP_HQdnv1 = Tb_rett_SNP_HQdnv1.semi_join(tag_SNP_1)
Tb_rett_SNP_HQdnv1 = Tb_rett_SNP_HQdnv1.filter(Tb_rett_SNP_HQdnv1.info.MQ >=60)

In [25]:
Tb_rett_SNP_HQdnv1.group_by(Tb_rett_SNP_HQdnv1.s).aggregate(n = hl.agg.count()).show(9)

2019-12-29 12:20:10 Hail: INFO: Coerced sorted dataset
2019-12-29 12:20:26 Hail: INFO: Coerced sorted dataset
2019-12-29 12:21:08 Hail: INFO: Coerced sorted dataset
2019-12-29 12:21:24 Hail: INFO: Coerced sorted dataset
2019-12-29 12:21:41 Hail: INFO: Coerced sorted dataset
2019-12-29 12:22:15 Hail: INFO: Coerced sorted dataset
2019-12-29 12:24:41 Hail: INFO: Coerced sorted dataset
2019-12-29 12:24:57 Hail: INFO: Coerced sorted dataset
2019-12-29 12:25:14 Hail: INFO: Coerced sorted dataset
2019-12-29 12:25:56 Hail: INFO: Coerced sorted dataset
2019-12-29 12:26:12 Hail: INFO: Coerced sorted dataset
2019-12-29 12:26:28 Hail: INFO: Coerced sorted dataset
2019-12-29 12:27:03 Hail: INFO: Coerced sorted dataset
2019-12-29 12:29:30 Hail: INFO: Coerced sorted dataset
2019-12-29 12:29:46 Hail: INFO: Coerced sorted dataset
2019-12-29 12:30:02 Hail: INFO: Coerced sorted dataset
2019-12-29 12:30:44 Hail: INFO: Coerced sorted dataset
2019-12-29 12:31:00 Hail: INFO: Coerced sorted dataset
2019-12-29

s,n
str,int64
"""TWGS1_1""",12
"""TWGS2_1""",29
"""TWGS3_1""",14
"""TWGS4_1""",30
"""TWGS5_1""",35
"""TWGS6_1""",30
"""TWGS7_1""",19
"""TWGS8_1""",9
"""TWGS9_1""",16


In [24]:
## Slide-criteria set: keep the proband SNV rows whose (locus, alleles, fam) key passed
## the parental check, then require site-level MQ >= 60.
tag_SNP_2 = make_tag_SNP(Tb_SNP_2)
Tb_rett_SNP_HQdnv2 = Tb_rett_SNP_HQdnv2.semi_join(tag_SNP_2)
Tb_rett_SNP_HQdnv2 = Tb_rett_SNP_HQdnv2.filter(Tb_rett_SNP_HQdnv2.info.MQ >=60)

In [27]:
Tb_rett_SNP_HQdnv2.group_by(Tb_rett_SNP_HQdnv2.s).aggregate(n = hl.agg.count()).show(9)

2019-12-29 12:46:45 Hail: INFO: Coerced sorted dataset
2019-12-29 12:47:01 Hail: INFO: Coerced sorted dataset
2019-12-29 12:47:46 Hail: INFO: Coerced sorted dataset
2019-12-29 12:48:02 Hail: INFO: Coerced sorted dataset
2019-12-29 12:48:18 Hail: INFO: Coerced sorted dataset
2019-12-29 12:48:52 Hail: INFO: Coerced sorted dataset
2019-12-29 12:51:22 Hail: INFO: Coerced sorted dataset
2019-12-29 12:51:38 Hail: INFO: Coerced sorted dataset
2019-12-29 12:51:54 Hail: INFO: Coerced sorted dataset
2019-12-29 12:52:37 Hail: INFO: Coerced sorted dataset
2019-12-29 12:52:53 Hail: INFO: Coerced sorted dataset
2019-12-29 12:53:09 Hail: INFO: Coerced sorted dataset
2019-12-29 12:53:43 Hail: INFO: Coerced sorted dataset
2019-12-29 12:56:10 Hail: INFO: Coerced sorted dataset
2019-12-29 12:56:27 Hail: INFO: Coerced sorted dataset
2019-12-29 12:56:43 Hail: INFO: Coerced sorted dataset
2019-12-29 12:57:27 Hail: INFO: Coerced sorted dataset
2019-12-29 12:57:43 Hail: INFO: Coerced sorted dataset
2019-12-29

s,n
str,int64
"""TWGS1_1""",11
"""TWGS2_1""",28
"""TWGS3_1""",13
"""TWGS4_1""",29
"""TWGS5_1""",33
"""TWGS6_1""",28
"""TWGS7_1""",14
"""TWGS8_1""",8
"""TWGS9_1""",13


---

In [25]:
## Indels: parents need DP >= 16 and GQ >= 30; the caller additionally applies
## va_MQ >= 60 and 864 <= Total DP <= 1350 (50 * 27) to the proband rows.
## NOTE(review): the original comment gave the lower bound as "864 (16 * 27)", but
## 16 * 27 = 432 and 864 = 32 * 27 - confirm which lower bound is intended.
def make_tag_Indel(table, min_parent_gq=30, min_parent_dp=16):
    """Tag (locus, alleles, fam) groups whose non-proband rows all pass the indel parental checks.

    Parameters
    ----------
    table : hail Table
        Entries table that provides the fields ``role``, ``GQ``, ``DP``,
        ``locus``, ``alleles`` and ``fam``. Rows with ``role == '1'`` are ignored.
    min_parent_gq : numeric, default 30
        Minimum genotype quality required of every remaining row in a group.
    min_parent_dp : numeric, default 16
        Minimum depth required of every remaining row in a group.

    Returns
    -------
    hail Table
        Keyed by (locus, alleles, fam) with boolean fields ``t1`` (GQ check),
        ``t2`` (DP check) and ``t`` (both); only groups where ``t`` is true are kept.
    """
    parents = table.filter(table.role != '1')
    # NOTE(review): per the Hail docs hl.agg.all appears to skip missing values, so a
    # parent with a missing GQ or DP would not fail the check - confirm this is intended.
    tag = parents.group_by(parents.locus, parents.alleles, parents.fam).aggregate(
        t1=hl.agg.all(parents.GQ >= min_parent_gq),
        t2=hl.agg.all(parents.DP >= min_parent_dp))
    tag = tag.annotate(t=tag.t1 & tag.t2)
    return tag.filter(tag.t)

## ROCube set: keep the proband indel rows whose (locus, alleles, fam) key passed the
## parental check, then require site-level MQ >= 60 and 864 <= Total_DP <= 1350.
tag_Indel_1 = make_tag_Indel(Tb_Indel_1)
Tb_rett_Indel_HQdnv1 = Tb_rett_Indel_HQdnv1.semi_join(tag_Indel_1)
Tb_rett_Indel_HQdnv1 = Tb_rett_Indel_HQdnv1.filter((Tb_rett_Indel_HQdnv1.info.MQ >=60)&
                                                   (Tb_rett_Indel_HQdnv1.Total_DP>= 864)&
                                                   (Tb_rett_Indel_HQdnv1.Total_DP<=1350))

In [29]:
Tb_rett_Indel_HQdnv1.group_by(Tb_rett_Indel_HQdnv1.s).aggregate(n = hl.agg.count()).show(9)

2019-12-29 13:08:32 Hail: INFO: Coerced sorted dataset
2019-12-29 13:08:49 Hail: INFO: Coerced sorted dataset
2019-12-29 13:09:34 Hail: INFO: Coerced sorted dataset
2019-12-29 13:09:50 Hail: INFO: Coerced sorted dataset
2019-12-29 13:10:07 Hail: INFO: Coerced sorted dataset
2019-12-29 13:10:41 Hail: INFO: Coerced sorted dataset
2019-12-29 13:13:13 Hail: INFO: Coerced sorted dataset
2019-12-29 13:13:29 Hail: INFO: Coerced sorted dataset
2019-12-29 13:13:45 Hail: INFO: Coerced sorted dataset
2019-12-29 13:14:22 Hail: INFO: Coerced sorted dataset
2019-12-29 13:14:39 Hail: INFO: Coerced sorted dataset
2019-12-29 13:14:55 Hail: INFO: Coerced sorted dataset
2019-12-29 13:15:29 Hail: INFO: Coerced sorted dataset
2019-12-29 13:17:51 Hail: INFO: Coerced sorted dataset
2019-12-29 13:18:07 Hail: INFO: Coerced sorted dataset
2019-12-29 13:18:23 Hail: INFO: Coerced sorted dataset
2019-12-29 13:19:08 Hail: INFO: Coerced sorted dataset
2019-12-29 13:19:24 Hail: INFO: Coerced sorted dataset
2019-12-29

s,n
str,int64
"""TWGS1_1""",10
"""TWGS2_1""",6
"""TWGS3_1""",5
"""TWGS4_1""",6
"""TWGS5_1""",6
"""TWGS6_1""",9
"""TWGS7_1""",11
"""TWGS8_1""",12
"""TWGS9_1""",8


In [26]:
## Slide-criteria set: keep the proband indel rows whose (locus, alleles, fam) key passed
## the parental check, then require site-level MQ >= 60 and 864 <= Total_DP <= 1350.
tag_Indel_2 = make_tag_Indel(Tb_Indel_2)
Tb_rett_Indel_HQdnv2 = Tb_rett_Indel_HQdnv2.semi_join(tag_Indel_2)
Tb_rett_Indel_HQdnv2 = Tb_rett_Indel_HQdnv2.filter((Tb_rett_Indel_HQdnv2.info.MQ >=60)&
                                                   (Tb_rett_Indel_HQdnv2.Total_DP>= 864)&
                                                   (Tb_rett_Indel_HQdnv2.Total_DP<=1350))

In [31]:
Tb_rett_Indel_HQdnv2.group_by(Tb_rett_Indel_HQdnv2.s).aggregate(n = hl.agg.count()).show(9)

2019-12-29 13:29:40 Hail: INFO: Coerced sorted dataset
2019-12-29 13:29:56 Hail: INFO: Coerced sorted dataset
2019-12-29 13:30:40 Hail: INFO: Coerced sorted dataset
2019-12-29 13:30:57 Hail: INFO: Coerced sorted dataset
2019-12-29 13:31:13 Hail: INFO: Coerced sorted dataset
2019-12-29 13:31:47 Hail: INFO: Coerced sorted dataset
2019-12-29 13:34:16 Hail: INFO: Coerced sorted dataset
2019-12-29 13:34:32 Hail: INFO: Coerced sorted dataset
2019-12-29 13:34:49 Hail: INFO: Coerced sorted dataset
2019-12-29 13:35:27 Hail: INFO: Coerced sorted dataset
2019-12-29 13:35:44 Hail: INFO: Coerced sorted dataset
2019-12-29 13:36:01 Hail: INFO: Coerced sorted dataset
2019-12-29 13:36:36 Hail: INFO: Coerced sorted dataset
2019-12-29 13:39:01 Hail: INFO: Coerced sorted dataset
2019-12-29 13:39:18 Hail: INFO: Coerced sorted dataset
2019-12-29 13:39:34 Hail: INFO: Coerced sorted dataset
2019-12-29 13:40:17 Hail: INFO: Coerced sorted dataset
2019-12-29 13:40:34 Hail: INFO: Coerced sorted dataset
2019-12-29

s,n
str,int64
"""TWGS1_1""",4
"""TWGS2_1""",1
"""TWGS3_1""",3
"""TWGS4_1""",2
"""TWGS5_1""",4
"""TWGS6_1""",1
"""TWGS7_1""",6
"""TWGS8_1""",4
"""TWGS9_1""",4


#### 6. DNAse HS peak filtering

In [2]:
# The BED file names contigs 'chr1'..'chr22', 'chrX', 'chrY'; recode them to the bare
# names ('1'..'22', 'X', 'Y') so the intervals parse against the GRCh37 reference.
dnase_contig_recoding = {f'chr{i}':str(i) for i in [*range(1,23), 'X', 'Y']}
DNAse_HS_bed = hl.import_bed('/home/titan/Hail/rett/E082-DNase.hotspot.fdr0.01.broad.bed',
                             contig_recoding=dnase_contig_recoding)


2020-01-28 13:45:05 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)
  Loading column 'f2' as type 'int32' (user-specified)
  Loading column 'f3' as type 'str' (user-specified)
  Loading column 'f4' as type 'str' (user-specified)



In [66]:
DNAse_HS_bed.show()

interval,target
interval<locus<GRCh37>>,str
[1:10148-1:10317),"""."""
[1:57223-1:57409),"""."""
[1:235673-1:236034),"""."""
[1:237719-1:237785),"""."""
[1:521559-1:521614),"""."""
[1:564619-1:564689),"""."""
[1:568038-1:568090),"""."""
[1:569891-1:569962),"""."""
[1:713909-1:714345),"""."""
[1:762785-1:763144),"""."""


In [13]:
{f'chr{i}':str(i) for i in [*range(1,23), 'X', 'Y', 'M']}

{'chr1': '1',
 'chr2': '2',
 'chr3': '3',
 'chr4': '4',
 'chr5': '5',
 'chr6': '6',
 'chr7': '7',
 'chr8': '8',
 'chr9': '9',
 'chr10': '10',
 'chr11': '11',
 'chr12': '12',
 'chr13': '13',
 'chr14': '14',
 'chr15': '15',
 'chr16': '16',
 'chr17': '17',
 'chr18': '18',
 'chr19': '19',
 'chr20': '20',
 'chr21': '21',
 'chr22': '22',
 'chrX': 'X',
 'chrY': 'Y',
 'chrM': 'M'}

In [27]:
# bed file is a Table (only contains global and row fields)
adrenal_1 = hl.import_bed('/home/titan/Hail/rett/adrenal_gland_fetal/GSM1027310_UW.Fetal_Adrenal_Gland.ChromatinAccessibility.H-24272.DNase.DS19395.bed',
                            skip_invalid_intervals=True, contig_recoding={'chr1':'1', 'chr2':'2', 'chr3':'3','chr4':'4', 'chr5':'5', 'chr6':'6','chr7':'7', 'chr8':'8', 'chr9':'9', 'chr10':'10', 'chr11':'11', 'chr12':'12', 'chr13':'13','chr14':'14', 'chr15':'15', 'chr16':'16','chr17':'17', 'chr18':'18', 'chr19':'19','chr20':'20','chr21':'21', 'chr22':'22', 'chrX':'X', 'chrY':'Y', 'chrM':'MT'})
# import chr1 -> 1  , chrM -> MT (mitochondria)
# skip out of range MT peak

2020-01-29 13:50:56 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)
  Loading column 'f2' as type 'int32' (user-specified)
  Loading column 'f3' as type 'str' (user-specified)
  Loading column 'f4' as type 'str' (user-specified)
  Loading column 'f5' as type 'str' (type not specified)



In [24]:
adrenal_1.show()

interval,target
interval<locus<GRCh37>>,str
[1:10001-1:10035),"""HWI-ST700693:197:D0A3LACXX:1:2203:9186:125677"""
[1:10151-1:10186),"""HWI-ST700693:197:D0A3LACXX:1:2301:12999:45890"""
[1:10152-1:10187),"""HWI-ST700693:197:D0A3LACXX:1:1102:2955:22648"""
[1:10156-1:10188),"""HWI-ST700693:197:D0A3LACXX:1:2303:10322:103885"""
[1:10157-1:10191),"""HWI-ST700693:197:D0A3LACXX:1:1207:13529:73916"""
[1:10158-1:10193),"""HWI-ST700693:197:D0A3LACXX:1:2308:4356:12656"""
[1:10161-1:10196),"""HWI-ST700693:117:C049MABXX:2:1106:20309:175894"""
[1:10232-1:10267),"""HWI-ST700693:117:C049MABXX:2:2202:8480:52407"""
[1:10233-1:10267),"""HWI-ST700693:197:D0A3LACXX:1:1306:16923:129689"""
[1:10234-1:10269),"""HWI-ST700693:117:C049MABXX:2:1105:17485:170755"""


In [63]:
hl.eval(hl.is_valid_contig('chrM', 'GRCh38'))

True

#### liftover(hg19 --> hg38)

In [12]:
# List the GRCh37 contig names. The reference is looked up here because the name
# `rg37` is only bound in a later cell, so `rg37.contigs` fails on a top-to-bottom run.
hl.get_reference('GRCh37').contigs

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X',
 'Y',
 'MT',
 'GL000207.1',
 'GL000226.1',
 'GL000229.1',
 'GL000231.1',
 'GL000210.1',
 'GL000239.1',
 'GL000235.1',
 'GL000201.1',
 'GL000247.1',
 'GL000245.1',
 'GL000197.1',
 'GL000203.1',
 'GL000246.1',
 'GL000249.1',
 'GL000196.1',
 'GL000248.1',
 'GL000244.1',
 'GL000238.1',
 'GL000202.1',
 'GL000234.1',
 'GL000232.1',
 'GL000206.1',
 'GL000240.1',
 'GL000236.1',
 'GL000241.1',
 'GL000243.1',
 'GL000242.1',
 'GL000230.1',
 'GL000237.1',
 'GL000233.1',
 'GL000204.1',
 'GL000198.1',
 'GL000208.1',
 'GL000191.1',
 'GL000227.1',
 'GL000228.1',
 'GL000214.1',
 'GL000221.1',
 'GL000209.1',
 'GL000218.1',
 'GL000220.1',
 'GL000213.1',
 'GL000211.1',
 'GL000199.1',
 'GL000217.1',
 'GL000216.1',
 'GL000215.1',
 'GL000205.1',
 'GL000219.1',
 'GL000224.1',
 'GL000223.1',
 'GL000195.1',
 'GL000212.1',
 'GL000222.1',
 'GL000200.1',
 'GL000193.

In [28]:
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
# Register the GRCh37 -> GRCh38 chain file only once: adding a liftover that is already
# registered is rejected by Hail, which would make this cell fail when it is re-run.
if not rg37.has_liftover(rg38):
    rg37.add_liftover('/home/titan/resources/grch37_to_grch38.over.chain.gz', rg38)

In [5]:
# Lift the GRCh37 hotspot intervals over to GRCh38, drop the ones that could not be
# lifted (liftover yields a missing value), and re-key the table by the lifted interval.
dnase_lifted = DNAse_HS_bed.annotate(new_interval=hl.liftover(DNAse_HS_bed.interval, 'GRCh38'))
dnase_lifted = dnase_lifted.filter(hl.is_defined(dnase_lifted.new_interval))
DNAse_HS_bed = dnase_lifted.key_by(interval=dnase_lifted.new_interval)

In [52]:
DNAse_HS_bed.show()

interval,target,new_interval
interval<locus<GRCh38>>,str,interval<locus<GRCh38>>
[chr1:10148-chr1:10316],""".""",[chr1:10148-chr1:10316]
[chr1:57223-chr1:57408],""".""",[chr1:57223-chr1:57408]
[chr1:265922-chr1:266282],""".""",[chr1:265922-chr1:266282]
[chr1:267968-chr1:268033],""".""",[chr1:267968-chr1:268033]
[chr1:586179-chr1:586233],""".""",[chr1:586179-chr1:586233]
[chr1:629239-chr1:629308],""".""",[chr1:629239-chr1:629308]
[chr1:632658-chr1:632709],""".""",[chr1:632658-chr1:632709]
[chr1:634511-chr1:634581],""".""",[chr1:634511-chr1:634581]
[chr1:778529-chr1:778964],""".""",[chr1:778529-chr1:778964]
[chr1:827405-chr1:827763],""".""",[chr1:827405-chr1:827763]


In [29]:
# Lift the GRCh37 intervals over to GRCh38, drop the ones that could not be lifted
# (liftover yields a missing value), and re-key the table by the lifted interval.
adrenal_lifted = adrenal_1.annotate(new_interval=hl.liftover(adrenal_1.interval, 'GRCh38'))
adrenal_lifted = adrenal_lifted.filter(hl.is_defined(adrenal_lifted.new_interval))
adrenal_1 = adrenal_lifted.key_by(interval=adrenal_lifted.new_interval)

In [26]:
adrenal_1.show()

interval,target,new_interval
interval<locus<GRCh38>>,str,interval<locus<GRCh38>>
[chr1:10001-chr1:10034],"""HWI-ST700693:197:D0A3LACXX:1:2203:9186:125677""",[chr1:10001-chr1:10034]
[chr1:10151-chr1:10185],"""HWI-ST700693:197:D0A3LACXX:1:2301:12999:45890""",[chr1:10151-chr1:10185]
[chr1:10152-chr1:10186],"""HWI-ST700693:197:D0A3LACXX:1:1102:2955:22648""",[chr1:10152-chr1:10186]
[chr1:10156-chr1:10187],"""HWI-ST700693:197:D0A3LACXX:1:2303:10322:103885""",[chr1:10156-chr1:10187]
[chr1:10157-chr1:10190],"""HWI-ST700693:197:D0A3LACXX:1:1207:13529:73916""",[chr1:10157-chr1:10190]
[chr1:10158-chr1:10192],"""HWI-ST700693:197:D0A3LACXX:1:2308:4356:12656""",[chr1:10158-chr1:10192]
[chr1:10161-chr1:10195],"""HWI-ST700693:117:C049MABXX:2:1106:20309:175894""",[chr1:10161-chr1:10195]
[chr1:10232-chr1:10266],"""HWI-ST700693:117:C049MABXX:2:2202:8480:52407""",[chr1:10232-chr1:10266]
[chr1:10233-chr1:10266],"""HWI-ST700693:197:D0A3LACXX:1:1306:16923:129689""",[chr1:10233-chr1:10266]
[chr1:10234-chr1:10268],"""HWI-ST700693:117:C049MABXX:2:1105:17485:170755""",[chr1:10234-chr1:10268]


#### filtering

In [52]:
# Keep only the rows whose locus falls inside an interval of DNAse_HS_bed
# (the interval lookup is defined only when such an interval exists).
# Note that each table is overwritten in place with its filtered version.
Tb_rett_SNP_HQdnv1 = Tb_rett_SNP_HQdnv1.filter(hl.is_defined(DNAse_HS_bed[Tb_rett_SNP_HQdnv1.locus]))
Tb_rett_SNP_HQdnv2 = Tb_rett_SNP_HQdnv2.filter(hl.is_defined(DNAse_HS_bed[Tb_rett_SNP_HQdnv2.locus]))
Tb_rett_Indel_HQdnv1 = Tb_rett_Indel_HQdnv1.filter(hl.is_defined(DNAse_HS_bed[Tb_rett_Indel_HQdnv1.locus]))
Tb_rett_Indel_HQdnv2 = Tb_rett_Indel_HQdnv2.filter(hl.is_defined(DNAse_HS_bed[Tb_rett_Indel_HQdnv2.locus]))

In [37]:
Tb_rett_SNP_HQdnv1.group_by(Tb_rett_SNP_HQdnv1.s).aggregate(n = hl.agg.count()).show(9)

2020-01-22 14:50:39 Hail: INFO: Coerced sorted dataset
2020-01-22 14:51:10 Hail: INFO: Coerced sorted dataset
2020-01-22 14:52:20 Hail: INFO: Coerced sorted dataset
2020-01-22 14:52:50 Hail: INFO: Coerced sorted dataset
2020-01-22 14:53:20 Hail: INFO: Coerced sorted dataset
2020-01-22 14:54:26 Hail: INFO: Coerced sorted dataset
2020-01-22 14:59:14 Hail: INFO: Coerced sorted dataset
2020-01-22 14:59:45 Hail: INFO: Coerced sorted dataset
2020-01-22 15:00:17 Hail: INFO: Coerced sorted dataset
2020-01-22 15:01:38 Hail: INFO: Coerced sorted dataset
2020-01-22 15:02:09 Hail: INFO: Coerced sorted dataset
2020-01-22 15:02:42 Hail: INFO: Coerced sorted dataset
2020-01-22 15:03:48 Hail: INFO: Coerced sorted dataset
2020-01-22 15:08:26 Hail: INFO: Coerced sorted dataset
2020-01-22 15:08:57 Hail: INFO: Coerced sorted dataset
2020-01-22 15:09:29 Hail: INFO: Coerced sorted dataset
2020-01-22 15:10:50 Hail: INFO: Coerced sorted dataset
2020-01-22 15:11:20 Hail: INFO: Coerced sorted dataset
2020-01-22

s,n
str,int64
"""TWGS1_1""",1
"""TWGS2_1""",1
"""TWGS4_1""",1
"""TWGS5_1""",2
"""TWGS8_1""",1
"""TWGS9_1""",1


In [30]:
# Keep only the rows whose locus falls inside an interval of adrenal_1; results go to new *_adrenal_1 tables.
# NOTE(review): in top-to-bottom order the input tables have already been overwritten by the
# DNAse_HS_bed filter above, so these would be the intersection of both filters - confirm intended.
Tb_rett_SNP_HQdnv1_adrenal_1 = Tb_rett_SNP_HQdnv1.filter(hl.is_defined(adrenal_1[Tb_rett_SNP_HQdnv1.locus]))
Tb_rett_SNP_HQdnv2_adrenal_1 = Tb_rett_SNP_HQdnv2.filter(hl.is_defined(adrenal_1[Tb_rett_SNP_HQdnv2.locus]))
Tb_rett_Indel_HQdnv1_adrenal_1 = Tb_rett_Indel_HQdnv1.filter(hl.is_defined(adrenal_1[Tb_rett_Indel_HQdnv1.locus]))
Tb_rett_Indel_HQdnv2_adrenal_1 = Tb_rett_Indel_HQdnv2.filter(hl.is_defined(adrenal_1[Tb_rett_Indel_HQdnv2.locus]))

In [34]:
Tb_rett_SNP_HQdnv1_adrenal_1.group_by(Tb_rett_SNP_HQdnv1_adrenal_1.s).aggregate(n = hl.agg.count()).show(9)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42519)
Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:42519)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42519)
Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:42519)

#### 7. CSQ annotation
: alternative transcripts 고려


In [32]:
import gzip

# Recover the names of the '|'-separated VEP CSQ sub-fields from the VCF header.
# The header line is expected to look like (standard VEP layout assumed — TODO confirm):
#   ##INFO=<ID=CSQ,...,Description="... Format: Allele|Consequence|...|gnomADg_AF">
csq_header = None
with gzip.open('/home/titan/Hail/rett/191102-rett-wgs.vep_tx_191206.vcf.bgz', 'rt') as f:
    for l in f:
        if 'ID=CSQ' in l:
            csq_header = l.rstrip('\r\n')
            break
        if not l.startswith('#'):
            # The header is over; do not scan the whole VCF body for a line that cannot be there.
            break

if csq_header is None:
    # Fail with a clear message instead of a NameError on `temp` further down.
    raise ValueError("no 'ID=CSQ' line found in the VCF header")

temp = csq_header.split('|')
# The first piece still carries the INFO preamble and the last piece the closing '">'.
# Trim both by content rather than overwriting fixed indices, so the cell keeps
# working when the number of VEP fields changes.
temp[0] = temp[0].split('Format:')[-1].strip()
temp[-1] = temp[-1].rstrip('">')

print(temp)
#print(temp.index('Consequence'))

['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type', 'Feature', 'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS', 'VARIANT_CLASS', 'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'MANE', 'TSL', 'APPRIS', 'CCDS', 'ENSP', 'SWISSPROT', 'TREMBL', 'UNIPARC', 'SOURCE', 'GENE_PHENO', 'NEAREST', 'SIFT', 'PolyPhen', 'DOMAINS', 'miRNA', 'HGVS_OFFSET', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'AA_AF', 'EA_AF', 'gnomAD_AF', 'gnomAD_AFR_AF', 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnomAD_EAS_AF', 'gnomAD_FIN_AF', 'gnomAD_NFE_AF', 'gnomAD_OTH_AF', 'gnomAD_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'CLIN_SIG', 'SOMATIC', 'PHENO', 'PUBMED', 'MOTIF_NAME', 'MOTIF_POS', 'HIGH_INF_POS', 'MOTIF_SCORE_CHANGE', 'MPC', 'gnomADg', 'gnomADg_AF']


In [35]:
# VEP / Sequence Ontology consequence terms, kept in the order Ensembl lists them
# (most severe first). Two misspelled terms were corrected so they can match VEP output:
#   'transcipt_amplification' -> 'transcript_amplification'
#   'feature_enlogation'      -> 'feature_elongation'
ref = [
    'transcript_ablation',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'stop_gained',
    'frameshift_variant',
    'stop_lost',
    'start_lost',
    'transcript_amplification',
    'inframe_insertion',
    'inframe_deletion',
    'missense_variant',
    'protein_altering_variant',
    'splice_region_variant',
    'incomplete_terminal_codon_variant',
    'start_retained_variant',
    'stop_retained_variant',
    'synonymous_variant',
    'coding_sequence_variant',
    'mature_miRNA_variant',
    '5_prime_UTR_variant',
    '3_prime_UTR_variant',
    'non_coding_transcript_exon_variant',
    'intron_variant',
    'NMD_transcript_variant',
    'non_coding_transcript_variant',
    'upstream_gene_variant',
    'downstream_gene_variant',
    'TFBS_ablation',
    'TFBS_amplification',
    'TF_binding_site_variant',
    'regulatory_region_ablation',
    'regulatory_region_amplification',
    'feature_elongation',
    'regulatory_region_variant',
    'feature_truncation',
    'intergenic_variant',
]
# Downstream cells use `ref` as a Hail array expression, so the name is rebound here.
ref = hl.array(ref)

In [31]:
def CSQ(table):
    """Add a `variant` identifier and flatten the `info.CSQ` annotation of a Hail table.

    Parameters
    ----------
    table : Hail Table
        Must expose the fields read below: `locus`, `alleles` and `info.CSQ`.

    Returns
    -------
    Hail Table
        The input with two extra fields:
        `variant` - "contig:position:ref:alt", with any "chr" removed from the contig
        and only the first alternate allele used;
        `csq` - a single string, because the table is exploded twice (see comments).
    """
    t = table
    # e.g. locus chr1:12345 with alleles [A, T] becomes "1:12345:A:T".
    t = t.annotate(variant = hl.delimit(hl.array([t.locus.contig.replace("chr", ""),
                                                  hl.str(t.locus.position),
                                                  t.alleles[0],
                                                  t.alleles[1]]), ":"))
    # First explode: one row per element of the info.CSQ array.
    t = t.annotate(CSQ = t.info.CSQ).explode('CSQ')
    # Hail's split takes a regex, so the pipe must be escaped; a raw string avoids
    # Python's invalid-escape-sequence warning for the backslash.
    # Second explode: one row per individual '|'-separated sub-field value.
    # NOTE(review): if one row per CSQ entry is wanted, the final explode should go —
    # confirm against the downstream consumer of the exported TSVs.
    t = t.transmute(csq = t.CSQ.split(r'\|')).explode('csq')
    return t

In [None]:
# before the loop, run it just once for liftover
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
# Registering the same chain file twice raises, so guard the call to keep this cell re-runnable.
if not rg37.has_liftover(rg38):
    rg37.add_liftover('/home/titan/resources/grch37_to_grch38.over.chain.gz', rg38)

# CSQ annotation function -> used in DHS_filter function
# NOTE(review): this notebook defines CSQ in an earlier cell as well; keep a single copy.
def CSQ(table):
    """Add a `variant` identifier and flatten the `info.CSQ` annotation of a Hail table.

    Parameters
    ----------
    table : Hail Table
        Must expose the fields read below: `locus`, `alleles` and `info.CSQ`.

    Returns
    -------
    Hail Table
        The input with two extra fields:
        `variant` - "contig:position:ref:alt", with any "chr" removed from the contig
        and only the first alternate allele used;
        `csq` - a single string, because the table is exploded twice (see comments).
    """
    t = table
    # e.g. locus chr1:12345 with alleles [A, T] becomes "1:12345:A:T".
    t = t.annotate(variant = hl.delimit(hl.array([t.locus.contig.replace("chr", ""),
                                                  hl.str(t.locus.position),
                                                  t.alleles[0],
                                                  t.alleles[1]]), ":"))
    # First explode: one row per element of the info.CSQ array.
    t = t.annotate(CSQ = t.info.CSQ).explode('CSQ')
    # Hail's split takes a regex, so the pipe must be escaped; a raw string avoids
    # Python's invalid-escape-sequence warning for the backslash.
    # Second explode: one row per individual '|'-separated sub-field value.
    # NOTE(review): if one row per CSQ entry is wanted, the final explode should go —
    # confirm against the downstream consumer of the exported TSVs.
    t = t.transmute(csq = t.CSQ.split(r'\|')).explode('csq')
    return t

# DHS peak filtering function
# f: str, path of the BED file / example: '/home/titan/Hail/rett/resources/adrenal_gland_fetal/GSM1027310_UW.Fetal_Adrenal_Gland.ChromatinAccessibility.H-24272.DNase.DS19395.bed'
# A `resources` directory has to exist under rett, and rett/tables needs an output directory per tissue.
# include this function at the loop
def DHS_filter(f, Tb_rett_SNP_HQdnv1, Tb_rett_SNP_HQdnv2, Tb_rett_Indel_HQdnv1, Tb_rett_Indel_HQdnv2):
    """Restrict four variant tables to the intervals of one BED file and export them.

    The BED intervals are lifted over to GRCh38, each table is filtered to loci
    inside a lifted interval, passed through `CSQ`, and written as a TSV under
    /home/titan/Hail/rett/tables/<tissue>/.

    Parameters
    ----------
    f : str
        '/'-separated path of the BED file. The last two components are taken as
        the tissue directory and the file name (see the example above).
    Tb_rett_SNP_HQdnv1, Tb_rett_SNP_HQdnv2, Tb_rett_Indel_HQdnv1, Tb_rett_Indel_HQdnv2 : Hail Table
        Tables with a `locus` field plus whatever `CSQ` requires.

    Returns
    -------
    None
        The function is called for its exports only.
    """
    # The BED contigs are 'chr'-prefixed; recode them to the un-prefixed names
    # (and chrM to MT) before the intervals are parsed.
    contig_recoding = {'chr' + c: c for c in [str(i) for i in range(1, 23)] + ['X', 'Y']}
    contig_recoding['chrM'] = 'MT'
    bed = hl.import_bed(f,
                        skip_invalid_intervals=True,
                        contig_recoding=contig_recoding)

    # Lift the intervals to GRCh38, drop the ones that fail, and re-key on the lifted interval.
    bed = bed.annotate(new_interval = hl.liftover(bed.interval, 'GRCh38'))
    bed = bed.filter(hl.is_defined(bed.new_interval))
    bed = bed.key_by(interval = bed.new_interval)

    # Take tissue and file name from the end of the path instead of fixed positions,
    # so the function does not depend on how deep the resources directory sits.
    tissue, name = f.split('/')[-2:]
    out_prefix = "/home/titan/Hail/rett/tables/" + tissue + '/' + name

    # Every export gets the per-file prefix. Previously only the first one did, so the
    # other three were overwritten on each call and also clobbered the genome-wide TSVs.
    tables = [
        (Tb_rett_SNP_HQdnv1, '.df_SNP_1_csq.tsv'),
        (Tb_rett_SNP_HQdnv2, '.df_SNP_2_csq.tsv'),
        (Tb_rett_Indel_HQdnv1, '.df_Indel_1_csq.tsv'),
        (Tb_rett_Indel_HQdnv2, '.df_Indel_2_csq.tsv'),
    ]
    for tb, suffix in tables:
        in_peak = tb.filter(hl.is_defined(bed[tb.locus]))
        CSQ(in_peak).export(out_prefix + suffix)

    return None

# loop for all 342 bed files


In [35]:
# Apply CSQ to the four tables and rebind each name to its flattened result.
Tb_rett_SNP_HQdnv1, Tb_rett_SNP_HQdnv2, Tb_rett_Indel_HQdnv1, Tb_rett_Indel_HQdnv2 = map(
    CSQ,
    (Tb_rett_SNP_HQdnv1, Tb_rett_SNP_HQdnv2, Tb_rett_Indel_HQdnv1, Tb_rett_Indel_HQdnv2),
)

In [32]:
# Apply CSQ to the four adrenal_1 tables and rebind each name to its flattened result.
(Tb_rett_SNP_HQdnv1_adrenal_1,
 Tb_rett_SNP_HQdnv2_adrenal_1,
 Tb_rett_Indel_HQdnv1_adrenal_1,
 Tb_rett_Indel_HQdnv2_adrenal_1) = map(
    CSQ,
    (Tb_rett_SNP_HQdnv1_adrenal_1,
     Tb_rett_SNP_HQdnv2_adrenal_1,
     Tb_rett_Indel_HQdnv1_adrenal_1,
     Tb_rett_Indel_HQdnv2_adrenal_1),
)

In [39]:
# Write each of the four tables to its own TSV file, in the same order as before.
for _tbl, _tsv_path in (
    (Tb_rett_SNP_HQdnv1, "/home/titan/Hail/rett/tables/df_SNP_1_csq.tsv"),
    (Tb_rett_SNP_HQdnv2, "/home/titan/Hail/rett/tables/df_SNP_2_csq.tsv"),
    (Tb_rett_Indel_HQdnv1, "/home/titan/Hail/rett/tables/df_Indel_1_csq.tsv"),
    (Tb_rett_Indel_HQdnv2, "/home/titan/Hail/rett/tables/df_Indel_2_csq.tsv"),
):
    _tbl.export(_tsv_path)

2020-01-22 16:40:18 Hail: INFO: Coerced sorted dataset
2020-01-22 16:40:31 Hail: INFO: Coerced sorted dataset
2020-01-22 16:41:03 Hail: INFO: Coerced sorted dataset
2020-01-22 16:41:16 Hail: INFO: Coerced sorted dataset
2020-01-22 16:41:29 Hail: INFO: Coerced sorted dataset
2020-01-22 16:41:57 Hail: INFO: Coerced sorted dataset
2020-01-22 16:44:05 Hail: INFO: Coerced sorted dataset
2020-01-22 16:44:18 Hail: INFO: Coerced sorted dataset
2020-01-22 16:44:47 Hail: INFO: Coerced sorted dataset
2020-01-22 16:45:01 Hail: INFO: Coerced sorted dataset
2020-01-22 16:45:14 Hail: INFO: Coerced sorted dataset
2020-01-22 16:45:40 Hail: INFO: Coerced sorted dataset
2020-01-22 16:47:40 Hail: INFO: Coerced sorted dataset
2020-01-22 16:47:54 Hail: INFO: Coerced sorted dataset
2020-01-22 16:48:37 Hail: INFO: Coerced sorted dataset
2020-01-22 16:48:51 Hail: INFO: Coerced sorted dataset
2020-01-22 16:49:04 Hail: INFO: Coerced sorted dataset
2020-01-22 16:49:30 Hail: INFO: Coerced sorted dataset
2020-01-22

2020-01-22 18:15:37 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-01-22 18:22:19 Hail: INFO: merging 137 files totalling 10.2M...
2020-01-22 18:22:19 Hail: INFO: while writing:
    /home/titan/Hail/rett/tables/df_Indel_2_csq.tsv
  merge time: 38.172ms


In [35]:
# Write the adrenal_1 SNP table to a TSV file.
Tb_rett_SNP_HQdnv1_adrenal_1.export("/home/titan/Hail/rett/tables/df_SNP_1_csq_adrenal_1.tsv")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41765)
Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/titan/Downloads/miniconda3/envs/hail/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41765)

In [53]:
import os
# Switch the kernel's working directory so the relative script path used below resolves.
# NOTE(review): absolute, machine-specific path; it differs from the /home/titan paths
# used elsewhere in this notebook — confirm before re-running on another machine.
os.chdir('/Users/lizzychoi/rett_syndrome/scripts/')

In [None]:
!rscript consequences.R