# Code to get tracks set up for CoVid-relevant regulatory architecture
# Macrophage regulatory architecture

# Minimal tracks to prepare 
Track : format - source

    * CTCF : bigwig - Encode CD14+ monocyte 
    * H3K27Ac : bigwig - Blueprint Macrophage 
    * H3k4Me3 : bigwig - Blueprint Macrophage  
    * H3k9Me3 : bigwig - Blueprint Macrophage
    * ATAC-seq/DNAse-seq : bigwig - Blueprint Macrophage 
    * Methylation : bigwig - Blueprint Macrophage
    * Loops : links - hESC
    * Hi-C : cool - Mphage diff paper
    * Genes : genes_bed -  Gencode
    * Repeats : bed - L1Base2
    * Chromatin state : bed - ENCODE/Segway
    * eQTL list : arcs - none
    * GWAS : bed - pvals and bigwig Ellinghaus and Covid19Hg
    * RNA-Seq : bigwig and txt - Macrophage Blueprint, Macrophage Activated Blueprint
    

In [None]:
import pyensembl, os, sys, re, numpy as np
from helper_funcs import *


### Transcription factor and histones

In [1]:
%%bash
#Get Histone marks:

# Get bigwig H3k4me3 fold change over control 
wget -nc -O /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k4me3_hg38.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/ChIP-Seq/NCMLS/S0022IH1.ERX300718.H3K4me3.bwa.GRCh38.20150528.bw"

# Get bigwig H3K27Ac fold change over control
wget -nc -O /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k27Ac_hg38.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/ChIP-Seq/NCMLS/S0022IH1.ERX300726.H3K27ac.bwa.GRCh38.20150528.bw"    

# Get bigwig H3K9Me3 fold change over control
wget -nc -O /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k9me3_hg38.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/ChIP-Seq/NCMLS/S0022IH1.ERX300730.H3K9me3.bwa.GRCh38.20150528.bw"


In [None]:
%%bash

#Use crossmap to flip hg38 bigwigs to hg19
# pip3 install crossmap
# get chain files
wget -nc -O /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz \
    http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
wget -nc -O /input_dir/corona_analysis/annotations/hg19ToHg38.over.chain.gz \
    http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k4me3_hg38.bw \
    /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k4me3_hg19

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k27Ac_hg38.bw \
    /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k27Ac_hg19

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k9me3_hg38.bw \
    /input_dir/corona_analysis/tracks/S0022IH1_macrophage_h3k9me3_hg19


In [None]:
%%bash
#Get Bigwigs:

# Get CTCF chip-seq fold change over control
#Might have to process from source https://www.ncbi.nlm.nih.gov/sra?term=SRX2655461

#ENCODE CD14+ monocyte ctcf
wget -nc --quiet -O /input_dir/corona_analysis/tracks/ENCFF114WQR_cd14_monocyte_hg19_CTCF.bw \
    https://encode-public.s3.amazonaws.com/2017/04/11/c61bd1f7-1379-4ae1-b2b6-605ae1be9b0a/ENCFF114WQR.bigWig
wget -nc --quiet -O /input_dir/corona_analysis/tracks/ENCFF437LHG_cd14_monocyte_hg19_CTCF_peaks.bed.gz \
    https://encode-public.s3.amazonaws.com/2017/04/11/73258d7d-e741-4ed7-86a8-76679735c997/ENCFF437LHG.bed.gz

    
# Get called CTCF peaks
#Raw edgeR output https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96800/suppl/GSE96800_CTCF_peak_edgeR_raw.txt.gz
    

### Chromatin accessibility

In [2]:
%%bash

# Get bigwig average scATAC-seq of monocytes 2 
wget -nc -O /input_dir/corona_analysis/tracks/Mono_2_scATAC_hg19.bw --quiet \
"https://chang-public-data.s3-us-west-1.amazonaws.com/10X_scATAC/Heme/Cluster13.RIP_norm.bw"


In [None]:
%%bash

# Get bigwig DNAse normalized read count 
wget -nc -O /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg38.bw --quiet \
"http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/DNase-Hypersensitivity/NCMLS/S0022I44.ERX616977.Dnase.bwa.GRCh38.20150529.bw"


In [None]:
%%bash

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg38.bw \
    /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg19


In [30]:
%%bash

# Get bed narrowpeak DNAse 
wget -nc -O /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg38.bed.gz --quiet \
"http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/DNase-Hypersensitivity/NCMLS/S0022I44.ERX616977.Dnase.GRCh38.hotspot.20150709.bed.gz"
gunzip /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg38.bed.gz


In [None]:
%%bash

CrossMap.py bed /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg38.bed  \
    /input_dir/corona_analysis/tracks/S0022I44_macrophage_DNAse_hg19.bed


### Chromatin looping

In [None]:
#Got loops from previous hESC paper in:
# /input_dir/corona_analysis/tracks/primed_.7_origami_intra.arcs
#Get loops from Macrophage differentiation paper:
https://www.cell.com/cms/10.1016/j.molcel.2017.08.006/attachment/0a5229f1-46bb-4aae-aa42-33651377e633/mmc3.zip
    

In [None]:
%%bash

#Found dope dataset from Static and Dynamic DNA Loops form AP-1-Bound Activation Hubs during Macrophage Development
# https://www.cell.com/molecular-cell/pdfExtended/S1097-2765(17)30603-2
# https://bcm.app.box.com/v/aidenlab/folder/47796980666

#Flip from Hic to cool
hicConvertFormat -m A_inter.hic --inputFormat hic --outputFormat cool -o Macrophage_HiC_hg19_50kb_50000.cool --resolutions 50000


In [None]:
hicConvertFormat -m A_inter.hic --inputFormat hic --outputFormat h5 -o Macrophage_HiC_hg19.h5 --resolutions 50000


In [None]:
#Get PCHiC from CD4+ T-Cells
https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1285-0#Sec27
    

### Chromatin state

In [None]:
%%bash

#Get monocyte CD14+ chromatin state as predicted by Segway HMM model
wget --quiet -nc https://noble.gs.washington.edu/proj/encyclopedia/interpreted/MONOCYTES-CD14+_RO01746.bed.gz
gunzip MONOCYTES-CD14+_RO01746.bed.gz
sort-bed MONOCYTES-CD14+_RO01746.bed | bgzip > /input_dir/corona_analysis/tracks/Segway_CD14_mono.bed.gz
rm MONOCYTES-CD14+_RO01746.bed
tabix -p bed /input_dir/corona_analysis/tracks/Segway_CD14_mono.bed.gz


In [None]:
#Add for E046 (NK) and E034 (T-Cell)

In [28]:
%%bash
#Get CD14+ monocyte ChromHmm model
wget --quiet -nc -O E124_15_coreMarks_dense.bed.gz \
    https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E124_15_coreMarks_dense.bed.gz
gunzip E124_15_coreMarks_dense.bed.gz
sort-bed E124_15_coreMarks_dense.bed | bgzip > /input_dir/corona_analysis/tracks/E124_15_coreMarks_dense.bed.gz
rm E124_15_coreMarks_dense.bed
tabix -p bed /input_dir/corona_analysis/tracks/E124_15_coreMarks_dense.bed.gz


### RNA-seq - non-activated

In [None]:
%%bash

#Get Macrophage total RNA-seq

#Get bigwig of alignments
# Minus strand
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/RNA-Seq/MPIMG/S0022I12.minusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"

# Plus strand
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_plus.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/RNA-Seq/MPIMG/S0022I12.plusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"

#Transcript quantifications
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_transcript.tsv --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/RNA-Seq/MPIMG/S0022I12.transcript_quantification.rsem_grape2_crg.GRCh38.20150622.results"
    
#Gene quantifications
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_gene.tsv --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/RNA-Seq/MPIMG/S0022I12.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results"
    

In [1]:
%%bash

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus.bw \
    /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus_hg19

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_plus.bw \
    /input_dir/corona_analysis/tracks/


@ 2020-07-05 18:54:10: Read the chain file:  /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz
@ 2020-07-05 18:54:11: Liftover bigwig file: /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus.bw ==> /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus_hg19.bgr
@ 2020-07-05 19:00:51: Merging overlapped entries in bedGraph file ...
@ 2020-07-05 19:00:51: Sorting bedGraph file:/input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus_hg19.bgr
@ 2020-07-05 19:05:55: Writing header to "/input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus_hg19.bw" ...
@ 2020-07-05 19:05:55: Writing entries to "/input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_minus_hg19.bw" ...
@ 2020-07-05 19:08:21: Read the chain file:  /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz
@ 2020-07-05 19:08:22: Liftover bigwig file: /input_dir/corona_analysis/tracks/S0022I12_Mphage_rnaseq_plus.bw ==> /input_dir/corona_analysis/tracks/.bgr
@ 2020-07-05 19:1

### Get activated M-phage RNA-seq

In [None]:
%%bash

#Get activated Macrophage total RNA-seq

#Get bigwig of alignments
# Minus strand
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_minus.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S001MJ/inflammatory_macrophage/RNA-Seq/MPIMG/S001MJ12.minusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"

# Plus strand
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_plus.bw --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S001MJ/inflammatory_macrophage/RNA-Seq/MPIMG/S001MJ12.plusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"

#Transcript quantifications
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_transcript.tsv --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S001MJ/inflammatory_macrophage/RNA-Seq/MPIMG/S001MJ12.transcript_quantification.rsem_grape2_crg.GRCh38.20150622.results"
    
#Gene quantifications
wget -nc -O /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_gene.tsv --quiet \
    "http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S001MJ/inflammatory_macrophage/RNA-Seq/MPIMG/S001MJ12.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results"



In [None]:
%%bash

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_minus.bw \
    /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_minus_hg19

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_plus.bw \
    /input_dir/corona_analysis/tracks/S0022I12_Mphage_activ_rnaseq_plus_hg19


### Repeats

#### L1 elements

In [None]:
%%bash

#Get LINE elements from L1Base2 all full length (>4500nt) in hg38 and flip to hg19
# http://l1base.charite.de/BED/hsflil1_3836.bed == intact ones
wget --quiet http://l1base.charite.de/BED/hsflnil1_8438_rm.bed
sort-bed hsflnil1_8438_rm.bed > /input_dir/corona_analysis/annotations/hsflnil1_hg38_sorted.bed
rm hsflnil1_8438_rm.bed


In [None]:

#LINE elements from hg38 to hg19
liftover_bed(from_genome="hg38",
             to_genome="hg19",
             in_bed="/input_dir/corona_analysis/annotations/hsflnil1_hg38_sorted.bed",
             out_bed="/input_dir/corona_analysis/annotations/hsflnil1_hg19_sorted.bed",
            canonical_chrom_filter = True)



### eQTLs

### Mendelian diseases

In [None]:
%%bash

#Get mendelian variation tied to disease using clinvar hg38
wget --quiet -nc -O disease_names_clinvar.txt ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/disease_names


In [None]:
%%bash

wget --quiet -nc -O clinvar_curr_hg38.vcf.gz ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20200615.vcf.gz
gunzip clinvar_curr_hg38.vcf.gz


In [None]:
%%bash

wget --quiet -nc -O clinvar_curr_hg19.vcf.gz ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_20200615.vcf.gz
gunzip clinvar_curr_hg19.vcf.gz


In [None]:
%%bash

#get only lung related disease ConceptIDs
grep -i "lung\|COPD" disease_names_clinvar.txt | cut -f 3 | sort -u | sed '/^$/d' > lung_names.txt


In [None]:
import vcf

disease_ids = set()

with open('lung_names.txt', 'r') as diseases:
    for line in diseases.readlines():
        disease_ids.add(line.strip())

vcf_reader = vcf.Reader(open('clinvar_curr_hg19.vcf', 'r',encoding='utf-8'))

bed_lung_var_out = open("/input_dir/corona_analysis/annotations/clinvar_kidney_variants_hg19.bed","w")

for record in vcf_reader:
    record_keys = record.INFO.keys()
    if ("CLNDISDB" in record_keys and record.INFO["CLNDISDB"][0] is not None and "MedGen" in record.INFO["CLNDISDB"][0]):
        cur_id = (record.INFO["CLNDISDB"][0]).split(":")[1]
        cur_disease = (record.INFO["CLNDN"][0])
        cur_rs = "NA"
        if "RS" in record.INFO.keys():
            cur_rs = "rs"+record.INFO["RS"][0]
        
        if cur_id in disease_ids:
            out_record = ("chr" + str(record.CHROM) +
                "\t" + str(record.start) + 
                "\t" + str(record.end) + 
                "\t" + cur_disease + 
                ":" + cur_rs + 
                "\t" + "0" + 
                "\t" + "." + "\n")
            bed_lung_var_out.write(out_record)
            
bed_lung_var_out.close()

### Methylation

In [None]:
%%bash
# download methylation differences in upper vs lower lobe macrophages

https://www.immunohorizons.org/content/suppl/2019/07/02/3.7.274.DCSupplemental


In [None]:
%%bash

#Get methylation coverage in macrophages
wget --quiet -nc -O /input_dir/corona_analysis/tracks/S0022I51_macrophage_CPG_meth_hg38.bw \
    http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S0022I/macrophage/Bisulfite-Seq/CNAG/S0022I51.CPG_methylation_calls.bs_call.GRCh38.20160531.bw

    
#Get methylation coverage in activated macrophages
wget --quiet -nc -O /input_dir/corona_analysis/tracks/S00BS451_activ_macrophage_CPG_meth_hg38.bw \
    http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S00BS4/alternatively_activated_macrophage/Bisulfite-Seq/CNAG/S00BS451.CPG_methylation_calls.bs_call.GRCh38.20160531.bw


In [None]:
%%bash

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S0022I51_macrophage_CPG_meth_hg38.bw \
    /input_dir/corona_analysis/tracks/S0022I51_macrophage_CPG_meth_hg19

CrossMap.py bigwig /input_dir/corona_analysis/annotations/hg38ToHg19.over.chain.gz /input_dir/corona_analysis/tracks/S00BS451_activ_macrophage_CPG_meth_hg38.bw \
    /input_dir/corona_analysis/tracks/S00BS451_activ_macrophage_CPG_meth_hg19
