# Code to set up tracks for COVID-19-relevant regulatory architecture
# CD8 T Cell regulatory architecture

# Minimal tracks to prepare 
Track : format - source

    * CTCF : bigwig - Imputed CD8 T-cell ENCODE 
    * H3K27ac : bigwig - Blueprint CD8 eff mem Tcell
    * H3K4me3 : bigwig - Blueprint CD8 eff mem Tcell
    * H3K9me3 : bigwig - Blueprint CD8 eff mem Tcell
    * scATAC-seq/DNase-seq : bigwig - Clustered scATAC-seq Blood paper
    * Methylation : bigwig - Blueprint CD8 Tcell
    * Loops : links - hESC
    * Hi-C : cool - Mphage diff paper
    * Genes : genes_bed -  Gencode
    * Repeats : bed - L1Base2
    * Chromatin state : bed - ENCODE/Segway
    * eQTL list : arcs - none
    * GWAS : bed - pvals and bigwig Ellinghaus and Covid19Hg
    * RNA-Seq : bigwig and txt - CD8 eff mem T cell Blueprint
    

In [None]:
import pyensembl, os, sys, re, numpy as np
from helper_funcs import *

### Transcription factor and histones

In [1]:
%%bash
# Download Blueprint DCC histone-mark ChIP-seq bigWigs for sample C0054X
# (effector memory CD8+ alpha-beta T cell, GRCh38) into the tracks directory.
set -euo pipefail

TRACKS_DIR=/input_dir/corona_analysis/tracks
BLUEPRINT_CHIP="http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/C0054X/effector_memory_CD8-positive_alpha-beta_T_cell/ChIP-Seq/NCMLS"

# fetch DEST URL
#   Skip when DEST already exists and is non-empty. Otherwise download into a
#   temporary ".part" file and rename it only on success, so a failed transfer
#   never leaves behind an empty DEST that a later run would silently keep.
fetch() {
    local dest="$1" url="$2"
    if [[ -s "${dest}" ]]; then
        return 0
    fi
    if ! wget --quiet -O "${dest}.part" "${url}"; then
        rm -f "${dest}.part"
        echo "ERROR: download failed: ${url}" >&2
        return 1
    fi
    mv "${dest}.part" "${dest}"
}

# H3K4me3 bigWig (fold change over control)
fetch "${TRACKS_DIR}/C0054X_cd8_tcell_h3k4me3_hg38.bw" \
    "${BLUEPRINT_CHIP}/C0054XH3.ERX547963.H3K4me3.bwa.GRCh38.20150528.bw"

# H3K27ac bigWig (fold change over control)
fetch "${TRACKS_DIR}/C0054X_cd8_tcell_h3k27Ac_hg38.bw" \
    "${BLUEPRINT_CHIP}/C0054XH3.ERX682362.H3K27ac.bwa.GRCh38.20150529.bw"

# H3K9me3 bigWig (fold change over control)
fetch "${TRACKS_DIR}/C0054X_cd8_tcell_h3k9me3_hg38.bw" \
    "${BLUEPRINT_CHIP}/C0054XH3.ERX406992.H3K9me3.bwa.GRCh38.20150528.bw"


In [1]:
%%bash
# Imputed CTCF ChIP-seq bigWig for CD8 T cells (ENCODE file ENCFF518JVS).
# An already-present local copy is left untouched (--no-clobber).

CTCF_URL=https://encode-public.s3.amazonaws.com/2020/01/11/b00516d0-c081-4daf-bb06-9ef9911c5169/ENCFF518JVS.bigWig
CTCF_BW=/input_dir/corona_analysis/tracks/ENCFF518JVS_cd8_tcell_imputed_hg38.bw

wget --no-clobber --quiet --output-document="${CTCF_BW}" "${CTCF_URL}"


### Chromatin accessibility

In [None]:
%%bash
# Download per-cluster scATAC-seq bigWigs (hg19) for the CD8 T-cell clusters.
#
# Provenance:
#   * Clusters 26-30 from Fig. 2 of https://www.nature.com/articles/s41587-019-0206-z
#   * Bigwig DNAse normalized read count for CD8 T-cells in different states is
#     also browsable at
#     http://epigenomegateway.wustl.edu/legacy/?genome=hg19&session=HcbHMSgBCc&statusId=28207718
set -euo pipefail

TRACKS_DIR=/input_dir/corona_analysis/tracks
SCATAC_URL="https://chang-public-data.s3-us-west-1.amazonaws.com/10X_scATAC/Heme"

# fetch DEST URL
#   Skip when DEST already exists and is non-empty. Otherwise download into a
#   temporary ".part" file and rename it only on success, so a failed transfer
#   never leaves behind an empty DEST that a later run would silently keep.
fetch() {
    local dest="$1" url="$2"
    if [[ -s "${dest}" ]]; then
        return 0
    fi
    if ! wget --quiet -O "${dest}.part" "${url}"; then
        rm -f "${dest}.part"
        echo "ERROR: download failed: ${url}" >&2
        return 1
    fi
    mv "${dest}.part" "${dest}"
}

# Cluster number -> local track name
fetch "${TRACKS_DIR}/CD8_N_T1_scATAC_hg19.bw" "${SCATAC_URL}/Cluster26.RIP_norm.bw"
fetch "${TRACKS_DIR}/CD8_N_T2_scATAC_hg19.bw" "${SCATAC_URL}/Cluster27.RIP_norm.bw"
fetch "${TRACKS_DIR}/CD8_N_T3_scATAC_hg19.bw" "${SCATAC_URL}/Cluster28.RIP_norm.bw"
fetch "${TRACKS_DIR}/CD8_CM_T_scATAC_hg19.bw" "${SCATAC_URL}/Cluster29.RIP_norm.bw"
fetch "${TRACKS_DIR}/CD8_EM_T_scATAC_hg19.bw" "${SCATAC_URL}/Cluster30.RIP_norm.bw"


In [None]:
%%bash
# Lift the scATAC-seq accessibility bigWigs over from hg19 to hg38 with CrossMap.
# Each sample's *_hg19.bw is the input; the final argument is the hg38 output
# name handed to CrossMap.

chain=/input_dir/corona_analysis/annotations/hg19ToHg38.over.chain.gz
tracks=/input_dir/corona_analysis/tracks

for sample in CD8_N_T1 CD8_N_T2 CD8_N_T3 CD8_CM_T CD8_EM_T; do
    CrossMap.py bigwig "${chain}" \
        "${tracks}/${sample}_scATAC_hg19.bw" \
        "${tracks}/${sample}_scATAC_hg38"
done



### Chromatin looping

In [None]:
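# Get in-situ Hi-C for CD8 T cells (GEO sample GSM2827788) from
# https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2827nnn/GSM2827788/suppl/GSM2827788_CD8T1_hg_t.mtx.gz
# NOTE: nothing in this cell downloads the file. The cells below read the
# decompressed GSM2827788_CD8T1_hg_t.mtx and GSM2827789_CD8T2_hg_t.mtx plus
# GSE105776_GenomicRegions.bed from /input_dir/corona_analysis/tracks, so these
# must be fetched and gunzipped manually before running them.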


In [None]:
%%R
# Convert the CD8 T-cell Hi-C contact matrices (MatrixMarket .mtx plus the
# shared genomic-regions BED) into BEDPE-style ".gints" tables whose score
# column carries the integer contact count.
#
# R package names are case-sensitive: readMTX2IntSet() is provided by
# 'diffHic' and export.bedpe() by 'GenomicInteractions'.
library(diffHic)
library(GenomicInteractions)

tracks_dir <- "/input_dir/corona_analysis/tracks"
bins_bed <- file.path(tracks_dir, "GSE105776_GenomicRegions.bed")

# Read <sample_prefix>.mtx, attach the counts to the interactions as a
# 'counts' metadata column, write <sample_prefix>.gints, and return the
# InteractionSet invisibly.
mtx_to_gints <- function(sample_prefix) {
    iset <- readMTX2IntSet(mtx = file.path(tracks_dir, paste0(sample_prefix, ".mtx")),
                           bed = bins_bed)
    hic_ints <- interactions(iset)
    # First assay holds the contact counts; flatten it to an integer vector.
    hic_ints$counts <- as.integer(assays(iset)[[1]])
    export.bedpe(hic_ints,
                 file.path(tracks_dir, paste0(sample_prefix, ".gints")),
                 score = "counts")
    invisible(iset)
}

# Sample CD8 T1
cd8_t1_hg38_iset <- mtx_to_gints("GSM2827788_CD8T1_hg_t")

# Second sample CD8 T2
cd8_t2_hg38_iset <- mtx_to_gints("GSM2827789_CD8T2_hg_t")


In [None]:
%%bash
# Convert the exported interaction tables (.gints) into .cool matrices.
# This cell holds shell commands, so it needs the %%bash magic on its first
# line; without it the kernel would try to parse the commands as Python.
set -euo pipefail

TRACKS_DIR=/input_dir/corona_analysis/tracks
BINS_BED="${TRACKS_DIR}/GSE105776_GenomicRegions.bed"

for sample in GSM2827788_CD8T1_hg_t GSM2827789_CD8T2_hg_t; do
    # --field count=8: take the contact count from column 8 of the input table.
    cooler load --assembly hg38 --field count=8 -f bg2 \
        "${BINS_BED}" \
        "${TRACKS_DIR}/${sample}.gints" \
        "${TRACKS_DIR}/${sample}.cool"
done


### Chromatin state

In [None]:
%%bash
# Naive and memory CD8+ T-cell chromatin-state tracks.
# For each annotation: download into the working directory, decompress,
# sort, bgzip into the tracks directory, drop the temporary plain BED, and
# build a tabix index for the compressed copy.

state_url=https://noble.gs.washington.edu/proj/encyclopedia/interpreted
tracks=/input_dir/corona_analysis/tracks

for state in CD8_MEMORY_PRIMARY_CELLS CD8_NAIVE_PRIMARY_CELLS; do
    wget --quiet -nc "${state_url}/${state}.bed.gz"
    gunzip "${state}.bed.gz"
    sort-bed "${state}.bed" | bgzip > "${tracks}/${state}.bed.gz"
    rm "${state}.bed"
    tabix -p bed "${tracks}/${state}.bed.gz"
done


In [None]:
%%bash
# Lift the CD8 chromatin-state BED tracks over from hg19 to hg38 with CrossMap.
# NOTE(review): unlike the ChromHMM liftover cell further down, the hg38 output
# here is not sorted, bgzipped or tabix-indexed -- confirm that is intended.

chain=/input_dir/corona_analysis/annotations/hg19ToHg38.over.chain.gz
tracks=/input_dir/corona_analysis/tracks

for state in CD8_MEMORY_PRIMARY_CELLS CD8_NAIVE_PRIMARY_CELLS; do
    CrossMap.py bed "${chain}" \
        "${tracks}/${state}.bed.gz" \
        "${tracks}/${state}_hg38"
done


In [None]:
# TODO: add the corresponding tracks for E046 (NK) and E034 (T-cell)

In [28]:
%%bash
# ChromHMM 15-state core-marks segmentations:
#   E047 = CD8 T naive, E048 = CD8 T memory.
# Each file is downloaded to the working directory, decompressed, sorted,
# bgzipped into the tracks directory, cleaned up locally and tabix-indexed.

chromhmm_url=https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final
tracks=/input_dir/corona_analysis/tracks

for epigenome in E047 E048; do
    bed="${epigenome}_15_coreMarks_dense.bed"
    wget --quiet -nc -O "${bed}.gz" "${chromhmm_url}/${bed}.gz"
    gunzip "${bed}.gz"
    sort-bed "${bed}" | bgzip > "${tracks}/${bed}.gz"
    rm "${bed}"
    tabix -p bed "${tracks}/${bed}.gz"
done


In [None]:
%%bash
# Lift the ChromHMM segmentations (E047, E048) over from hg19 to hg38, then
# sort, bgzip and tabix-index the result in the tracks directory. The
# intermediate lifted BED lives in the working directory and is removed.

chain=/input_dir/corona_analysis/annotations/hg19ToHg38.over.chain.gz
tracks=/input_dir/corona_analysis/tracks

for epigenome in E047 E048; do
    stem="${epigenome}_15_coreMarks_dense"
    CrossMap.py bed "${chain}" "${tracks}/${stem}.bed.gz" "${stem}_hg38.bed"
    sort-bed "${stem}_hg38.bed" | bgzip > "${tracks}/${stem}_hg38.bed.gz"
    rm "${stem}_hg38.bed"
    tabix -p bed "${tracks}/${stem}_hg38.bed.gz"
done



### RNA-seq

In [None]:
%%bash
# Download Blueprint total RNA-seq for sample C001HN (GRCh38).
# NOTE: the URLs point at the *central memory* CD8+ alpha-beta T-cell
# directory, not effector memory.
set -euo pipefail

TRACKS_DIR=/input_dir/corona_analysis/tracks
BLUEPRINT_RNA="http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/C001HN/central_memory_CD8-positive_alpha-beta_T_cell/RNA-Seq/MPIMG"

# fetch DEST URL
#   Skip when DEST already exists and is non-empty. Otherwise download into a
#   temporary ".part" file and rename it only on success, so a failed transfer
#   never leaves behind an empty DEST that a later run would silently keep.
fetch() {
    local dest="$1" url="$2"
    if [[ -s "${dest}" ]]; then
        return 0
    fi
    if ! wget --quiet -O "${dest}.part" "${url}"; then
        rm -f "${dest}.part"
        echo "ERROR: download failed: ${url}" >&2
        return 1
    fi
    mv "${dest}.part" "${dest}"
}

# Alignment bigWigs: minus strand (multi), then plus strand (multi)
fetch "${TRACKS_DIR}/C001HN_CD8_T_mem_rnaseq_minus.bw" \
    "${BLUEPRINT_RNA}/C001HNB2.minusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"
fetch "${TRACKS_DIR}/C001HN_CD8_T_mem_rnaseq_plus.bw" \
    "${BLUEPRINT_RNA}/C001HNB2.plusStrandMulti.star_grape2_crg.GRCh38.20150815.bw"

# Transcript quantifications
fetch "${TRACKS_DIR}/C001HN_CD8_T_mem_transcript.tsv" \
    "${BLUEPRINT_RNA}/C001HNB2.transcript_quantification.rsem_grape2_crg.GRCh38.20150622.results"

# Gene quantifications
fetch "${TRACKS_DIR}/C001HN_CD8_T_mem_gene.tsv" \
    "${BLUEPRINT_RNA}/C001HNB2.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results"
    

### eQTLs

### Mendelian diseases

In [None]:
%%bash
# ClinVar disease-name table, saved locally as disease_names_clinvar.txt.
# An existing local copy is kept as-is (--no-clobber).

DISEASE_NAMES_URL=ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/disease_names

wget --no-clobber --quiet --output-document=disease_names_clinvar.txt "${DISEASE_NAMES_URL}"


In [None]:
%%bash
# Download the ClinVar GRCh38 VCF (release 20200615) and decompress it to
# clinvar_curr_hg38.vcf in the working directory.
#
# Safe to re-run: once a non-empty .vcf exists nothing is done. Re-running the
# plain "wget -nc; gunzip" sequence would re-download the archive (gunzip
# removes the .gz) and then fail because gunzip will not overwrite the
# existing .vcf.
set -euo pipefail

CLINVAR_VCF=clinvar_curr_hg38.vcf
CLINVAR_URL=ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20200615.vcf.gz

if [[ ! -s "${CLINVAR_VCF}" ]]; then
    wget --quiet -O "${CLINVAR_VCF}.gz" "${CLINVAR_URL}"
    # -f: replace a stale or empty .vcf left over from an earlier failed run.
    gunzip -f "${CLINVAR_VCF}.gz"
fi


In [None]:
%%bash
# Keep only lung-related diseases: select rows mentioning "lung" or "COPD"
# (case-insensitive), take the third tab-separated column (the ConceptID),
# drop blank entries and write the unique IDs to lung_names.txt.
grep -i "lung\|COPD" disease_names_clinvar.txt \
    | cut -f 3 \
    | sed '/^$/d' \
    | sort -u \
    > lung_names.txt


In [None]:
import vcf

# Build a BED6 track of ClinVar (GRCh38) variants annotated with one of the
# lung-related MedGen ConceptIDs listed in lung_names.txt.
#
# PyVCF splits multi-valued INFO fields on commas, whereas ClinVar separates
# the diseases of one variant with '|' and the cross-references of a single
# disease with ','. The helpers below re-join and re-split the values so that
#   * every disease of a variant is checked, not just the first token,
#   * "MedGen:X|MedGen:Y" is not mis-read as the ID "X|MedGen",
#   * disease names that contain commas are not truncated.


def _split_disease_groups(info_values):
    """Undo PyVCF's comma split and return one string per '|'-separated disease."""
    if info_values is None:
        return []
    if isinstance(info_values, str):
        info_values = [info_values]
    joined = ",".join(value for value in info_values if value is not None)
    return joined.split("|") if joined else []


def _medgen_ids(xref_group):
    """Return the MedGen ConceptIDs among one disease's comma-separated xrefs."""
    return [xref.split(":", 1)[1]
            for xref in xref_group.split(",")
            if xref.startswith("MedGen:")]


with open('lung_names.txt', 'r') as diseases:
    disease_ids = {line.strip() for line in diseases if line.strip()}

# NOTE(review): the file name says "kidney" although these are lung variants.
# The path is kept unchanged in case other code reads it -- rename once the
# consumers are confirmed.
bed_lung_var_path = "/input_dir/corona_analysis/annotations/clinvar_kidney_variants_hg38.bed"

# Context managers guarantee both files are closed even if parsing fails.
with open('clinvar_curr_hg38.vcf', 'r', encoding='utf-8') as vcf_in, \
        open(bed_lung_var_path, "w") as bed_lung_var_out:
    vcf_reader = vcf.Reader(vcf_in)

    for record in vcf_reader:
        xref_groups = _split_disease_groups(record.INFO.get("CLNDISDB"))

        # Index of the first disease of this variant that is lung-related.
        hit = next((idx for idx, group in enumerate(xref_groups)
                    if disease_ids.intersection(_medgen_ids(group))), None)
        if hit is None:
            continue

        # Pair the matched disease with its name when CLNDN lines up with
        # CLNDISDB; otherwise fall back to the first name (or "NA").
        name_groups = _split_disease_groups(record.INFO.get("CLNDN"))
        if len(name_groups) == len(xref_groups):
            cur_disease = name_groups[hit]
        elif name_groups:
            cur_disease = name_groups[0]
        else:
            cur_disease = "NA"

        rs_values = record.INFO.get("RS")
        cur_rs = ("rs" + str(rs_values[0])) if rs_values else "NA"

        # BED6: chrom, start, end, name (disease:rsID), score, strand
        out_record = "\t".join([
            "chr" + str(record.CHROM),
            str(record.start),
            str(record.end),
            cur_disease + ":" + cur_rs,
            "0",
            ".",
        ]) + "\n"
        bed_lung_var_out.write(out_record)

### Methylation

In [None]:
%%bash
# Blueprint bisulfite-seq CpG methylation-calls bigWig for sample S002ND51
# (effector memory CD8+ alpha-beta T cell, terminally differentiated; GRCh38).
# An existing local copy is kept as-is (--no-clobber).

METH_URL=http://ftp.ebi.ac.uk/pub/databases/blueprint/data/homo_sapiens/GRCh38/venous_blood/S002ND/effector_memory_CD8-positive_alpha-beta_T_cell_terminally_differentiated/Bisulfite-Seq/CNAG/S002ND51.CPG_methylation_calls.bs_call.GRCh38.20160531.bw
METH_BW=/input_dir/corona_analysis/tracks/S002ND51_CD8_effmemT_CPG_meth_hg38.bw

wget --no-clobber --quiet --output-document="${METH_BW}" "${METH_URL}"
