In [24]:
import glob
import os
import sys
import numpy as np
import pandas as pd
import pyBigWig as pw
from jinja2 import Template


In [25]:
# NOTE(review): hard-coded absolute path. Every relative path used below
# ('../code/...', '../data/...') resolves against this directory, so the
# notebook only runs on a machine that has this exact layout.
os.chdir('/home/chaodai/cdai/SpliFi/analysis')

# Brain-Cortex_v_Muscle-Skeletal

## Bigwig files

In [4]:
!ls ../code/resources/GTEx/BigWig/

Brain-Anteriorcingulatecortex_BA24_  Liver
Brain-Cortex			     Lung
Brain-FrontalCortex_BA9_	     Muscle-Skeletal
Brain-Putamen_basalganglia_	     Skin-NotSunExposed_Suprapubic_
Heart-AtrialAppendage		     WholeBlood


In [5]:
# Tissue name -> directory holding that tissue's GTEx BigWig files.
# Only the tissues listed here are considered when selecting contrasts.
bwDirs = {
  tissue: '../code/resources/GTEx/BigWig/' + tissue
  for tissue in [
    'Muscle-Skeletal',
    'Brain-Cortex',
    'Brain-Putamen_basalganglia_',
    'Liver',
    'Lung',
    'WholeBlood',
    'Skin-NotSunExposed_Suprapubic_',
  ]
}


In [10]:
# Every directory under the GTEx differential-splicing results is one contrast,
# named '<tissueA>_v_<tissueB>'.
all_contrasts = [os.path.basename(p) for p in glob.glob('../code/results/ds/GTEx/*')]

# Keep only the contrasts whose tissues all have a BigWig directory in bwDirs.
contrasts = []
for c in all_contrasts:
  groups = c.split('_v_')
  if not set(groups).issubset(bwDirs):
    continue
  contrasts.append(c)

    
    

In [11]:
# Sort the contrast names in place, then keep only the first six of them;
# the last line displays the resulting list.
contrasts.sort()
contrasts = contrasts[:6]
contrasts

['Brain-Cortex_v_Liver',
 'Brain-Cortex_v_Lung',
 'Brain-Cortex_v_Muscle-Skeletal',
 'Brain-Cortex_v_Skin-NotSunExposed_Suprapubic_',
 'Brain-Cortex_v_WholeBlood',
 'Brain-Putamen_basalganglia__v_Liver']

In [37]:
# store bw file paths for each tissue
# NOTE: getSamplesByGroup, getBigWigFiles and getPlottableBigWig are defined in a
# later cell of this notebook; that cell must have been run before this one.
bwPlotFiles = {}
for c in contrasts:
  grps = c.split('_v_')
  sample_f = f'../code/results/ds/GTEx/{c}/ds_sample_group.txt'
  dsSamples = getSamplesByGroup(sample_f, grps)
  bwDir= {g: bwDirs[g] for g in grps} # available bw file dir by tissue
  bwFiles = {x: getBigWigFiles(bwDir[x]) for x in grps} # available bw files by tissue
  # plottable bw files: samples present in both of the previous two
  bwTemp = {x: getPlottableBigWig(bwFiles[x], dsSamples[x]) for x in grps}

  # a tissue keeps the file set from the first contrast in which it appears;
  # later contrasts containing the same tissue do not overwrite it
  for t in bwTemp:
    if t not in bwPlotFiles:
      print(f'adding {t} to bwPlotFiles from {c}')
      bwPlotFiles[t] = bwTemp[t]

adding Brain-Cortex to bwPlotFiles from Brain-Cortex_v_Liver
adding Liver to bwPlotFiles from Brain-Cortex_v_Liver
adding Lung to bwPlotFiles from Brain-Cortex_v_Lung
adding Muscle-Skeletal to bwPlotFiles from Brain-Cortex_v_Muscle-Skeletal
adding Skin-NotSunExposed_Suprapubic_ to bwPlotFiles from Brain-Cortex_v_Skin-NotSunExposed_Suprapubic_
adding WholeBlood to bwPlotFiles from Brain-Cortex_v_WholeBlood
adding Brain-Putamen_basalganglia_ to bwPlotFiles from Brain-Putamen_basalganglia__v_Liver


In [38]:
# Number of plottable bigwig files per tissue.
for t in bwPlotFiles:
  print(f'{t}: {len(bwPlotFiles[t])}')

Brain-Cortex: 78
Liver: 79
Lung: 77
Muscle-Skeletal: 78
Skin-NotSunExposed_Suprapubic_: 80
WholeBlood: 80
Brain-Putamen_basalganglia_: 75


clu_13984_+:chr5:181048869-181054699



clu_14941_-:chr5:177489646-177493904

clu_3411_-:chr1:94538868-94543036

clu_3531_-:chr1:113698299-113702293

clu_6632_-:chr2:27490978-27494888



In [None]:
target = 'chr5:177489646-177493904'
# NOTE(review): with the target written as 'chrom:start-end' this evaluates to
# the chromosome ('chr5'), yet it is later passed to getPSI, whose docstring
# describes `clu` as a cluster id such as 'clu_261_+'. If a cluster id was
# intended, write the target as 'clu_14941_-:chr5:177489646-177493904'.
plotclu = target.split(':')[0]

# Coverage window: the target span padded by bwFlank bp on each side. It is
# derived from `target` (its last two ':'-separated fields, so a leading
# cluster id is tolerated) instead of being typed in by hand, so the two
# cannot drift apart. The start is clamped at 0.
bwFlank = 100000
targetChrom, targetSpan = target.split(':')[-2:]
targetStart, targetEnd = (int(pos) for pos in targetSpan.split('-'))
bwRange = (targetChrom, max(0, targetStart - bwFlank), targetEnd + bwFlank)

In [None]:
177493904-177489646

4258

In [39]:
bwRange

('chr5', 177389646, 177593904)

In [40]:
# average bigwig files
# For every tissue, average its plottable bigwig files over bwRange and write
# the result as one bigwig; outBwInfo records tissue -> output file name.
outBwInfo = {}
for grp in bwPlotFiles:
  print(f'computing average bigwig for {grp} in {bwRange[0]}:{str(bwRange[1])}-{str(bwRange[2])}')
  avgBW = avgBigwig(bwPlotFiles[grp].values(), bwRange)
  if not avgBW:
    # avgBigwig returns {} when none of the listed files exists; passing that
    # to writeBigwig would fail with KeyError, so report and move on.
    print(f'no readable bigwig files for {grp}; skipping')
    continue

  outBWFile = f'../data/sashimi/GTEx/ex1_{grp}.bw'
  # make sure the output directory exists before pyBigWig tries to create the file
  os.makedirs(os.path.dirname(outBWFile), exist_ok=True)
  writeBigwig(outBWFile, avgBW, bwRange)

  # tissue names are unique dict keys, so no membership check is needed here
  outBwInfo[grp] = os.path.basename(outBWFile)

computing average bigwig for Brain-Cortex in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Brain-Cortex.bw
computing average bigwig for Liver in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Liver.bw
computing average bigwig for Lung in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Lung.bw
computing average bigwig for Muscle-Skeletal in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Muscle-Skeletal.bw
computing average bigwig for Skin-NotSunExposed_Suprapubic_ in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Skin-NotSunExposed_Suprapubic_.bw
computing average bigwig for WholeBlood in chr5:177389646-177593904
Bigwig file written: ../data/ex1_WholeBlood.bw
computing average bigwig for Brain-Putamen_basalganglia_ in chr5:177389646-177593904
Bigwig file written: ../data/ex1_Brain-Putamen_basalganglia_.bw


In [31]:
outBwInfo

{'Brain-Cortex': 'ex1_Brain-Cortex.bw',
 'Liver': 'ex1_Liver.bw',
 'Lung': 'ex1_Lung.bw',
 'Muscle-Skeletal': 'ex1_Muscle-Skeletal.bw',
 'Skin-NotSunExposed_Suprapubic_': 'ex1_Skin-NotSunExposed_Suprapubic_.bw',
 'WholeBlood': 'ex1_WholeBlood.bw',
 'Brain-Putamen_basalganglia_': 'ex1_Brain-Putamen_basalganglia_.bw'}

In [32]:
# get PSI values for each tissue
# sashimiLinks: tissue -> link table, taken from the first contrast in which
#               the tissue appears.
# upLinks     : one table of UP junctions pooled over all contrasts.
allLinks = {'sashimiLinks': {}, 'upLinks': {}}
upLinksByContrast = []
for c in contrasts:
  print(f'Working on {c}')
  grps = c.split('_v_')
  dsEffectFile = f'../code/results/ds/GTEx/{c}/ds_effect_sizes.txt'
  EfSize = pd.read_csv(dsEffectFile, sep='\t')
  Introns = readIntronTable(f'../code/results/ds/GTEx/{c}/ds_perind_numers.counts.noise_by_intron.gz')
  links = getPSI(EfSize, plotclu, Introns, grps)

  # Collect UP links once per contrast. Keying them by tissue (as before)
  # dropped the UP links of any contrast whose two tissues had both been seen.
  upLinksByContrast.append(links['upLinks'])

  for g in grps:
    if g not in allLinks['sashimiLinks']:
      allLinks['sashimiLinks'][g] = links['sashimiLinks'][g]

# row concatenate the UP links of all contrasts and drop repeated junctions
allLinks['upLinks'] = pd.concat(upLinksByContrast, axis=0, ignore_index=True).drop_duplicates()


Working on Brain-Cortex_v_Liver
Working on Brain-Cortex_v_Lung
Working on Brain-Cortex_v_Muscle-Skeletal
Working on Brain-Cortex_v_Skin-NotSunExposed_Suprapubic_
Working on Brain-Cortex_v_WholeBlood
Working on Brain-Putamen_basalganglia__v_Liver


In [33]:
# write sashimi links: one tab-separated, header-less file per tissue;
# sashimiLinkInfo records tissue -> file name
sashimiLinkInfo = {}
for g in allLinks['sashimiLinks']:
  outSashimi = f'../data/sashimi/GTEx/ex1_{g}_sashimi.links'
  allLinks['sashimiLinks'][g].to_csv(outSashimi, sep='\t', header=False, index=False)
  sashimiLinkInfo[g] = os.path.basename(outSashimi)

# write upLinks: a single file shared by all tissues
outUpLinks = '../data/sashimi/GTEx/ex1_up.links'
allLinks['upLinks'].to_csv(outUpLinks, sep='\t', header=False, index=False)
upLinkInfo = os.path.basename(outUpLinks)

## Link files

In [34]:
outBwInfo

sashimiLinkInfo

upLinkInfo

{'Brain-Cortex': 'ex1_Brain-Cortex.bw',
 'Liver': 'ex1_Liver.bw',
 'Lung': 'ex1_Lung.bw',
 'Muscle-Skeletal': 'ex1_Muscle-Skeletal.bw',
 'Skin-NotSunExposed_Suprapubic_': 'ex1_Skin-NotSunExposed_Suprapubic_.bw',
 'WholeBlood': 'ex1_WholeBlood.bw',
 'Brain-Putamen_basalganglia_': 'ex1_Brain-Putamen_basalganglia_.bw'}

{'Brain-Cortex': 'ex1_Brain-Cortex_sashimi.links',
 'Liver': 'ex1_Liver_sashimi.links',
 'Lung': 'ex1_Lung_sashimi.links',
 'Muscle-Skeletal': 'ex1_Muscle-Skeletal_sashimi.links',
 'Skin-NotSunExposed_Suprapubic_': 'ex1_Skin-NotSunExposed_Suprapubic__sashimi.links',
 'WholeBlood': 'ex1_WholeBlood_sashimi.links',
 'Brain-Putamen_basalganglia_': 'ex1_Brain-Putamen_basalganglia__sashimi.links'}

'ex1_up.links'

NameError: name 'os' is not defined

## clu_13984_+:chr5:181048869-181054699
chr5:181052833:181053199:clu_13984_+

In [93]:
plotRange

'chr5:181048869-181054699'


<br>
<br>

--------------------------------------------------------------------------------

<br>
<br>



In [29]:
# %load ../code/workflow/scripts/prepSashimi.py

def avgBigwig(bwfiles, grange):
    """Average the coverage of several bigwig files over one genomic region.

    bwfiles : iterable of bigwig file paths. Paths that are not existing
              files are skipped.
    grange  : tuple. Genomic range, BED like 0-based coordinates,
              eg. ('chr1', 25101, 27101)

    return  : dict. Empty when no file was read (or the header is empty),
              otherwise it has the keys:
        - header, the items of `chroms()` of the last file read, for
          pyBigWig to write as header
        - values, the element-wise mean over the files (NaN counted as 0),
          for pyBigWig to add as entries
    """
    chrom, start, end = grange
    header = []
    tracks = []
    for path in bwfiles:
        if os.path.isfile(path):
            with pw.open(path, "rt") as handle:
                header = list(handle.chroms().items())
                coverage = handle.values(chrom, start, end, numpy=True)
                # positions without data come back as NaN; count them as 0
                tracks.append(np.nan_to_num(coverage))

    if not tracks or header == []:
        return {}
    return {"header": header, "values": np.mean(tracks, axis=0)}


def writeBigwig(outFile, bigwig, grange):
    """Write a bigwig file holding one run of per-base values

    outFile : str. Output bigwig file path
    bigwig  : dict. Dictionary with keys: header, values of a bigwig object
              (as returned by avgBigwig)
    grange  : tuple. Genomic range, BED like 0-based coordinates,
                eg. ('chr1', 25101, 27101). Only the chromosome and the
                start are used; values are written from start onwards with
                span=1, step=1.

    raises  : KeyError if `bigwig` lacks 'header' or 'values' (for example the
              empty dict avgBigwig returns when no input file was found).
    """
    chrom, start, _end = grange
    # Look the entries up before opening the output, so that an incomplete
    # `bigwig` fails here instead of leaving a stray output file behind.
    header = bigwig["header"]
    values = bigwig["values"]

    with pw.open(outFile, "w") as b:
        b.addHeader(header)
        b.addEntries(chrom, start, values=values, span=1, step=1)
    print(f"Bigwig file written: {outFile}")


def readIntronTable(intronTableFile, sep=" ", SelectCols=[0]):
    """Read intron table file and map each intron to its type

    intronTableFile : str. Path to intron table file
                      eg. 'ds_perind_numers.counts.noise_by_intron.gz'
    sep             : str. Column separator of the file
    SelectCols      : list. Columns to read; the result is built from the
                      column named 'chrom', so that column must be included

    Every value of the 'chrom' column is split on ':'. All fields but the
    last one, re-joined with ':', form the key; the fifth field is the value.

    return : dict. eg. {'chr1:100:200:clu_1_+': 'UP'}
    """
    table = pd.read_csv(intronTableFile, sep=sep, usecols=SelectCols)

    introns = {}
    for fields in table["chrom"].str.split(":"):
        introns[":".join(fields[:-1])] = fields[4]

    return introns


def getSamplesByGroup(
    sampleGroupFile,
    groups,
    sep=" ",
    cols=["fname", "group"],
    header=None,
    splitBy=".",
    splitCapture=[1],
):
    """List the sample names of two groups from a diff splicing sample file

    sampleGroupFile : str. Path to sample group file
                      eg. 'ds_sample_group.txt'
    groups          : list. The two groups to select, eg. ['Liver', 'Kidney']
    sep             : str. Separator used in the file
    cols            : list. Column names given to the columns of the file
    header          : int. Row number to use as header
    splitBy         : str. Split the file name by this character
    splitCapture    : list. Its first element is the index of the piece of
                      the split file name that is kept as sample name

    return : dict. Group name -> list of sample names in that group
    """
    table = pd.read_csv(sampleGroupFile, sep=sep, header=header, names=cols)
    # sample name = selected piece of the file name
    table["sname"] = [
        fname.split(splitBy)[splitCapture[0]] for fname in table["fname"]
    ]

    group1, group2 = groups
    byGroup = {}
    for grp in (group1, group2):
        byGroup[grp] = table.loc[table["group"] == grp, "sname"].tolist()

    return byGroup


def getBigWigFiles(bwDir, filePattern="*.bw"):
    """Map sample names to the bigwig files found in a directory

    bwDir      : str. Path to the directory containing bigwig files
    filePattern: str. File pattern to search for, eg: '*.bw'

    The sample name is the first two '-'-separated pieces of the file name,
    re-joined with '-'. When several files share a sample name, the one
    returned last by glob wins.

    return     : dict. Dictionary of sample names and bigwig files
    """
    sampleToFile = {}
    for path in glob.glob(os.path.join(bwDir, filePattern)):
        fileName = path.rsplit("/", 1)[-1]
        sample = "-".join(fileName.split("-")[:2])
        sampleToFile[sample] = path

    return sampleToFile


def getPlottableBigWig(bw, ds):
    """Get plottable bigwig files
    bw : dict. key = sample name, value = bigwig file path
    ds : list. List of sample names in differential splicing.

    The result keeps the order of `bw`. Building it from a set intersection
    made the order follow set iteration, which for strings changes from one
    Python process to the next.

    return: dict. Key = sample names in both bw and ds, value = bigwig file path
    """
    wanted = set(ds)
    return {sample: path for sample, path in bw.items() if sample in wanted}


def getPSI(effects, clu, introns, groups, minPSI=0.05):
    """Build pyGenomeTracks link tables for the introns of one cluster

    effects : dataframe. Needs a column 'intron' with ids of the form
              'chrom:start:end:...' and one PSI column per group.
              It is not modified.
    clu     : str. cluster id, eg. 'clu_261_+'. Introns whose id contains this
              text are selected. The text is matched literally: taken as a
              regular expression, the trailing '+' of a plus-strand cluster id
              is a quantifier, and 'clu_261_+' would also match 'clu_261_-'.
    introns: dict. Dictionary of introns, intron id -> intron type. Only the
              selected introns are looked up.
    groups  : list. List of the two groups, eg. ['Liver', 'Kidney']
    minPSI  : float. Minimum PSI value to output

    return : dict. Dictionary of links with the keys
        - sashimiLinks, dict group -> dataframe with the six link columns
          plus the group's PSI column, restricted to rows with PSI > minPSI
        - upLinks, dataframe with the six link columns of the selected
          introns whose type is 'UP'
      Coordinates stay strings, as split out of the intron id.
    """
    grp1, grp2 = groups
    linkCols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]

    # select first and work on a copy, so the caller's dataframe stays untouched
    inCluster = effects["intron"].str.contains(clu, regex=False, na=False)
    selected = effects[inCluster].copy()
    selected["itype"] = [introns[x] for x in selected["intron"]]

    # make pyGenomeTracks compatible link tables: each junction end is given
    # as a zero-width position (start == end)
    fields = selected["intron"].str.split(":")
    selected["chrom1"] = fields.str[0]
    selected["start1"] = fields.str[1]
    selected["end1"] = selected["start1"]
    selected["chrom2"] = selected["chrom1"]
    selected["start2"] = fields.str[2]
    selected["end2"] = selected["start2"]

    outdfs = {
        grp: selected.loc[selected[grp] > minPSI, linkCols + [grp]]
        for grp in (grp1, grp2)
    }

    # links for UP junctions only
    upLinks = selected[selected["itype"] == "UP"].drop_duplicates()[linkCols]

    return {"sashimiLinks": outdfs, "upLinks": upLinks}




'\nPrepare dataset required for plotting sashimi plots using pyGenomeTracks\n'

In [29]:
!ls ../data/sashimi/GTEx/*links

../data/sashimi/GTEx/ex1_Brain-Cortex_sashimi.links
../data/sashimi/GTEx/ex1_Brain-Putamen_basalganglia__sashimi.links
../data/sashimi/GTEx/ex1_Liver_sashimi.links
../data/sashimi/GTEx/ex1_Lung_sashimi.links
../data/sashimi/GTEx/ex1_Muscle-Skeletal_sashimi.links
../data/sashimi/GTEx/ex1_Skin-NotSunExposed_Suprapubic__sashimi.links
../data/sashimi/GTEx/ex1_up.links
../data/sashimi/GTEx/ex1_WholeBlood_sashimi.links


In [34]:
# Read back the header-less link files written for two of the tissues.
linkCols = ['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'PSI']
bc_links, muscle_links = (
  pd.read_csv(f'../data/sashimi/GTEx/ex1_{tissue}_sashimi.links', sep='\t', header=None, names=linkCols)
  for tissue in ('Brain-Cortex', 'Muscle-Skeletal')
)

In [35]:
bc_links
muscle_links

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,PSI
0,chr5,181050369,181050369,chr5,181053199,181053199,0.681562
1,chr5,181052833,181052833,chr5,181053199,181053199,0.318438


Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,PSI
0,chr5,181050369,181050369,chr5,181053199,181053199,0.979715


In [54]:
def linksToJunctionBed(links, strand='+', param='motif=GT/AG;annotated_junction=true'):
  """Turn a sashimi link table into a junction BED-like table

  links  : dataframe with the columns chrom1, start1, start2 and PSI
  strand : str. Strand written on every row
  param  : str. Annotation text written on every row

  The annotation and the strand are assigned as scalars, so the table may
  have any number of rows (the former hard-coded lists only fitted tables
  of exactly two and one rows).

  return : dataframe with the columns chrom1, start1, start2, param,
           score (PSI * 1000, truncated to int) and strand
  """
  bed = links[['chrom1', 'start1', 'start2']].copy()
  bed['param'] = param
  bed['score'] = (links['PSI'] * 1000).astype(int)
  bed['strand'] = strand
  return bed


# NOTE(review): the strand is fixed to '+' for both tables — confirm it
# matches the strand of the cluster being plotted.
bc_bed = linksToJunctionBed(bc_links)
muscle_bed = linksToJunctionBed(muscle_links)

In [55]:
bc_bed
muscle_bed

Unnamed: 0,chrom1,start1,start2,param,score,strand
0,chr5,181050369,181053199,motif=GT/AG;annotated_junction=true,681,+
1,chr5,181052833,181053199,motif=GT/AG;annotated_junction=true,318,+


Unnamed: 0,chrom1,start1,start2,param,score,strand
0,chr5,181050369,181053199,motif=GT/AG;annotated_junction=true,979,+


In [56]:
# Write both junction tables as tab-separated files without header or index.
bc_bed.to_csv('../data/sashimi/GTEx/ex1_Brain-Cortex_sashimi.bed', sep='\t', header=False, index=False)
muscle_bed.to_csv('../data/sashimi/GTEx/ex1_Muscle-Skeletal_sashimi.bed', sep='\t', header=False, index=False)