In [1]:
import csv
import glob
import re
import subprocess

In [2]:
from collections import defaultdict

In [3]:
from pybedtools import BedTool

In [4]:
# Build the meta-exon table for every gzipped GTF annotation found under
# `data/ref/<species>/annot/`.
for path in glob.iglob("data/ref/*/annot/*.gtf.gz"):
    # The species name is the path component that follows `ref/`.
    name = re.search('(?<=ref/)[^/]+', path).group(0)

    # NOTE: the output path depends only on the species name, so if a species
    # directory holds several `*.gtf.gz` files, each run overwrites the
    # previous one.
    save_path = f"data/ref/{name}/annot/out/meta_exon.csv"

    # Pass the arguments as a list (no shell) so that spaces or shell
    # metacharacters in a path are never interpreted. `check=True` raises
    # `subprocess.CalledProcessError` when the script exits non-zero, instead
    # of silently continuing with a missing or stale `meta_exon.csv`.
    subprocess.run(
        ["python", "main/merge.py", "-g", path, "-o", save_path],
        check=True,
    )

In [5]:
# Maps a species name to BED-formatted text (tab-separated, one record per
# line). Using `str` as the default factory means a species that never
# receives a record yields the empty string on lookup.
name2hsap = defaultdict(str)

In [6]:
# For every per-chromosome homology table, turn each aligned position into a
# six-column BED record and file it under the column (species) it came from.
#
# Records are gathered in plain lists and joined once per species at the end.
# The final step *assigns* into `name2hsap` rather than appending, so running
# this cell a second time replaces the text instead of duplicating every line.
bed_lines = defaultdict(list)

for fp in glob.iglob("data/homology/exons/chrom/*.csv"):
    # `newline=""` is how the csv module expects its input files to be opened.
    with open(fp, 'r', newline="") as f:
        for row in csv.DictReader(f):
            # Removed from `row` so the loop below only sees the other columns.
            hsap = row.pop('homo_sapiens')

            if hsap == "-":
                # `row['anchor']`, the H. sap. meta-exon, has no alignment.
                continue

            for name, pos in row.items():
                # Skip the anchor column and any column marked as unaligned.
                if name == 'anchor' or pos == "-":
                    continue

                # `pos` is split on ":" and "-" into exactly four fields; the
                # last one is an integer flag where non-zero means "+".
                chrom, beg, end, strand_flag = re.split('[:-]', pos)
                strand = "+" if int(strand_flag) else "-"

                # BED columns: chrom, start, end, name (`hsap`), score, strand.
                bed_lines[name].append(
                    f"{chrom}\t{beg}\t{end}\t{hsap}\t0\t{strand}\n"
                )

for name, lines in bed_lines.items():
    name2hsap[name] = "".join(lines)

In [7]:
# For every species, intersect the BED records collected in `name2hsap` with
# that species' meta-exons and save the result next to the meta-exon table.
for fp in glob.iglob("data/ref/*/annot/out/meta_exon.csv"):
    # The species name is the path component that follows `ref/`.
    name = re.search("(?<=ref/)[^/]+", fp).group(0)
    exon_lines = []

    # `newline=""` is how the csv module expects its input files to be opened.
    with open(fp, 'r', newline="") as f:
        for row_idx, row in enumerate(csv.reader(f)):
            # Every row must have exactly these eight columns; the first and
            # last are not needed here.
            _gene, gene_func, chrom, meta_beg, meta_end, strand, meta_kind, _make_up = row

            # The H. sap. copy of this file is read elsewhere in this notebook
            # with its first row skipped as a header. A first row whose start
            # coordinate is not a number is treated as that header here too;
            # otherwise it would be handed to BedTool as a record with
            # non-numeric coordinates.
            if row_idx == 0 and not meta_beg.isdigit():
                continue

            bin_strand = int(strand == "+")
            meta_exon = f"{meta_kind}={chrom}:{meta_beg}-{meta_end}:{bin_strand}"

            # Column 4 carries `gene_func` and column 5 the meta-exon label.
            exon_lines.append(
                f"{chrom}\t{meta_beg}\t{meta_end}\t{gene_func}\t{meta_exon}\t{strand}\n"
            )

    # `name2hsap` is a defaultdict(str): a species with no collected records
    # gives an empty string here.
    hsap_bed = BedTool(name2hsap[name], from_string=True)
    exon_bed = BedTool("".join(exon_lines), from_string=True)

    hsap_exon_bed = hsap_bed.intersect(exon_bed, wao=True)
    hsap_exon_bed.saveas(f"data/ref/{name}/annot/out/hsap_exon_homology.bed")

In [8]:
# Species names, in the order their columns appear in the output header.
names = []

# Two-level lookup: exon_map[<H. sap. exon>][<species name>] -> exon label,
# or "-" when there is no overlap. Inner dicts are created on first access.
exon_map = defaultdict(dict)

In [9]:
# Both tables are keyed by the H. sap. meta-exon string
# "<chrom>:<beg>-<end>:<strand flag>".
hsap_exon_data = {}  # meta-exon -> exon type
hsap_gene_data = {}  # meta-exon -> (gene, gene_func)

In [10]:
# Load the H. sap. meta-exon table and index each meta-exon's type and gene
# information by its "<chrom>:<beg>-<end>:<strand flag>" string.
#
# NOTE(review): this path starts with `../data/`, whereas the glob patterns
# elsewhere in this notebook start with `data/`. Confirm which working
# directory the notebook is meant to run from.
#
# `newline=""` is how the csv module expects its input files to be opened.
with open("../data/ref/homo_sapiens/annot/out/meta_exon.csv", 'r', newline="") as f:
    reader = csv.reader(f)
    next(reader)  # Skip the first row (treated as a header).

    for row in reader:
        # Every row must have exactly these eight columns; the last one is
        # not needed here.
        gene, gene_func, chrom, beg, end, strand, exon_type, _make_up = row

        # Strand is encoded as 1 for "+" and 0 for anything else.
        bin_strand = int(strand == "+")
        meta_exon = f"{chrom}:{beg}-{end}:{bin_strand}"

        hsap_exon_data[meta_exon] = exon_type
        hsap_gene_data[meta_exon] = gene, gene_func

In [11]:
# Read each species' intersect output and record, per H. sap. exon, which
# exon of that species it overlaps ("-" when the overlap is zero).
#
# The matches are sorted so the species columns come out in the same order on
# every machine; glob itself returns paths in arbitrary order.
for fp in sorted(glob.iglob("data/ref/*/annot/out/hsap_exon_homology.bed")):
    # The species name is the path component that follows `ref/`.
    name = re.search("(?<=ref/)[^/]+", fp).group(0)

    if name == "homo_sapiens":
        continue

    # I want H. sap. to be column 1, so I add it to the header
    # manually.
    # The membership test keeps the header free of duplicate columns when
    # this cell is run more than once.
    if name not in names:
        names.append(name)

    # `newline=""` is how the csv module expects its input files to be opened.
    with open(fp, 'r', newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            # The last field is the overlap size; removing it leaves the six
            # H. sap. fields followed by the other species' fields.
            overlap = int(row.pop())

            h_exon = row[3]   # 4th field of the H. sap. record
            o_exon = row[-2]  # second-to-last field of the other record

            # If an H. sap. exon appears on several rows, the last row wins.
            exon_map[h_exon][name] = o_exon if overlap else "-"

In [12]:
# Write one row per H. sap. exon: its typed label, gene, gene biotype, and
# then one column per species listed in `names`.
#
# NOTE(review): this path starts with `../data/`, whereas the glob patterns
# elsewhere in this notebook start with `data/`. Confirm which working
# directory the notebook is meant to run from.
#
# `newline=""` lets the csv writer control line endings itself; without it an
# extra "\r" is written on platforms that use "\r\n".
with open("../data/homology/exons/exons.csv", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["hsap_exon", "hsap_gene", "hsap_gene_biotype"] + names)

    # `exon_map` is populated from the `bedtools` output, so if an exon
    # has no alignment in any species, it's excluded.
    for h_exon, by_species in exon_map.items():
        # Both lookups raise KeyError if the exon is missing from the
        # H. sap. meta-exon table.
        h_type = hsap_exon_data[h_exon]
        h_gene, h_func = hsap_gene_data[h_exon]

        row = [
            h_type + "=" + h_exon,
            h_gene,
            h_func,
        ]

        # Species with no entry for this exon get "-".
        row.extend(by_species.get(name, "-") for name in names)

        writer.writerow(row)