In [1]:
from Bio import SeqIO
import os
import pandas as pd
import sys

In [2]:
def readPIL(fileName, asm=None, component="all", verbose=False):
    """Parse a PILER-CR text report into a component table and a consensus table.

    Parameters
    ----------
    fileName : str
        Path to the PILER-CR report.
    asm : str, optional
        Assembly label stored in every row. When omitted it is derived from
        the file's base name: the extension is dropped and "_pilerCR" removed.
    component : str, default "all"
        "all" keeps every row; any other value keeps only the rows whose
        "component" column equals it (the parser emits "repeat" and "spacer").
    verbose : bool, default False
        When true, print the array count read from the report header and the
        per-component row counts.

    Returns
    -------
    (crip, copp) : tuple of pandas.DataFrame
        crip has one row per repeat/spacer with the columns contig, start,
        end, id, score, strand, asm, tool, cid, component, seq, length.
        "start" is 0-based and "end" is start plus the ungapped length.
        copp has one row per array with the columns asm, tool, cid, conSeq.

    Raises
    ------
    ValueError, IndexError
        If the header or a table row does not have the expected layout
        (e.g. a truncated report).
    """
    baseName = os.path.basename(fileName)
    if asm is None:
        asm = os.path.splitext(baseName)[0].replace("_pilerCR", "")
    tool = "pilerCR"
    crip = []
    copd = {}
    # `with` releases the handle even when a malformed line raises mid-parse.
    with open(fileName) as f:
        # The 4th line carries the array count as its second space-separated token.
        for _ in range(4):
            line = f.readline()
        arr_num = int(line.strip().split(" ")[1])
        # Skip the 7 lines that follow before the per-array section is scanned.
        for _ in range(7):
            line = f.readline()
        for line in f:
            if line == "\n":
                continue
            elif line.startswith("Array"):
                cri_id = asm + "_" + tool[0] + "_" + line.strip().split(" ")[1]
            elif line.startswith(">"):
                contig = line.strip().split(" ")[0].replace(">", "")
            elif line.startswith("       Pos"):
                # Skip the "=====" ruler; `line` then holds the first table row.
                for _ in range(2):
                    line = f.readline()
                idx = 0
                gap_tot = 0
                while not line.startswith("="):
                    idx += 1
                    rep_id = cri_id + "_r" + str(idx)
                    spa_id = cri_id + "_s" + str(idx)
                    # Columns 33 and 45 are blank when a fixed-width field is
                    # empty; fill them with "-" so whitespace splitting still
                    # yields seven fields.
                    chars = list(line)
                    chars[33] = chars[33].replace(" ", "-")
                    chars[45] = chars[45].replace(" ", "-")
                    line = "".join(chars)
                    # A row with no trailing spacer sequence gets a "-" placeholder.
                    if line.endswith(" \n"):
                        line += "-"
                    eles = line.strip().split()
                    if len(eles) != 7:
                        print("Non-7-column line: {}".format(eles), file=sys.stderr)
                        break
                    gap_num = eles[5].count("-")
                    # Earlier alignment gaps are subtracted from the reported position.
                    rep_stt = int(eles[0]) - 1 - gap_tot    # 0-based
                    rep_end = rep_stt + len(eles[5]) - gap_num
                    rep_seq = eles[5]
                    rifs = [contig, rep_stt, rep_end, rep_id, 500, ".", asm, tool, cri_id, "repeat", rep_seq, len(rep_seq) - gap_num]
                    crip.append(rifs)
                    if eles[3] != "-":
                        spa_stt = rep_end    # 0-based
                        spa_end = spa_stt + len(eles[6])
                        spa_seq = eles[6]
                        sifs = [contig, spa_stt, spa_end, spa_id, 1000, ".", asm, tool, cri_id, "spacer", spa_seq, len(spa_seq)]
                        crip.append(sifs)
                    gap_tot += gap_num
                    line = f.readline()
                # The line after the closing ruler ends with the consensus repeat.
                line = f.readline()
                eles = line.strip().split()
                copd[cri_id] = eles[-1]
            elif line.startswith("SUMMARY"):
                break
            else:
                print("Unrecognized line: {}".format(line.strip()), file=sys.stderr)
    # Repeat rows are stored relative to the consensus: "." means "same as
    # consensus at this column" and "-" is an alignment gap. Resolve both on the
    # plain records, which avoids row-by-row DataFrame writes.
    for rec in crip:
        if rec[9] == "repeat":
            con_seq = copd[rec[8]]
            rec[10] = "".join(
                con_seq[i] if char == "." else char for i, char in enumerate(rec[10])
            ).replace("-", "")
    crip = pd.DataFrame(crip, columns=["contig", "start", "end", "id", "score", "strand", "asm", "tool", "cid", "component", "seq", "length"])
    copp = pd.DataFrame(list(copd.items()), columns=["cid", "conSeq"])
    copp.insert(0, "asm", asm)
    copp.insert(1, "tool", tool)
    if component != "all":
        crip = crip.loc[crip["component"] == component]
    if verbose:
        print("CRISPR array number: {}".format(arr_num))
        print(crip["component"].value_counts().to_string())
    return crip, copp

In [3]:
def readMIN(fileName, asm=None, component="all", verbose=False):
    """Parse a minCED text report into a component table and a consensus table.

    Parameters
    ----------
    fileName : str
        Path to the minCED report.
    asm : str, optional
        Assembly label stored in every row. When omitted it is derived from
        the file's base name: the extension is dropped and "_minCED" removed.
    component : str, default "all"
        "all" keeps every row; any other value keeps only the rows whose
        "component" column equals it (the parser emits "repeat" and "spacer").
    verbose : bool, default False
        When true, print the number of arrays parsed and the per-component
        row counts.

    Returns
    -------
    (crip, copp) : tuple of pandas.DataFrame
        crip has one row per repeat/spacer with the columns contig, start,
        end, id, score, strand, asm, tool, cid, component, seq, length.
        "start" is 0-based and "end" is start plus the sequence length.
        copp has one row per array with the columns asm, tool, cid, conSeq,
        where conSeq is the most frequent repeat sequence of that array
        (the first one seen wins a tie).

    Raises
    ------
    ValueError, IndexError
        If a table row does not have the expected tab-separated layout
        (e.g. a truncated report).
    """
    baseName = os.path.basename(fileName)
    if asm is None:
        asm = os.path.splitext(baseName)[0].replace("_minCED", "")
    tool = "minCED"
    crip = []
    copd = {}
    # `with` releases the handle even when a malformed line raises mid-parse.
    with open(fileName) as f:
        for line in f:
            if line.startswith("Sequence"):
                contig = line.strip().split(" ")[1].replace("'", "")
            elif line == "\n" or line.startswith("Repeats") or line.startswith("Time"):
                continue
            elif line.startswith("CRISPR"):
                cri_id = asm + "_" + tool[0] + "_" + line.strip().split(" ")[1]
            elif line.startswith("POSITION"):
                # Skip the dashed ruler; `line` then holds the first table row.
                for _ in range(2):
                    line = f.readline()
                idx = 0
                while not line.startswith("-"):
                    idx += 1
                    rep_id = cri_id + "_r" + str(idx)
                    spa_id = cri_id + "_s" + str(idx)
                    eles = line.strip().split("\t")
                    rep_stt = int(eles[0]) - 1    # 0-based
                    rep_end = rep_stt + len(eles[2])
                    rep_seq = eles[2]
                    rifs = [contig, rep_stt, rep_end, rep_id, 500, ".", asm, tool, cri_id, "repeat", rep_seq, len(rep_seq)]
                    copd.setdefault(cri_id, []).append(rep_seq)
                    crip.append(rifs)
                    # Only rows with five tab-separated fields carry a spacer;
                    # the last repeat of an array has none.
                    if len(eles) == 5:
                        spa_stt = rep_end    # 0-based
                        spa_end = spa_stt + len(eles[3])
                        spa_seq = eles[3]
                        sifs = [contig, spa_stt, spa_end, spa_id, 1000, ".", asm, tool, cri_id, "spacer", spa_seq, len(spa_seq)]
                        crip.append(sifs)
                    line = f.readline()
                # Collapse the collected repeats to the most frequent one.
                copd[cri_id] = max(copd[cri_id], key=copd[cri_id].count)
            else:
                print("Unrecognized line: {}".format(line.strip()), file=sys.stderr)
    # The report has no array-count header to read, so count what was parsed.
    # (This name was previously undefined here, so verbose=True raised NameError.)
    arr_num = len(copd)
    crip = pd.DataFrame(crip, columns=["contig", "start", "end", "id", "score", "strand", "asm", "tool", "cid", "component", "seq", "length"])
    copp = pd.DataFrame(list(copd.items()), columns=["cid", "conSeq"])
    copp.insert(0, "asm", asm)
    copp.insert(1, "tool", tool)
    if component != "all":
        crip = crip.loc[crip["component"] == component]
    if verbose:
        print("CRISPR array number: {}".format(arr_num))
        print(crip["component"].value_counts().to_string())
    return crip, copp

In [4]:
def validateSeq(criRes, genomeDir="input/asm"):
    """Check every parsed sequence against the genome slice it claims to cover.

    For each distinct value in criRes["asm"], the FASTA file
    "<genomeDir>/<asm>.fa" is indexed. Each row's "seq" is then compared with
    contig[start:end] from that index. The first mismatching row is printed
    and checking stops; if every row matches, a success message is printed.

    Parameters
    ----------
    criRes : pandas.DataFrame
        Must provide the columns asm, contig, start, end and seq.
    genomeDir : str, default "input/asm"
        Directory holding one "<asm>.fa" file per assembly.

    Returns
    -------
    None
    """
    rexd = {}
    try:
        for asm in criRes["asm"].unique():
            rexd[asm] = SeqIO.index(os.path.join(genomeDir, asm + ".fa"), "fasta")
        for idx, row in criRes.iterrows():
            recd = rexd[row["asm"]]
            seq_gt = str(recd[row["contig"]].seq[row["start"]: row["end"]])
            if row["seq"] != seq_gt:
                print(row)
                break
        else:
            # Only reached when the loop finished without hitting a mismatch.
            print("All the sequences have been validated successfully.")
    finally:
        # Each index keeps its FASTA file open; release the handles even when
        # a lookup above raises.
        for recd in rexd.values():
            recd.close()
    return None

In [5]:
if __name__ == "__main__":
    criDir = "output/crispr"
    # Pick the parser from the tool tag in the file name; the first matching
    # tag wins, in the same order as before (pilerCR, then minCED).
    parsers = (("pilerCR.out", readPIL), ("minCED.out", readMIN))
    cripList = []
    coppList = []
    for fl in sorted(os.listdir(criDir)):
        for tag, parser in parsers:
            if tag in fl:
                crip, copp = parser(os.path.join(criDir, fl))
                cripList.append(crip)
                coppList.append(copp)
                break
    if not cripList:
        raise FileNotFoundError("No pilerCR/minCED reports found in {}".format(criDir))
    # Concatenate once rather than growing the frames inside the loop.
    criRes = pd.concat(cripList)
    conRes = pd.concat(coppList)
    # This section summarises the repeat/spacer rows, not the consensus repeats.
    print("Repeats and spacers...")
    print(criRes.shape)
    # Per (asm, tool, component): (row count, shortest seq, longest seq).
    stat = criRes.pivot_table(index=["asm", "tool"], columns="component", values="seq", aggfunc=lambda x: (len(x), min([len(i) for i in x]), max([len(i) for i in x]))).reset_index()
    stat.columns.name = None
    print(stat)
    validateSeq(criRes)
    print()
    print("Consensus repeats...")
    print(conRes.shape)
    stat_con = conRes.pivot_table(index=["asm", "tool"], values="cid", aggfunc=len).reset_index()
    print(stat_con)
    # Every array seen in the component table must have one consensus row.
    stat_arr = criRes.pivot_table(index=["asm", "tool"], values="cid", aggfunc=lambda x: len(x.unique())).reset_index()
    assert all(stat_con["cid"] == stat_arr["cid"]), "consensus count differs from array count for some asm/tool"

Consensus repeats...
(3789, 12)
                asm     tool         repeat         spacer
0   GCA_001594005.1   minCED  (256, 18, 30)  (250, 10, 60)
1   GCA_001594005.1  pilerCR  (138, 29, 32)  (134, 19, 39)
2   GCA_002426795.1   minCED   (41, 36, 36)   (40, 28, 31)
3   GCA_002426795.1  pilerCR   (35, 36, 36)   (34, 28, 31)
4   GCA_009918665.1   minCED  (131, 16, 24)   (114, 8, 49)
5   GCA_009918665.1  pilerCR    (9, 27, 53)    (6, 16, 44)
6   GCA_009918765.1   minCED  (114, 16, 20)    (99, 8, 87)
7   GCA_009918765.1  pilerCR    (6, 40, 51)    (4, 17, 37)
8   GCA_010672215.1   minCED  (322, 16, 58)   (286, 8, 75)
9   GCA_010672215.1  pilerCR  (150, 36, 48)  (131, 22, 49)
10  GCA_017887005.1   minCED   (10, 32, 32)    (9, 33, 38)
11  GCA_017887005.1  pilerCR    (9, 29, 32)    (8, 33, 40)
12  GCA_903819865.1   minCED  (122, 16, 39)   (101, 8, 60)
13  GCA_903819865.1  pilerCR   (39, 36, 38)   (34, 32, 41)
14  GCA_903830605.1   minCED  (123, 16, 46)   (102, 8, 43)
15  GCA_903830605.1  pil