In [109]:
import os
import pandas as pd
import re

# Directory that holds the per-run "hpylori_*_candidates.tsv" files read further down.
# NOTE(review): absolute, machine-specific location; adjust it before running elsewhere.
path = os.path.abspath("/group/ag_abi/seiler/daisy_review/sim_run/candidates/")

In [110]:
def parse_line(line, row_list, Type=""):
    """Turn one tab-separated candidate record into a dict and append it to ``row_list``.

    Parameters
    ----------
    line : str
        A record with exactly 15 tab-separated fields (no trailing newline).
    row_list : list
        Output list; the new row dict is appended in place.
    Type : str, optional
        Label stored under the "Type" key. When left empty, the first field of
        ``line`` is used; otherwise the first field is ignored.

    Raises
    ------
    ValueError
        If ``line`` does not split into exactly 15 fields.

    All values are stored as the raw strings produced by the split.
    """
    (type_in_line, name, accession, tax_id, parent_tax_id, species_tax_id,
     abundance, num_reads, unique_reads, coverage, validity, homogeneity,
     mapping_error, property_score, property_name) = line.split('\t')

    row_list.append({
        # An explicit label overrides whatever the record itself says.
        "Type": type_in_line if Type == "" else Type,
        "Name": name,
        "Accession.Version": accession,
        "TaxID": tax_id,
        "Parent TaxID": parent_tax_id,
        "Species TaxID": species_tax_id,
        "Abundance": abundance,
        "Num. Reads": num_reads,
        "Unique Reads": unique_reads,
        "Coverage": coverage,
        "Validity": validity,
        "Homogeneity": homogeneity,
        "Mapping Error": mapping_error,
        "Property Score": property_score,
        "Property": property_name,
    })
    

In [111]:
def parse_file(file_name, row_list):
    """Read a candidates file and append one row dict per record to ``row_list``.

    The first line is treated as a header and skipped. Separator lines and the
    three section headings are not records; each heading switches the label
    that is passed to ``parse_line`` for the records that follow it:

    * before any heading: the label is taken from the record itself
    * "Acceptors within same species ...": "Support Acceptor"
    * "Donors within same species as first donor ...": "Support Donor 1"
    * "Donors within same species as second donor ...": "Support Donor 2"

    Parameters
    ----------
    file_name : str
        Path of the tab-separated candidates file.
    row_list : list
        Output list, extended in place via ``parse_line``.
    """
    separator = "======================================"
    section_labels = {
        "Acceptors within same species as first acceptor and higher score than second acceptor:": "Support Acceptor",
        "Donors within same species as first donor and lower score than second donor:": "Support Donor 1",
        "Donors within same species as second donor and lower score than third donor:": "Support Donor 2",
    }

    label = ""  # empty label: parse_line uses the record's own first field
    with open(file_name, 'r') as stream:
        next(stream, None)  # skip header; the default keeps an empty file from raising StopIteration
        for raw_line in stream:
            # Strip only the line terminator: a bare rstrip() would also remove
            # trailing tabs and thereby drop empty trailing fields.
            line = raw_line.rstrip("\n")
            if not line or line == separator:
                continue
            if line in section_labels:
                label = section_labels[line]
                continue
            # The notebook defines parse_line; the previous call to `parse`
            # referred to a name that does not exist.
            parse_line(line, row_list, label)

In [112]:
def create_df(file_name):
    """Parse a candidates file into a DataFrame with a fixed column order.

    Parameters
    ----------
    file_name : str
        Path handed to ``parse_file``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record. The columns are always the 15 listed below,
        in this order, even when the file contains no records (the frame is
        then empty instead of the column selection raising ``KeyError``).
    """
    columns = [
        "Type",
        "Name",
        "Accession.Version",
        "TaxID",
        "Parent TaxID",
        "Species TaxID",
        "Abundance",
        "Num. Reads",
        "Unique Reads",
        "Coverage",
        "Validity",
        "Homogeneity",
        "Mapping Error",
        "Property Score",
        "Property",
    ]
    row_list = []
    parse_file(file_name, row_list)
    # Passing `columns` fixes the order and also defines the columns of an empty frame.
    return pd.DataFrame(row_list, columns=columns)

In [137]:
def analyse_df(df, row_list, snp_rate=None, indel_rate=None, run=None,
               acceptor_accession="NC_010473.1", donor_accession="NZ_AP014710.1"):
    """Record, for one candidates table, whether the expected accessions were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns "Type" and "Accession.Version".
    row_list : list
        Output list; one summary dict is appended in place.
    snp_rate, indel_rate, run : optional
        Values stored under "SNP Rate", "Indel Rate" and "Repeat". When left as
        ``None`` the value is read from the global of the same name, which is
        how this function originally obtained them from the driver loop.
    acceptor_accession, donor_accession : str, optional
        Accessions counted as true positives. The defaults are the values that
        were previously hard-coded.

    Raises
    ------
    NameError
        If a run parameter is neither passed nor defined as a global.

    The summary holds 0/1 flags for "TP Acceptor", "TP Donor", "TP AccDon" and
    "TP Support Acceptor". "TP Support Donor" adds up the flags of the
    "Support Donor 1" and "Support Donor 2" rows, so it can reach 2.
    """
    def resolve(value, global_name):
        # Backward compatibility: fall back to the notebook-level variable.
        if value is not None:
            return value
        try:
            return globals()[global_name]
        except KeyError:
            raise NameError("name '{}' is not defined".format(global_name)) from None

    def found(row_type, accession):
        # Single boolean mask instead of chained df.loc[...][...] indexing.
        mask = (df["Type"] == row_type) & (df["Accession.Version"] == accession)
        return int(mask.any())

    row_list.append({
        "SNP Rate": resolve(snp_rate, "snp_rate"),
        "Indel Rate": resolve(indel_rate, "indel_rate"),
        "Repeat": resolve(run, "run"),
        "TP Acceptor": found("Acceptor", acceptor_accession),
        "TP Donor": found("Donor", donor_accession),
        "TP AccDon": found("Acceptor-like Donor", donor_accession),
        "TP Support Acceptor": found("Support Acceptor", acceptor_accession),
        "TP Support Donor": (found("Support Donor 1", donor_accession)
                             + found("Support Donor 2", donor_accession)),
    })

In [138]:
# Build one summary row per simulated data set: ten SNP/indel rate pairs, two repeats each.
# analyse_df picks up snp_rate, indel_rate and run from the notebook's global
# namespace when they are not passed in, so these loop variable names are load-bearing.
row_list = []
rate_pairs = [[str(x/100), str(x/1000)] for x in range(1, 11)]
for snp_rate, indel_rate in rate_pairs:
    for run in ("1", "2"):
        file_name = os.path.join(
            path,
            "hpylori_{}_{}_{}_candidates.tsv".format(snp_rate, indel_rate, run),
        )
        candidates = create_df(file_name)
        analyse_df(candidates, row_list)

# Fix the column order of the final table.
summary_columns = [
    "SNP Rate",
    "Indel Rate",
    "Repeat",
    "TP Acceptor",
    "TP Donor",
    "TP AccDon",
    "TP Support Acceptor",
    "TP Support Donor",
]
df = pd.DataFrame(row_list)
df = df[summary_columns]

In [140]:
# Show the summary table: one row per (SNP rate, indel rate, repeat) combination.
df

Unnamed: 0,SNP Rate,Indel Rate,Repeat,TP Acceptor,TP Donor,TP AccDon,TP Support Acceptor,TP Support Donor
0,0.01,0.001,1,1,1,0,0,0
1,0.01,0.001,2,1,1,0,0,0
2,0.02,0.002,1,1,1,0,0,0
3,0.02,0.002,2,1,0,0,0,0
4,0.03,0.003,1,1,1,0,0,0
5,0.03,0.003,2,1,1,0,0,0
6,0.04,0.004,1,0,1,0,0,0
7,0.04,0.004,2,0,1,0,0,0
8,0.05,0.005,1,0,1,0,0,0
9,0.05,0.005,2,0,0,0,0,0


In [None]:
# df.to_csv(os.path.join(path, "ratios.tsv"), sep='\t', index=False)