In [34]:
import pandas as pd

er = "0075"
rep = "3"

# ------- INPUT ------- 
stellar_out_file = "../stellar/rep" + rep + "_e" + er + ".gff"
# stellar_out_file = snakemake.input.stellar

truth_file = "../ground_truth/rep" + rep + "_e" + er + ".tsv"
# truth_file = snakemake.input.truth

# ------- OUTPUT ------- 
evaluation_file = "../evaluation/" + rep + "_e" + er + ".tsv"
#evaluation_file = snakemake.output[0]

In [35]:
# ------- preprocess stellar output ------- 
stellar_df = pd.read_csv(stellar_out_file, sep='\t', header=None)
stellar_df.columns = ["DNAME", "stellar", "eps-matches", "DBEGIN", "DEND", "PERCID", "DSTRAND", ".", "ATTRIBUTES"]

interesting_attributes = stellar_df['ATTRIBUTES'].str.split(';',expand=True).drop(labels = [0, 2, 3], axis = 1)
interesting_attributes = interesting_attributes[1].str.split('=', expand=True).drop(labels=0, axis = 1)
interesting_attributes = interesting_attributes[1].str.split(',', expand=True)
interesting_attributes.columns = ['QBEGIN', 'QEND']

stellar_df = stellar_df.drop(labels = ["stellar", "eps-matches", "DSTRAND", "ATTRIBUTES"], axis = 1)
stellar_df = stellar_df.join(interesting_attributes)
stellar_df["QBEGIN"] = pd.to_numeric(stellar_df["QBEGIN"])
stellar_df["QEND"] = pd.to_numeric(stellar_df["QEND"])

# convert stellar 1-based indices to 0-based
stellar_df["QBEGIN"] = stellar_df.apply(lambda row: row.QBEGIN - 1, axis=1)
stellar_df["QEND"] = stellar_df.apply(lambda row: row.QEND - 1, axis=1)

sorted_stellar = stellar_df.sort_values('QBEGIN')
sorted_stellar["length"] = sorted_stellar.apply(lambda row: row.DEND - row.DBEGIN, axis=1)

In [36]:
# ------- preprocess ground truth -------
truth_df = pd.read_csv(truth_file, sep='\t')
truth_df['QEND'] =  truth_df.apply(lambda row: row.position + row.length, axis=1)
truth_df.rename(columns = {'position':'QBEGIN'}, inplace = True)

In [37]:
truth_df.head()

Unnamed: 0,id,QBEGIN,length,QEND
0,l200-33,1184,200,1384
1,l200-65,1556,200,1756
2,l200-15,8220,200,8420
3,l150-27,8960,150,9110
4,l50-93,13122,50,13172


In [45]:
def evaluate_ground_truth(ground_truth):
    # ------- evaluate results -------
    total_match_count = len(ground_truth["id"]) 
    true_match_count = 0
    overlap_list = []
    #min_overlap = snakemake.config["min_overlap"]
    min_overlap = 30
    
    for t_ind in range(total_match_count):
        truth_range = range(ground_truth.iloc[t_ind]['QBEGIN'],ground_truth.iloc[t_ind]['QEND'])
        for s_ind in range(len(sorted_stellar['DNAME'])): 
            stellar_range = range(sorted_stellar.iloc[s_ind]['QBEGIN'],sorted_stellar.iloc[s_ind]['QEND'])

            # find overlap between two ranges
            overlap_range = range(max(truth_range[0], stellar_range[0]), min(truth_range[-1], stellar_range[-1])+1)
            if (len(overlap_range) >= min_overlap):
                true_match_count += 1
                overlap_list.append(len(overlap_range)) # TODO: might want to check the overlap lengths
                break # move on to next local match once current one is verified to be true

    missed = 1.0 - min(true_match_count/total_match_count, 1.0)
    data = [[total_match_count, true_match_count, missed]]
    out_df = pd.DataFrame(data, columns = ["total_match_count", "true_match_count", "missed"])

    return out_df
    #out_df.to_csv(evaluation_file, sep='\t')

In [49]:
for l in [50, 100, 150, 200]:
    truth_l = truth_df[truth_df['length']==l]
    out = evaluate_ground_truth(truth_l)
    print("Stellar missed " + str(out['missed'][0]) + "% of local matches of length " + str(l) + "bp.")

Stellar missed 0.76% of local matches of length 50bp.
Stellar missed 0.0% of local matches of length 100bp.
Stellar missed 0.0% of local matches of length 150bp.
Stellar missed 0.0% of local matches of length 200bp.
