In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

import msastats
from spartaabc.abc_inference import load_data, load_correction_regressors, bias_correction, IndelParams
from spartaabc.utility import get_msa_path, PARAMS_LIST

In [30]:
distance_metric = "mahal"
top_cutoff = 1000
aligner = "mafft"
main_path = Path("data").resolve()
print(main_path)

/home/pupkolab/Dev/SpartaV2/benchmark/data


In [25]:
def get_top_params(main_path: Path):
    MSA_PATH = get_msa_path(main_path)

    empirical_stats = msastats.calculate_fasta_stats(MSA_PATH)

    stats_data = load_data(main_path)
    regressors = load_correction_regressors(main_path, aligner)

    params_data = []
    full_stats_data = []
    for model in  stats_data.keys():
        current_regressors = regressors.get(model, None)
        params_data.append(stats_data[model][PARAMS_LIST])

        if current_regressors is not None:
            temp_df = bias_correction(current_regressors, stats_data[model])
            full_stats_data.append(temp_df)

    params_data = pd.concat(params_data)
    full_stats_data = pd.concat(full_stats_data)

    calculated_distances = None

    if distance_metric == "mahal":
        cov = np.cov(full_stats_data.T)
        cov = cov + np.eye(len(cov))*1e-4
        inv_covmat = np.linalg.inv(cov)
        u_minus_v = empirical_stats-full_stats_data
        left = np.dot(u_minus_v, inv_covmat)
        calculated_distances = np.sqrt(np.sum(u_minus_v*left, axis=1))
    if distance_metric == "euclid":
        weights = 1/(full_stats_data.std(axis=0) + 0.001)
        calculated_distances = np.sum(weights*(full_stats_data - empirical_stats)**2, axis=1)

    full_stats_data["distances"] = calculated_distances
    full_stats_data[PARAMS_LIST] = params_data

    top_stats = full_stats_data.nsmallest(top_cutoff, "distances")
    return top_stats

In [42]:
top_params = {}
for dir in main_path.iterdir():
    try:
        top_params[dir] = get_top_params(dir)
    except:
        continue

/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS30028_89
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS20040_87
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS20037_65
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS50015_32
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS30024_69
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS30017_15
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS20011_21
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS12029_12
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS11006_8
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS50012_49
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS20019_24
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS12037_13
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS12028_8
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS12027_13
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS30010_50
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS11019_10
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS11038_8
/home/pupkolab/Dev/SpartaV2/benchmark/data/BBS12015_12
/home/pupkola

In [33]:
def get_indel_model(data_path: Path):
    raw_data = (data_path / "true_params.txt").read_text()
    lines = raw_data.splitlines()[:-1]
    model_params = []
    for line in lines:
        param, value = line.split(": ")
        if param == "Model":
            model_params.append(value)
        if param == "Root_length":
            model_params.append(int(value))
        if param in ["R_I", "R_D", "R_ID", "A_I", "A_D", "A_ID"]:
            model_params.append(float(value))
    return model_params    

In [34]:
def get_all_model_params(data_path: Path, models, abc_params: pd.DataFrame):
    top_cutoff = 200
    true_params = get_indel_model(data_path)

    if true_params[0] == "sim":
        filtered_top_params = abc_params[abc_params["insertion_rate"] == abc_params["deletion_rate"]]
        filtered_top_params = filtered_top_params.nsmallest(top_cutoff, "distances")

        root_length = int(filtered_top_params["root_length"].mean())
        R_ID = float(filtered_top_params["insertion_rate"].mean())
        A_ID = float(filtered_top_params["length_param_insertion"].mean())
        models["sim"]["root_lengths"].append((true_params[1], root_length))
        models["sim"]["indel_rates"].append((true_params[2], R_ID))
        models["sim"]["indel_length_params"].append((true_params[3], A_ID))
    else:
        filtered_top_params = abc_params[abc_params["insertion_rate"] != abc_params["deletion_rate"]]
        filtered_top_params = filtered_top_params.nsmallest(top_cutoff, "distances")
        root_length = int(filtered_top_params["root_length"].mean())
        R_I = float(filtered_top_params["insertion_rate"].mean())
        R_D = float(filtered_top_params["deletion_rate"].mean())
        A_I = float(filtered_top_params["length_param_insertion"].mean())
        A_D = float(filtered_top_params["length_param_deletion"].mean())

        models["rim"]["root_lengths"].append((true_params[1], root_length))
        models["rim"]["insertion_rates"].append((true_params[2], R_I))
        models["rim"]["deletion_rates"].append((true_params[3], R_D))
        models["rim"]["insertion_length_params"].append((true_params[4], A_I))
        models["rim"]["deletion_length_params"].append((true_params[5], A_D))

    return models


In [35]:

models = {"rim":
        {
            "root_lengths": [],
            "insertion_rates": [],
            "deletion_rates": [],
            "insertion_length_params": [],
            "deletion_length_params": []
        },
        "sim":
        {
            "root_lengths": [],
            "indel_rates": [],
            "indel_length_params": [],
        }
    }

for dir in main_path.iterdir():
    try:
        get_all_model_params(dir, models, top_params[dir])
    except:
        continue

In [36]:
top_params

{}

In [37]:
def plot_benchmarks(model: str,param: str):
    x, y = zip(*models[model][param])
    r_val = 0#pearsonr(x,y)[0]
    print(r_val)
    plt.scatter(x, y)
    ax = plt.gca()
    ax.text(ax.get_xlim()[1], ax.get_ylim()[0], f"R={r_val}",
            horizontalalignment='right',
            verticalalignment='bottom')

    plt.show()


In [38]:
plot_benchmarks("rim", "root_lengths")
plot_benchmarks("rim", "insertion_rates")
plot_benchmarks("rim", "deletion_rates")
plot_benchmarks("rim", "insertion_length_params")
plot_benchmarks("rim", "deletion_length_params")




ValueError: not enough values to unpack (expected 2, got 0)

In [39]:
plot_benchmarks("sim", "root_lengths")
plot_benchmarks("sim", "indel_rates")
plot_benchmarks("sim", "indel_length_params")


ValueError: not enough values to unpack (expected 2, got 0)