# Model predictions

In [None]:
import seaborn as sns
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import json
import scipy
import pathlib
from os import path
%matplotlib inline

In [None]:
def plot_confusion_matrix(cm,
                          classes,
                          ax,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Greys):
    """
    Draw a confusion matrix as an annotated heat map on ``ax``.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the y axis ("True label"),
        columns along the x axis ("Predicted label").
    classes : sequence of str
        Tick labels, used for both axes.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    normalize : bool, optional
        If true, every row is divided by its row sum before plotting.
    title : str, optional
        Axes title. When omitted, a default title that depends on
        ``normalize`` is used.
    cmap : matplotlib colormap, optional
        Colormap passed to ``ax.imshow``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that was passed in.
    """
    cm = np.asarray(cm)

    if title is None:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    if normalize:
        # A row without any sample divides by zero and becomes NaN. That is
        # tolerated below (NaN cells get no annotation), so only the
        # floating point warning is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    ax.imshow(cm, cmap=cmap)
    # Show one tick per class on both axes.
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        yticklabels=classes,
        title=title,
        ylabel='True label',
        xlabel='Predicted label'
    )
    # Rotate the x tick labels and set their alignment.
    ax.set_xticklabels(classes, rotation=45, ha="right")

    # Annotate every cell whose value exceeds the display threshold.
    fmt = '.2f' if normalize else '.1f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = np.nan_to_num(cm).max() / 2.
    text_th = 0.01 if normalize else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Comparisons with NaN are false, so NaN cells stay empty.
            if cm[i, j] > text_th:
                ax.text(j,
                        i,
                        format(cm[i, j], fmt),
                        ha="center",
                        va="center",
                        color="white" if cm[i, j] > thresh else "black")
    # Use the figure that owns ``ax`` instead of a global named ``fig``.
    ax.figure.tight_layout()
    return ax



In [None]:
# Project root; not referenced again in the visible part of this notebook.
PROJECTROOTDIR = "."
# Directory the figures are saved to.
FIGDIR = "./figures"
# Directory the DeepGRP / dna-brnn result JSON files are read from.
RESULTS_PATH = './results'

In [None]:
# Class names used as confusion-matrix tick labels. LABELS[1:] (everything
# except the first entry) names the per-class metric columns.
LABELS = ['no repeat', 'HSAT2,3', 'Alphoid', 'Alu', 'LINE-1']

In [None]:
# Turn FIGDIR into a Path and create the directory if it does not exist yet.
FIGDIR = pathlib.Path(FIGDIR)
FIGDIR.mkdir(exist_ok=True)

In [None]:
# Select seaborn's "paper" plotting context for all following figures.
sns.set_context('paper')

In [None]:
def preprocess_data(results, labels=None):
    """
    Collect per-class metric lists into one DataFrame per metric.

    Parameters
    ----------
    results : dict
        Nested mapping ``genome -> chromosome -> model -> metrics`` where
        ``metrics`` maps a metric name to its value.
    labels : sequence of str, optional
        Class names. The first entry is skipped, the remaining ones name the
        value columns. Defaults to the module-level ``LABELS``.

    Returns
    -------
    dict of str -> pandas.DataFrame
        One frame per metric name (``'confusionmatrix'`` is skipped) with the
        columns ``dataset``, ``model`` and one column per entry of
        ``labels[1:]``. Each row is one model on one dataset, where dataset
        is ``"<genome>/<chromosome>"`` with ``".fa"`` removed. Metrics whose
        value is not a list contribute no rows, so their frame is empty.
    """
    if labels is None:
        labels = LABELS
    columns = ['dataset', 'model'] + list(labels[1:])

    rows = dict()
    for genome, genomedict in results.items():
        for chrom, models_dicts in genomedict.items():
            dataset = (genome + "/" + chrom).replace('.fa', '')
            for model, metrics in models_dicts.items():
                for name, values in metrics.items():
                    if name == 'confusionmatrix':
                        continue
                    # Register the metric even when it has no list value so
                    # that it still shows up (as an empty frame) in the result.
                    metric_rows = rows.setdefault(name, list())
                    if isinstance(values, list):
                        # The first list entry is dropped, like labels[0].
                        metric_rows.append([dataset, model] + values[1:])
    return {
        name: pd.DataFrame(metric_rows, columns=columns)
        for name, metric_rows in rows.items()
    }

In [None]:
def get_cnf_matrices(results, labels=None):
    """
    Stack the confusion matrices of all models for every dataset.

    Parameters
    ----------
    results : dict
        Nested mapping ``genome -> chromosome -> model -> metrics`` where
        every ``metrics`` has a ``'confusionmatrix'`` entry.
    labels : sequence, optional
        Class names; only the length is used, as the side length of each
        confusion matrix. Defaults to the module-level ``LABELS``.

    Returns
    -------
    dict of str -> numpy.ndarray
        Maps ``"<genome>/<chromosome>"`` to an array of shape
        ``(number of models, len(labels), len(labels))``; models are in the
        iteration order of the inner mapping.
    """
    if labels is None:
        labels = LABELS
    n_classes = len(labels)

    confusion_matrices = dict()
    for genome, genomedict in results.items():
        for chrom, models in genomedict.items():
            stacked = np.zeros((len(models), n_classes, n_classes))
            for i, metrics in enumerate(models.values()):
                stacked[i] = metrics['confusionmatrix']
            confusion_matrices[genome + "/" + chrom] = stacked
    return confusion_matrices



# Running time and Accuracy

In [None]:
# Keyword arguments for facet plots; not referenced again in the visible part
# of this notebook.
styles = {'margin_titles': True, 'ylim': (0, 1), 'height': 5}

In [None]:
# Load the evaluation results of both tools. Further down they are consumed
# as nested mappings genome -> chromosome -> model -> metrics.
with pathlib.Path(RESULTS_PATH,'deepgrp_results.json').open('rb') as file:
    deepgrp_gpu_data = json.load(file)

with pathlib.Path(RESULTS_PATH,'dnabrnn_results.json').open('rb') as file:
    dnabrnn_data = json.load(file)

In [None]:
# "totalMCC" of every DeepGRP model (rows) on every hg19 chromosome (columns).
mcc_deepgrp = pd.DataFrame({chrom:{k:v["totalMCC"] for k, v in data.items()} for chrom, data in deepgrp_gpu_data["hg19"].items()})

In [None]:
# Same table for dna-brnn.
mcc_dnabrnn = pd.DataFrame({chrom:{k:v["totalMCC"] for k, v in data.items()} for chrom, data in dnabrnn_data["hg19"].items()})

In [None]:
# Summary statistics over all (model, chromosome) values of DeepGRP.
mcc_deepgrp.stack().describe()

In [None]:
# Summary statistics over all (model, chromosome) values of dna-brnn.
mcc_dnabrnn.stack().describe()

In [None]:

# One DataFrame per metric name for each tool (see preprocess_data above).
deepgrp_gpu = preprocess_data(deepgrp_gpu_data)
dnabrnn = preprocess_data(dnabrnn_data)

In [None]:
FPR = deepgrp_gpu['FPR']
FPR[''] = 'FPR'

FNR = deepgrp_gpu['FNR']
FNR[''] = 'FNR'

In [None]:
deepgrp_metrics = FPR.append(FNR).set_index(['', 'dataset'])
deepgrp_metrics['modeltype']='deepgrp'

In [None]:
FPR = dnabrnn['FPR']
FPR[''] = 'FPR'

FNR = dnabrnn['FNR']
FNR[''] = 'FNR'

In [None]:
dnabrnn_metrics= FPR.append(FNR).set_index(['', 'dataset'])#
dnabrnn_metrics['modeltype']='dnabrnn'

In [None]:
metrics = deepgrp_metrics.append(dnabrnn_metrics)

In [None]:
metrics = metrics.reset_index()
metrics = metrics[metrics.dataset.isin(("hg19/chr1", "hg38/chr1", "mm10/chr2"))]
metrics =  metrics.set_index(['modeltype','','dataset'])

In [None]:
# Long-format FPR table: select the 'FPR' rows, drop the model name and stack
# the per-class columns into rows.
FPR = metrics.xs("FPR", level=1).drop(columns=["model"]).stack().to_frame().reset_index()

In [None]:
# Same reshaping for the 'FNR' rows.
FNR = metrics.xs("FNR", level=1).drop(columns=["model"]).stack().to_frame().reset_index()

In [None]:
# The stacked column level has no name and therefore comes out of
# reset_index() as "level_2"; the value column is named 0.
FPR.rename(columns={"level_2":"repeat", 0:"FPR"},inplace=True)

In [None]:
FNR.rename(columns={"level_2":"repeat", 0:"FNR"},inplace=True)

In [None]:
# Replace the internal tool identifiers with the names shown in the figures.
FNR.modeltype = FNR.modeltype.replace({"deepgrp":"DeepGRP","dnabrnn": "dna-brnn"})
FPR.modeltype = FPR.modeltype.replace({"deepgrp":"DeepGRP","dnabrnn": "dna-brnn"})

In [None]:
# Figure 2: box plots of the false positive rate per dataset, one panel per
# repeat class, one box per tool.
f = sns.catplot(y="FPR",x="dataset",col="repeat",hue="modeltype",data=FPR, kind="box",color='black',legend=False)
# The legend and the y limit are set through pyplot, i.e. on the current axes.
plt.legend(loc='best')
plt.ylim(0,0.05)
plt.savefig(pathlib.Path(FIGDIR, 'figure2.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Figure 3: the same layout for the false negative rate, without a shared
# x axis and without an explicit y limit.
f = sns.catplot(y="FNR",x="dataset",col="repeat",hue="modeltype",data=FNR, kind="box",color='black',legend=False, sharex=False)
plt.legend(loc='best')
plt.savefig(path.join(FIGDIR, 'figure3.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Per dataset: the stacked confusion matrices of all DeepGRP models.
deepgrp_cnfs = get_cnf_matrices(deepgrp_gpu_data)

In [None]:
# Per dataset: the stacked confusion matrices of all dna-brnn models.
dnabrnn_cnfs = get_cnf_matrices(dnabrnn_data)

In [None]:
# Figure 4: row-normalized confusion matrices of DeepGRP, averaged over all
# models, for hg19/chr1 (left) and hg38/chr1 (right).
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg19/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='hg19/chr1')
# Replace the x tick labels of the left panel with an empty list.
ax[0].set_xticklabels([])
plot_confusion_matrix(deepgrp_cnfs['hg38/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title='hg38/chr1')
fig.subplots_adjust(wspace=0.2)
fig.suptitle("Evaluation of DeepGRP for hg19/chr1 and hg38/chr1", y=0.78, x=0.55)
plt.savefig(path.join(FIGDIR, 'figure4.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Supplement: DeepGRP (left) against dna-brnn (right) on hg19/chr1.
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg19/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='DeepGRP')
ax[0].set_xticklabels([])
plot_confusion_matrix(dnabrnn_cnfs['hg19/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title='dna-brnn')
fig.subplots_adjust(wspace=0.2)
fig.suptitle("Evaluation of DeepGRP and dna-brnn for hg19/chr1", y=0.78, x=0.55)
plt.savefig(path.join(FIGDIR, 'supplement_deepgrp_dnabrnn_hg19_chr1.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Supplement: DeepGRP results stored under 'hg38/chr1' (left) against the
# ones stored under 'dfam/chr1' (right).
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg38/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='RepeatMasker annotation')
ax[0].set_xticklabels([])
plot_confusion_matrix(deepgrp_cnfs['dfam/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title='DFAM annotation')
fig.subplots_adjust(wspace=0.2)
fig.suptitle(" Evaluation of DeepGRP for hg38/chr1 (1)", y=0.78, x=0.55)
plt.savefig(path.join(FIGDIR, 'supplement_dfam.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Supplement: DeepGRP results stored under 'hg38/chr1' (left) against the
# ones stored under 'dfam_and_rm/chr1' (right); both panels show the
# row-normalized mean confusion matrix over all models.
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg38/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='RepeatMasker annotation')
# Replace the x tick labels of the left panel with an empty list.
ax[0].set_xticklabels([])
# Raw string: "\c" is not a valid Python escape sequence, the backslash
# belongs to the mathtext command \cap.
plot_confusion_matrix(deepgrp_cnfs['dfam_and_rm/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title=r'DFAM $\cap$ RepeatMasker annotation')
fig.subplots_adjust(wspace=0.2)
fig.suptitle("Evaluation of DeepGRP for hg38/chr1 (2)", y=0.78, x=0.55)
plt.savefig(path.join(FIGDIR, 'supplement_dfam_repeatmasker.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Supplement: DeepGRP results stored under 'hg38/chr1' (left) against the
# ones stored under 'dfam_no_rm/chr1' (right); both panels show the
# row-normalized mean confusion matrix over all models.
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg38/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='RepeatMasker annotation')
# Replace the x tick labels of the left panel with an empty list.
ax[0].set_xticklabels([])
# Raw string: "\ " is not a valid Python escape sequence; the title keeps
# its literal backslash.
plot_confusion_matrix(deepgrp_cnfs['dfam_no_rm/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title=r'DFAM \ RepeatMasker annotation')
fig.subplots_adjust(wspace=0.2)
fig.suptitle("Evaluation of DeepGRP for hg38/chr1 (3)", y=0.78, x=0.55)
plt.savefig(path.join(FIGDIR, 'supplement_dfam_no_repeatmasker.pdf'),
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Supplement: DeepGRP results stored under 'hg38/chr1' (left) against the
# ones stored under 'hg19_hg38_similar/chr1' (right); both panels show the
# row-normalized mean confusion matrix over all models.
fig, ax = plt.subplots(1, 2, figsize=(5.5, 5.5),sharey=True,sharex=True)
plot_confusion_matrix(deepgrp_cnfs['hg38/chr1'].mean(axis=0),
                      LABELS,
                      ax[0],
                      normalize=True,
                      title='all regions of RM annot.')
# Replace the x tick labels of the left panel with an empty list.
ax[0].set_xticklabels([])
plot_confusion_matrix(deepgrp_cnfs['hg19_hg38_similar/chr1'].mean(axis=0),
                      LABELS,
                      ax[1],
                      normalize=True,
                      title='regions of hg38/chr1 similar to hg19/chr1')
fig.subplots_adjust(wspace=0.2)
fig.suptitle("Evaluation of DeepGRP for hg38/chr1 (4)", y=0.78, x=0.6)
plt.savefig(path.join(FIGDIR, 'supplement_repeatmasker_similar.pdf'),
            bbox_inches='tight',
            pad_inches=0)

# Complete hg19

In [None]:
# Repeat class names keyed by 1..4; not referenced again in the visible part
# of this notebook.
REPEATS = {
    1: 'HSAT2,3',
    2: 'Alphoid',
    3: 'Alu',
    4: 'LINE-1',
}

In [None]:
# Per-class "MCC" tables of both tools as produced by preprocess_data.
deepgrp_mcc = deepgrp_gpu["MCC"]
dnabrnn_mcc = dnabrnn["MCC"]

In [None]:
# Long format: one row per (dataset, model, repeat class). dropna=False keeps
# rows whose MCC is missing.
deepgrp_mcc = deepgrp_mcc.set_index(["dataset","model"]).stack(dropna=False).reset_index().rename(columns={"level_2":"repeat class", 0:"MCC"})
dnabrnn_mcc = dnabrnn_mcc.set_index(["dataset","model"]).stack(dropna=False).reset_index().rename(columns={"level_2":"repeat class", 0:"MCC"})

In [None]:
dnabrnn_mcc['modeltype']='dna-brnn'
deepgrp_mcc['modeltype']='DeepGRP'
mcc = deepgrp_mcc.append(dnabrnn_mcc)

In [None]:
mcc = mcc[mcc.dataset.str.contains("hg19/chr\d+")]

In [None]:
mcc["chromosome"] = pd.Categorical(mcc.dataset.str.replace("hg19/chr", ""), categories=[str(i) for i in range(1,23)], ordered=True)

In [None]:
from matplotlib.ticker import MultipleLocator

In [None]:
# Figure 5: MCC per chromosome, one panel per tool and one marker style per
# repeat class. Missing MCC values are plotted as 0.0, and the empty line
# styles leave the points unconnected.
g = sns.catplot(
    x="chromosome",
    y="MCC",
    hue="repeat class",
    col="modeltype",
    data=mcc.fillna({"MCC":0.0}),
    kind="point",
    ci=None,
    dodge=True,
    palette=['black', 'dimgrey', 'grey', 'darkgrey'],
    markers=['v', 'p', 'o', 'D'],
    height=5,
    aspect=1,
    linestyles=[''] * 4,
)
g.axes[0][0].set_ylabel('Matthews correlation coefficient')
g.axes[0][0].set_title("DeepGRP on hg19")
g.axes[0][1].set_title("dna-brnn on hg19")
# Minor y ticks every 0.1 and a light grid on major and minor ticks.
for ax in g.axes.flat:
    ax.yaxis.set_minor_locator(MultipleLocator(0.1))
    ax.grid(True, axis='both', alpha=0.5, which="both")
plt.savefig(path.join(FIGDIR,'figure5.pdf'), dpi=300, bbox_inches='tight')

In [None]:
# Summary statistics per repeat class and tool.
mcc.groupby(["repeat class", "modeltype"]).describe()

# Running Time

In [None]:
import pathlib
import pandas as pd
import json
import re

In [None]:
def preprocess_data(results):
    """
    Build a table of the ``"runtime"`` entries in ``results``.

    ``results`` maps a dataset name to a mapping ``model -> metrics``. The
    returned DataFrame has one row per dataset and one column per model and
    holds ``metrics["runtime"]``.
    """
    runtimes = {
        dataset: {
            model: metrics["runtime"]
            for model, metrics in models_dicts.items()
        }
        for dataset, models_dicts in results.items()
    }
    return pd.DataFrame.from_dict(runtimes, orient="index")

In [None]:
# DeepGRP running times in long format: one row per (chromosome, model),
# with every model relabelled "DeepGRP".
with pathlib.Path("results",'deepgrp_runningtime.json').open('rb') as file:
    deepgrp_gpu_data = preprocess_data(json.load(file))
deepgrp_gpu_data = deepgrp_gpu_data.stack().reset_index().rename(columns={"level_0":"chromosome","level_1":"model", 0:"running time [s]"})
deepgrp_gpu_data["model"] = "DeepGRP"

In [None]:
# RepeatMasker running times; the "real" column is kept, "user" and "sys" are
# dropped. NOTE(review): the rename of "index" assumes the first CSV column
# has no header name - confirm against the file.
repeatmasker =  pd.read_csv(pathlib.Path("results","repeatmasker_runningtime.csv"), index_col=0)
repeatmasker = repeatmasker.reset_index().rename(columns={"index":"chromosome","real": "running time [s]"}).drop(columns=["user", "sys"])
repeatmasker["model"] = "RepeatMasker"

In [None]:
# DFAM/HMMER running times, read the same way as the RepeatMasker table.
dfam =  pd.read_csv(pathlib.Path("results", "dfam_runningtime.csv"), index_col=0)
dfam = dfam.reset_index().rename(columns={"index":"chromosome","real": "running time [s]"}).drop(columns=["user", "sys"])
dfam["model"] = "DFAM/HMMER"

In [None]:
# dna-brnn running times, processed like the DeepGRP ones.
with pathlib.Path("results",'dnabrnn_runningtime.json').open('rb') as file:
    dnabrnn = preprocess_data(json.load(file))
dnabrnn = dnabrnn.stack().reset_index().rename(columns={"level_0":"chromosome","level_1":"model", 0:"running time [s]"})
dnabrnn["model"] = "dna-brnn"

In [None]:
# All four tools in one long table.
data = pd.concat([deepgrp_gpu_data, repeatmasker,dfam,dnabrnn])

In [None]:
# Keep the part after the first "/" of the dataset name and remove the
# literal ".fa".
data['chromosome'] = data['chromosome'].str.split('/',expand=True)[1].str.replace(".fa","", regex=False)

In [None]:
# Tab-separated, header-less table of chromosome names and lengths, indexed
# by chromosome name.
chr_length = pd.read_csv(pathlib.Path("data","hg19.chrom.sizes"),
sep='\t',
header=None,
names=['chromosome', 'sequence length'],
index_col=0)

In [None]:
# Rescale the lengths by 1e6 (the figure below labels this axis in Mbp).
chr_length['sequence length'] = chr_length['sequence length'] / 1e6

In [None]:
# Attach the sequence length to every running-time row. pd.merge defaults to
# an inner join, so rows without a matching chromosome name are dropped.
data = pd.merge(data,chr_length,left_on='chromosome',right_index=True)

In [None]:
import scipy.stats

In [None]:
# Names given to the five values that scipy.stats.linregress returns.
_regression_fields = ["slope", "intercept", "r-value", "p-value", "stderr"]


def _fit_running_time(group):
    """Regress running time on sequence length for the rows of one tool."""
    fit = scipy.stats.linregress(group["sequence length"],
                                 group["running time [s]"])
    return pd.Series(fit, index=_regression_fields)


# One linear fit per tool; rows are tools, columns the regression results.
reg = data.groupby("model").apply(_fit_running_time)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "paper" plotting context again for this section.
sns.set_context("paper")

In [None]:
# NOTE(review): `header` is not used anywhere in the visible part of this
# notebook.
header = "{} ({:.3g} + {:.3g}n)"
# Figure 6: running time against sequence length with one regression line per
# tool. sns.lmplot returns a FacetGrid, which is stored in `ax` here.
ax = sns.lmplot(x='sequence length',
            y='running time [s]',
            data=data,
            hue="model",
            markers=["x","o", "*","v"],
            palette='Greys')
ax.set(xlabel='sequence length $n$ [Mbp]', ylabel="running time [s]")

#plt.yscale("log")

# NOTE(review): this writes to the literal "figures" directory instead of
# FIGDIR as the earlier figures do.
plt.savefig(pathlib.Path("figures", 'figure6.pdf'),
            dpi=300,
            bbox_inches='tight',
            pad_inches=0)

In [None]:
# Show the regression table (one row per tool).
reg

In [None]:
# Slope of every tool relative to the slope of DeepGRP.
reg["slope"] / reg.loc["DeepGRP", "slope"]