# Imports

In [1]:
import itertools
import math
import pandas as pd
import json
import os
import glob

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import screed
from sklearn import metrics

%matplotlib inline

In [2]:
def describe(df):
    print(df.shape)
    print("--- First 5 entries ---")
    display(df.head())
    print('--- Random subset ---')
    display(df.sample(5))

# Get gold standard reading frames

## Read gold standard reading frame file

In [3]:
parquet = '/mnt/ibm_sm/home/olga/pipeline-results/human-simulated/true_reading_frames.parquet'

true_coding_frame = pd.read_parquet(parquet)
# Create just a series (single column) from this
true_coding_frame = true_coding_frame['is_coding']
true_coding_frame.head()

read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=1      True
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=2     False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=3     False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=-1    False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=-2    False
Name: is_coding, dtype: bool

# Read concatenated scores for metrics

## Human

In [4]:
%%time


has_stop_codon = 'Translation frame has stop codon(s)'


parquet = '/mnt/ibm_sm/home/olga/pipeline-results/human-simulated/nf-predictorthologs--busco-mammalia-human/translate/coding_scores.parquet'
human_scores_for_metrics = pd.read_parquet(parquet, use_threads=True)
human_scores_for_metrics = human_scores_for_metrics.query('(category == "Non-coding") or (category == "Coding")')

# human_scores_for_metrics = human_scores_for_metrics.set_index('read_id_frame')
print(human_scores_for_metrics.shape)
human_scores_for_metrics.head()

(43973736, 12)
CPU times: user 4min 43s, sys: 2min 20s, total: 7min 4s
Wall time: 4min 30s


### Make sure none of the jaccard in peptide db is null

In [5]:
human_scores_for_metrics.category.value_counts()

Non-coding    30860350
Coding        13113386
Name: category, dtype: int64

In [6]:
human_scores_for_metrics.head()

Unnamed: 0_level_0,read_id,jaccard_in_peptide_db,n_kmers,category,translation_frame,filename,alphabet,ksize,species,is_coding,protein_id,uniprot_id
read_id_frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.888889,36.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,14,human,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.846154,26.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,24,human,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.875,32.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,18,human,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.789474,19.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,31,human,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.84,25.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,25,human,False,sp|O43295|ENSP00000373347,O43295


## Mouse

In [7]:
%%time

parquet = '/mnt/ibm_sm/home/olga/pipeline-results/human-simulated/nf-predictorthologs--busco-mammalia-mouse/translate/coding_scores.parquet'
mouse_scores_for_metrics = pd.read_parquet(parquet, use_threads=True)
# mouse_scores_for_metrics = mouse_scores_for_metrics.set_index('read_id_frame')
mouse_scores_for_metrics = mouse_scores_for_metrics.query('(category == "Non-coding") or (category == "Coding")')
print(mouse_scores_for_metrics.shape)
mouse_scores_for_metrics.head()

(43973736, 12)
CPU times: user 3min 3s, sys: 1min 52s, total: 4min 55s
Wall time: 2min 29s


### Make sure none of the jaccard in peptide db is null

In [8]:
mouse_scores_for_metrics.jaccard_in_peptide_db.isnull().sum()

0

In [9]:
mouse_scores_for_metrics.head()

Unnamed: 0_level_0,read_id,jaccard_in_peptide_db,n_kmers,category,translation_frame,filename,alphabet,ksize,species,is_coding,protein_id,uniprot_id
read_id_frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,36.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,14,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,26.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,24,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,32.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,18,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,19.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,31,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,25.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,25,mouse,False,sp|O43295|ENSP00000373347,O43295


## Combine mouse and human to compute scores on one

In [10]:
combined_scores = pd.concat([human_scores_for_metrics, mouse_scores_for_metrics])
combined_scores = combined_scores.sort_index()
print(combined_scores.shape)
combined_scores.head()

(87947472, 12)


Unnamed: 0_level_0,read_id,jaccard_in_peptide_db,n_kmers,category,translation_frame,filename,alphabet,ksize,species,is_coding,protein_id,uniprot_id
read_id_frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.888889,36.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,14,human,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,27.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,23,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,35.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,15,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,37.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,protein,13,mouse,False,sp|O43295|ENSP00000373347,O43295
read100001/sp|O43295|ENSP00000373347;mate1:1283-1432;mate2:1391-1540__frame=-3,read100001/sp|O43295|ENSP00000373347;mate1:128...,0.0,31.0,Non-coding,-3,Homo_sapiens_9606_qfo_dna_01.fq.gz,dayhoff,19,mouse,False,sp|O43295|ENSP00000373347,O43295


# Compute accuracy/F1 scores

## Human F1, accuracy, etc...scores

In [11]:
%%time

score_names = 'accuracy', 'f1', 'jaccard', 'precision', 'recall',  


metrics = {
    'f1_score': sklearn.metrics.f1_score,
    'accuracy_score': sklearn.metrics.accuracy_score,
    'jaccard_score': sklearn.metrics.jaccard_score,
    'precision_score': sklearn.metrics.precision_score,
    'recall_score': sklearn.metrics.recall_score,
}

def score_coding_metrics(coding_results, true_coding_frame=true_coding_frame, metrics=metrics):

    dfs = []
    for score_name, scorer in tqdm(metrics.items()):
        print(f'score_name: {score_name}')
    #     scorer = sklearn.metrics.get_scorer(score_name)

        scores = coding_results.groupby(['species', 'alphabet', 'ksize']).is_coding.apply(
            lambda x: scorer(true_coding_frame[x.index], x))

        scores.name = 'score_value'
        scores_df = scores.reset_index()
        scores_df['score_name'] = score_name
        dfs.append(scores_df)
    metrics_df = pd.concat(dfs)
    return metrics_df

combined_metrics = score_coding_metrics(combined_scores)
print(combined_metrics.shape)
combined_metrics.head()

NameError: name 'sklearn' is not defined

### Write human metrics to csv!!!

In [12]:
csv = '/mnt/ibm_sm/home/olga/pipeline-results/human-simulated/human_mouse__coding_scores_metrics.csv'
combined_metrics.to_csv(csv, index=False)

NameError: name 'combined_metrics' is not defined

### Plot metrics

In [None]:
sns.catplot(x='ksize', y='score_value', hue='alphabet', col='score_name', 
            data=combined_metrics, kind='point', col_wrap=3)

## Compute ROC AUC

### Human

In [None]:
%%time

def compute_roc_auc(coding_results, true_coding_frame=true_coding_frame):
    scores = coding_results.groupby(['species', 'alphabet', 'ksize']).jaccard_in_peptide_db.apply(
        lambda x: sklearn.metrics.roc_auc_score(true_coding_frame[x.index], x))

    scores.name = 'score_value'
    scores_df = scores.reset_index()
    scores_df['score_name'] = 'roc_auc'

    return scores_df

combined_roc_auc_score = compute_roc_auc(combined_metrics)
combined_roc_auc_score.head()


## Compute ROC Curve

### Human

In [None]:
%%time

def _get_roc_curve_df(y_true, y_score):
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true, y_score)
    
    df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    df.index.name = 'threshold_i'
    return df



def compute_roc_curve(coding_results, true_coding_frame=true_coding_frame):
    curve_df = coding_results.groupby(['species', 'alphabet', 'ksize']).jaccard_in_peptide_db.apply(
        lambda x: _get_roc_curve_df(true_coding_frame[x.index], x))

    curve_df['score_name'] = 'roc_auc'

    return curve_df

combined_roc_curve = compute_roc_curve(combined_metrics)
describe(combined_roc_curve)

# Concatenate ROC AUC with other metrics

## Combine non-curve sscores

In [None]:
concatenated_metrics = pd.concat([combined_roc_auc_score, combined_metrics], ignore_index=True)
describe(concatenated_metrics)

In [None]:
concatenated_metrics.score_name.value_counts()

## Plot combined metrics

### Groupby score name, one by one

In [None]:
for score_name, df in concatenated_metrics.groupby('score_name'):
    g = sns.catplot(x='ksize', y='score_value', hue='species', col='alphabet', 
            data=df, kind='point', palette="Set2")
    g.set(ylabel=score_name, ylim=(0, 1))

## Just plot ROC AUC

In [None]:
combined_roc_auc = concatenated_metrics.query('score_name == "roc_auc"')

In [None]:
combined_roc_auc.loc[combined_roc_auc.groupby(['species', 'score_name', 'alphabet']).score_value.idxmax()]

In [None]:
def plot_argmax(x, y, *args, **kwargs):
    idxmax = y.idxmax()
    plt.vlines(x[idxmax], 0, y.max(), linestyle='--', *args, **kwargs)
#     print('x:', x)
#     print('y:', y)
    print('args:', args)
    print('kwargs:', kwargs)
    pass

alphabet_order = ['protein', 'dayhoff']
g = sns.FacetGrid( hue='species', col='alphabet', col_order=alphabet_order,
        data=combined_roc_auc, palette="Set2", height=3, aspect=1.5)
g.map(sns.scatterplot, 'ksize', 'score_value')
g.add_legend()
g.map(plt.plot, 'ksize', 'score_value')
g.map(plot_argmax, 'ksize', 'score_value')
g.set(ylabel='ROC AUC', ylim=(0, 1))


### All in one grid

In [None]:
g = sns.catplot(x='ksize', y='score_value', hue='species', col='alphabet', 
        data=concatenated_metrics, kind='point', palette='Set2', row='score_name')
g.set(ylim=(0, 1 ))

## Plot ROC curves

### Plot combined roc curves

In [None]:

g = sns.FacetGrid(data=combined_roc_curves, col='alphabet', hue='ksize', palette='viridis', row='species')
g.map(plt.plot, 'fpr', 'tpr')