## Analyse Binette results

In [1]:
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as pio
# Make "sphinx_gallery" the default Plotly renderer for every figure shown below
# (presumably so the figures embed in Sphinx-built documentation — TODO confirm).
pio.renderers.default = "sphinx_gallery"

In [2]:
# Load the final Binette bin quality report (tab-separated) and tag every row
# with the tool name plus its row position, so it can later be stacked with the
# reports of the input binning tools.
binette_result_file = "./binette_results/final_bins_quality_reports.tsv"
df_binette = pd.read_csv(binette_result_file, sep="\t").assign(
    tool="binette",
    index=lambda report: report.index,
)
df_binette

Unnamed: 0,bin_id,origin,name,completeness,contamination,score,size,N50,contig_count,tool,index
0,17075,diff,44 - 10,100.0,0.05,99.9,4672665,82084,93,binette,0
1,39427,diff,36 - 6,99.9,0.2,99.5,2796605,41151,98,binette,1
2,47060,union,58 | 33,98.59,0.83,96.93,4601336,41016,165,binette,2
3,47177,union,91 | 25 | 55,96.1,0.34,95.42,2598718,11891,312,binette,3
4,21248,diff,65 - 8 - 28,91.98,1.71,88.56,1768095,9976,250,binette,4
5,44137,diff,76 - 13 - 28,92.63,2.41,87.81,3726254,5669,850,binette,5
6,31703,diff,31 - 7 - 61,81.73,0.84,80.05,1665233,8518,248,binette,6
7,13475,diff,47 - 37,72.89,2.39,68.11,1241829,5061,252,binette,7
8,47926,union,75 | 30,74.31,4.26,65.79,3293949,2954,1262,binette,8
9,46775,union,42 | 102,62.94,2.75,57.44,1293571,3783,419,binette,9


In [3]:
# Directory holding the quality report files of the input binning tools.
input_bins_quality_reports_dir = Path("binette_results/input_bins_quality_reports/")

# Start from the Binette bins, then append the bins of every input report found.
df_input_bin_list = [df_binette]
# Path.glob yields files in an arbitrary, filesystem-dependent order; sorting keeps
# the concatenation order (and therefore row and legend order downstream) reproducible.
for input_bin_metric_file in sorted(input_bins_quality_reports_dir.glob("*tsv")):
    # Tool name = second dot-separated field of the file name, up to its first "_".
    # NOTE(review): this relies on the report naming scheme — confirm against the
    # files actually written to this directory.
    tool = input_bin_metric_file.name.split('.')[1].split('_')[0]
    df_input = pd.read_csv(input_bin_metric_file, sep='\t')
    # Keep the row position within this report; it is used as the x-axis later on.
    df_input['index'] = df_input.index
    df_input['tool'] = tool
    df_input_bin_list.append(df_input)

# Row labels are intentionally not reset: they restart at 0 for each tool, and the
# same per-tool position is available in the "index" column.
df_bins = pd.concat(df_input_bin_list)

# Flag bins with completeness >= 90 and contamination <= 5.
df_bins["High quality bin"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)
df_bins[["completeness", "contamination", "tool"]]


Unnamed: 0,completeness,contamination,tool
0,100.00,0.05,binette
1,99.90,0.20,binette
2,98.59,0.83,binette
3,96.10,0.34,binette
4,91.98,1.71,binette
...,...,...,...
20,8.28,0.01,semibin2
21,8.12,0.02,semibin2
22,7.74,0.01,semibin2
23,6.18,0.00,semibin2


In [4]:
# Completeness vs. contamination for every bin, one facet row per tool;
# marker area follows the bin size and colour marks the "High quality bin" flag.
fig = px.scatter(
    df_bins,
    x="completeness",
    y="contamination",
    color="High quality bin",
    size="size",
    facet_row="tool",
)
fig.update_layout(width=800, height=800)
fig.show()

In [5]:
# Per-bin score (completeness minus twice the contamination), plotted against
# the bin's position within its own report, one line per tool.
df_bins['completeness - 2*contamination'] = (
    df_bins['completeness'] - df_bins['contamination'] * 2
)
fig = px.line(
    df_bins,
    x="index",
    y='completeness - 2*contamination',
    color="tool",
    markers=True,
)
fig.update_layout(width=800, height=500)
fig.show()

In [6]:
# Assign each bin whose contamination is within the cutoff to a completeness class.
contamination_cutoff = 10
low_contamination_filt = df_bins['contamination'].le(contamination_cutoff)
high_completeness_filt = df_bins['completeness'].gt(90)
medium_completeness_filt = df_bins['completeness'].gt(70)
low_completeness_filt = df_bins['completeness'].gt(50)

# Column name doubles as the legend title in the bar chart (hence the <br>).
quality = f'Contamination ≤ {contamination_cutoff} and<br>Completeness'

# Order matters: the thresholds are nested, so each stricter class overwrites
# the label written by the looser one before it.
for completeness_filt, completeness_label in (
    (low_completeness_filt, '> 50% and ≤ 70%'),
    (medium_completeness_filt, '> 70% and ≤ 90%'),
    (high_completeness_filt, '> 90%'),
):
    df_bins.loc[low_contamination_filt & completeness_filt, quality] = completeness_label

# Show only the bins that received a class.
df_bins.loc[df_bins[quality].notna()]


Unnamed: 0,bin_id,origin,name,completeness,contamination,score,size,N50,contig_count,tool,index,High quality bin,completeness - 2*contamination,Contamination ≤ 10 and<br>Completeness
0,17075,diff,44 - 10,100.0,0.05,99.9,4672665,82084,93,binette,0,True,99.9,> 90%
1,39427,diff,36 - 6,99.9,0.2,99.5,2796605,41151,98,binette,1,True,99.5,> 90%
2,47060,union,58 | 33,98.59,0.83,96.93,4601336,41016,165,binette,2,True,96.93,> 90%
3,47177,union,91 | 25 | 55,96.1,0.34,95.42,2598718,11891,312,binette,3,True,95.42,> 90%
4,21248,diff,65 - 8 - 28,91.98,1.71,88.56,1768095,9976,250,binette,4,True,88.56,> 90%
5,44137,diff,76 - 13 - 28,92.63,2.41,87.81,3726254,5669,850,binette,5,True,87.81,> 90%
6,31703,diff,31 - 7 - 61,81.73,0.84,80.05,1665233,8518,248,binette,6,False,80.05,> 70% and ≤ 90%
7,13475,diff,47 - 37,72.89,2.39,68.11,1241829,5061,252,binette,7,False,68.11,> 70% and ≤ 90%
8,47926,union,75 | 30,74.31,4.26,65.79,3293949,2954,1262,binette,8,False,65.79,> 70% and ≤ 90%
9,46775,union,42 | 102,62.94,2.75,57.44,1293571,3783,419,binette,9,False,57.44,> 50% and ≤ 70%


In [7]:
# Number of bins per (quality class, tool). Bins without a quality class have NaN
# in the grouping column and are left out by groupby's default dropna behaviour.
# (A previous groupby(...).value_counts() statement was removed: its result was
# neither stored nor displayed.)
df_bins_quality_grouped = (
    df_bins.groupby([quality, 'tool'])
    .agg(bin_count=('bin_id', 'count'))
    .reset_index()
)
df_bins_quality_grouped

Unnamed: 0,Contamination ≤ 10 and<br>Completeness,tool,bin_count
0,> 50% and ≤ 70%,binette,5
1,> 50% and ≤ 70%,maxbin2,1
2,> 50% and ≤ 70%,metabat2,1
3,> 50% and ≤ 70%,semibin2,2
4,> 70% and ≤ 90%,binette,3
5,> 70% and ≤ 90%,concoct,2
6,> 70% and ≤ 90%,metabat2,5
7,> 70% and ≤ 90%,semibin2,4
8,> 90%,binette,6
9,> 90%,concoct,4


In [8]:
# One fixed colour per completeness class, taken from Plotly's qualitative Prism palette.
color_discrete_map = {
    "> 90%": px.colors.qualitative.Prism[4],
    "> 70% and ≤ 90%": px.colors.qualitative.Prism[2],
    "> 50% and ≤ 70%": px.colors.qualitative.Prism[6],
}

# Stacked bar chart of bin counts per tool, split by completeness class.
fig = px.bar(
    df_bins_quality_grouped,
    x='tool',
    y="bin_count",
    color=quality,
    barmode='stack',
    color_discrete_map=color_discrete_map,
    text="bin_count",
    category_orders={"tool": ["binette", "semibin2", "concoct", "metabat2", "maxbin2"]},
    opacity=0.9,
)

# Reverse the legend's trace order.
fig.update_layout(
    width=800,
    height=500,
    legend={"traceorder": "reversed"},
)
fig