In [1]:
import os
import json
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Notebook-level authorship, licensing and status metadata.
__author__ = 'Aleksandar Anžel'
__copyright__ = ''
__credits__ = ['Aleksandar Anžel']
__license__ = 'GNU General Public License v3.0'
__version__ = '1.0'
__maintainer__ = 'Aleksandar Anžel'
__email__ = 'aleksandar.anzel@uni-marburg.de'
__status__ = 'Dev'

In [3]:
# The dark embed theme is active as written; comment out the next line if you
# are NOT using a dark jupyter lab/notebook theme.
alt.renderers.set_embed_options(theme='dark')
# Lift Altair's limit on the number of rows that may be embedded in a chart.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# 0. Define paths

In [4]:
# All locations are relative to the 'Data' folder one level above the notebook
path_root_data = os.path.join('..', 'Data')

# Directories that hold the visualization inputs
path_cmangoes_vis_data = os.path.join(
    path_root_data, 'Visualization_data',
    'data', 'multiple_datasets', 'vis', 'mds_1_Overview')
path_peptidereactor_vis_data = os.path.join(
    path_root_data, 'Visualization_data', 'peptidereactor_vis_data')

# NOTE(review): this spec path is built from the CMANGOES directory although
# the variable name says peptidereactor -- confirm that this is intended.
path_peptidereactor_vega_lite_spec = os.path.join(
    path_cmangoes_vis_data, 'mds_1_Overview.json')

# The same three heat-map file names are looked up in both directories
_heatmap_file_names = (
    'hm_source_data.json', 'hm_imb_data.json', 'hm_bio_data.json')

(path_cmangoes_source_data,
 path_cmangoes_imb_data,
 path_cmangoes_bio_data) = (
    os.path.join(path_cmangoes_vis_data, file_name)
    for file_name in _heatmap_file_names)

(path_peptidereactor_source_data,
 path_peptidereactor_imb_data,
 path_peptidereactor_bio_data) = (
    os.path.join(path_peptidereactor_vis_data, file_name)
    for file_name in _heatmap_file_names)

# Dataset identifiers in display order; chart_visualize passes this list as
# the explicit domain of the heat maps' 'Dataset' (y) axis.
list_of_datasets = [
    'ace_vaxinpad',
    'acp_anticp',
    'acp_iacp',
    'acp_mlacp',
    'afp_amppred',
    'afp_antifp',
    'aip_aippred',
    'aip_antiinflam',
    'amp_antibp',
    'amp_antibp2',
    'amp_csamp',
    'amp_fernandes',
    'amp_gonzales',
    'amp_iamp2l',
    'amp_modlamp',
    'atb_antitbp',
    'atb_iantitb',
    'avp_amppred',
    'avp_avppred',
    'bce_ibce',
    'cpp_cellppd',
    'cpp_cellppdmod',
    'cpp_cppredfl',
    'cpp_kelmcpp',
    'cpp_mixed',
    'cpp_mlcpp',
    'cpp_mlcppue',
    'cpp_sanders',
    'hem_hemopi'
]

# 1. Import data

In [5]:
# Read the six heat-map tables in one pass: the three CMANGOES files first,
# then the three peptidereactor files.
(df_cmangoes_source_data,
 df_cmangoes_imb_data,
 df_cmangoes_bio_data,
 df_peptidereactor_source_data,
 df_peptidereactor_imb_data,
 df_peptidereactor_bio_data) = (
    pd.read_json(path_json)
    for path_json in (
        path_cmangoes_source_data,
        path_cmangoes_imb_data,
        path_cmangoes_bio_data,
        path_peptidereactor_source_data,
        path_peptidereactor_imb_data,
        path_peptidereactor_bio_data))

# 2. Clean the data

In [6]:
def df_clean_cmangoes_data(df_input):
    """Return a cleaned copy of a CMANGOES heat-map table.

    The input frame is left untouched. The cleaning steps are:

    1. Rows whose 'Encoding' value starts with 'str' are removed.
    2. The '_levels_1_and_2_encoding' text is removed from 'Encoding_max'.
    3. A leading 'seq' becomes 'CMANGOES_seq_' in 'Encoding_max' and
       'param_seq_' in 'Encoding'.
    4. The index is reset to 0..n-1.

    The 'seq' renaming is anchored to the start of the string, so applying
    the function to an already cleaned frame (e.g. when the cleaning cell is
    re-run) does not rename the values a second time.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Table with string columns 'Encoding' and 'Encoding_max'.

    Returns
    -------
    pandas.DataFrame
        New, cleaned frame with a fresh integer index.
    """
    # na=False keeps rows with a missing encoding instead of letting a NaN
    # leak into the boolean mask and break the row selection.
    mask_str_prefix = df_input['Encoding'].str.startswith('str', na=False)
    df_output = df_input.loc[~mask_str_prefix].copy(deep=True)

    df_output['Encoding_max'] = (
        df_output['Encoding_max']
        .str.replace('_levels_1_and_2_encoding', '', regex=False)
        .str.replace(r'^seq', 'CMANGOES_seq_', regex=True))
    df_output['Encoding'] = df_output['Encoding'].str.replace(
        r'^seq', 'param_seq_', regex=True)

    return df_output.reset_index(drop=True)
    

In [7]:
# Inspect the CMANGOES source table as loaded, before any cleaning
df_cmangoes_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
2,amp_gonzales,strbin,strbinary_shifted_levels_1_and_2_encoding,0.500000,sequence based,0.209302,amp,1,False
3,amp_gonzales,seqbin,seqbinary_shifted_levels_1_and_2_encoding,0.500000,sequence based,0.209302,amp,1,False
4,amp_iamp2l,strbin,strbinary_shifted_levels_1_and_2_encoding,0.708928,sequence based,0.267661,amp,1,False
...,...,...,...,...,...,...,...,...,...
111,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False
112,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
113,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
114,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [8]:
# Replace each raw CMANGOES table with its cleaned version. The raw frames are
# rebound here, so re-run the loading cell to get the untouched data back.
df_cmangoes_source_data, df_cmangoes_imb_data, df_cmangoes_bio_data = (
    df_clean_cmangoes_data(df_raw)
    for df_raw in (
        df_cmangoes_source_data,
        df_cmangoes_imb_data,
        df_cmangoes_bio_data))

df_cmangoes_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_seq_dis,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_seq_bin,CMANGOES_seq_binary_shifted,0.5,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_seq_dis,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_seq_bin,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.40339,sequence based,0.319658,acp,1,False
5,acp_mlacp,param_seq_bin,CMANGOES_seq_binary_shifted,0.353913,sequence based,0.319658,acp,1,False
6,cpp_mlcpp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.40197,sequence based,0.387809,cpp,1,False
7,cpp_mlcpp,param_seq_bin,CMANGOES_seq_binary_shifted,0.379744,sequence based,0.387809,cpp,1,False
8,aip_aippred,param_seq_bin,CMANGOES_seq_binary_shifted,0.476557,sequence based,0.400381,aip,1,False
9,aip_aippred,param_seq_dis,CMANGOES_seq_discretized_shifted,0.483782,sequence based,0.400381,aip,1,False


In [9]:
# Inspect the peptidereactor source table as loaded.
# NOTE(review): the output recorded below is row-for-row identical to the raw
# CMANGOES source table shown earlier -- confirm that the peptidereactor
# paths point at the intended files.
df_peptidereactor_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
2,amp_gonzales,strbin,strbinary_shifted_levels_1_and_2_encoding,0.500000,sequence based,0.209302,amp,1,False
3,amp_gonzales,seqbin,seqbinary_shifted_levels_1_and_2_encoding,0.500000,sequence based,0.209302,amp,1,False
4,amp_iamp2l,strbin,strbinary_shifted_levels_1_and_2_encoding,0.708928,sequence based,0.267661,amp,1,False
...,...,...,...,...,...,...,...,...,...
111,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False
112,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
113,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
114,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [10]:
# Stack each cleaned CMANGOES table on top of its peptidereactor counterpart
# and renumber the rows of the result.
df_combined_source_data, df_combined_imb_data, df_combined_bio_data = (
    pd.concat([df_cmangoes_part, df_peptidereactor_part], ignore_index=True)
    for df_cmangoes_part, df_peptidereactor_part in zip(
        (df_cmangoes_source_data,
         df_cmangoes_imb_data,
         df_cmangoes_bio_data),
        (df_peptidereactor_source_data,
         df_peptidereactor_imb_data,
         df_peptidereactor_bio_data)))

df_combined_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_seq_dis,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_seq_bin,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_seq_dis,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_seq_bin,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False
...,...,...,...,...,...,...,...,...,...
169,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False
170,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
171,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
172,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [11]:
# Same frame that the previous cell already displayed (redundant re-display)
df_combined_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_seq_dis,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_seq_bin,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_seq_dis,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_seq_bin,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False
...,...,...,...,...,...,...,...,...,...
169,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False
170,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
171,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
172,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [12]:
# Tag every row of the combined tables with a categorical flag that says
# whether its F1 score is missing ('NaN') or present ('notNaN')
for df_combined in (
        df_combined_source_data,
        df_combined_imb_data,
        df_combined_bio_data):
    df_combined['Value'] = np.where(
        df_combined['F1'].isnull(), 'NaN', 'notNaN')

In [13]:
# Check that the new 'Value' column is present
df_combined_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing,Value
0,amp_gonzales,param_seq_dis,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False,notNaN
1,amp_gonzales,param_seq_bin,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False,notNaN
2,amp_iamp2l,param_seq_dis,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False,notNaN
3,amp_iamp2l,param_seq_bin,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False,notNaN
4,acp_mlacp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False,notNaN
...,...,...,...,...,...,...,...,...,...,...
169,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False,notNaN
170,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN
171,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN
172,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN


In [14]:
# Remove the rows whose encoding name ends in 'zzz'.
# These flags were only used to separate the heatmap.
for df_combined in (
        df_combined_source_data,
        df_combined_imb_data,
        df_combined_bio_data):
    mask_separator = df_combined['Encoding'].str.endswith('zzz')
    df_combined.drop(df_combined.loc[mask_separator].index, inplace=True)

df_combined_source_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing,Value
0,amp_gonzales,param_seq_dis,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False,notNaN
1,amp_gonzales,param_seq_bin,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False,notNaN
2,amp_iamp2l,param_seq_dis,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False,notNaN
3,amp_iamp2l,param_seq_bin,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False,notNaN
4,acp_mlacp,param_seq_dis,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False,notNaN
...,...,...,...,...,...,...,...,...,...,...
169,cpp_mixed,strbin,strbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False,notNaN
170,cpp_sanders,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN
171,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN
172,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False,notNaN


# 3. Visualize
## 3.0. Plotting helpers

In [17]:
def chart_visualize(df_input, string_title, flag_cmangoes_style):
    """Build a layered Dataset x Encoding heat map of F1 scores.

    Two rectangle layers over the same data are overlaid: one coloured by the
    quantitative 'F1' column and one coloured by the nominal 'Value' column,
    whose colour scale only defines a colour for the category 'NaN'.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must provide the columns 'Dataset', 'Encoding', 'Encoding_max', 'F1',
        'is_imbalanced' and 'bio_field'. A 'Value' column with 'NaN'/'notNaN'
        flags is used when present; otherwise it is derived from 'F1' on a
        copy, so frames that were never flagged still get their missing
        scores highlighted. The caller's frame is not modified.
    string_title : str
        Title prefix; ' multi-dataset results' is appended to it.
    flag_cmangoes_style : bool
        If truthy, use the 'greys' scheme with '#CD7DA9' for missing values;
        otherwise use the '#a6bddb'-'#023858' range with '#a6611a'.

    Returns
    -------
    alt.LayerChart
        The F1 layer with the missing-value layer drawn on top of it.
    """
    if flag_cmangoes_style:
        scale_general = alt.Scale(scheme='greys')
        scale_na_values = alt.Scale(domain=['NaN'], range=['#CD7DA9'])
    else:
        scale_general = alt.Scale(range=["#a6bddb", "#023858"])
        scale_na_values = alt.Scale(domain=['NaN'], range=['#a6611a'])

    # Only the combined frames are given a 'Value' column by the notebook;
    # without this fallback the missing-value layer would reference a
    # non-existent field for every other frame.
    if 'Value' not in df_input.columns:
        df_input = df_input.assign(
            Value=df_input['F1'].isnull().map(
                {True: 'NaN', False: 'notNaN'}))

    # Everything except the colour channel is shared by both layers
    chart_base = alt.Chart(
        df_input, title=string_title + " multi-dataset results"
    ).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets)),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    )

    chart_non_null = chart_base.encode(
        color=alt.Color('F1', type='quantitative', scale=scale_general))
    chart_null = chart_base.encode(
        color=alt.Color('Value', type='nominal', scale=scale_na_values))

    # The two layers use different colour fields of different types, so their
    # colour scales are declared independent instead of relying on the
    # renderer to split them.
    chart_result = (chart_non_null + chart_null).resolve_scale(
        color='independent')

    return chart_result


def replicate_visualizations(flag_visualization_style):
    """Stack the multi-dataset heat maps vertically in one chart.

    The panels are, top to bottom: 'CMANGOES source', 'Peptidereactor imb',
    'Peptidereactor bio', 'Combined imb' and 'Combined bio'. Each panel is
    built by chart_visualize from the corresponding notebook-level frame.

    Parameters
    ----------
    flag_visualization_style : bool
        Forwarded unchanged to chart_visualize as its style flag.

    Returns
    -------
    Altair chart
        Vertical concatenation of the five panels.
    """
    # NOTE(review): earlier versions also built 'Peptidereactor source' and
    # 'Combined source' charts but never added them to the result. They are
    # left out here so the output stays the same -- confirm whether they
    # were meant to be shown.
    panels = (
        (df_cmangoes_source_data, 'CMANGOES source'),
        (df_peptidereactor_imb_data, 'Peptidereactor imb'),
        (df_peptidereactor_bio_data, 'Peptidereactor bio'),
        (df_combined_imb_data, 'Combined imb'),
        (df_combined_bio_data, 'Combined bio'),
    )
    charts = [
        chart_visualize(df_panel, string_title, flag_visualization_style)
        for df_panel, string_title in panels]

    # '&' is Altair's vertical-concatenation operator
    chart_result = charts[0]
    for chart_next in charts[1:]:
        chart_result = chart_result & chart_next

    return chart_result


def filter_and_visualize_datasets():
    """Placeholder for a filtered visualization; nothing is implemented yet.

    Returns
    -------
    None
    """
    # TODO: implementation sketch kept from the original notes:
    #   .transform_filter(
    #       alt.FieldEqualPredicate(field='year', equal=2000)
    #   )
    # See https://altair-viz.github.io/user_guide/transform/filter.html
    return


## 3.1. Peptidereactor-styled results

In [18]:
# The negated flag selects chart_visualize's non-CMANGOES branch
# ('#a6bddb'-'#023858' colour range, '#a6611a' for missing values).
flag_cmangoes_style = True
replicate_visualizations(not flag_cmangoes_style)

## 3.2. Our own visualizations

In [19]:
# Same panels with chart_visualize's CMANGOES branch ('greys' scheme,
# '#CD7DA9' for missing values); relies on the flag set in the previous cell.
replicate_visualizations(flag_cmangoes_style)

## 3.3 Further analysis

SyntaxError: invalid syntax (761895771.py, line 1)