In [1]:
import os
import json
import numpy as np
import pandas as pd
import altair as alt

In [2]:
__author__ = 'Aleksandar Anžel'
__copyright__ = ''
__credits__ = ['Aleksandar Anžel']
__license__ = 'GNU General Public License v3.0'
__version__ = '1.0'
__maintainer__ = 'Aleksandar Anžel'
__email__ = 'aleksandar.anzel@uni-marburg.de'
__status__ = 'Dev'

In [3]:
# Uncomment if you are using dark jupyter lab/notebook theme
alt.renderers.set_embed_options(theme='dark')
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# 0. Define paths

In [4]:
path_root_data = os.path.join('..', 'Data')
path_cmangoes_vis_data = os.path.join(
    path_root_data, 'Visualization_data', 'data', 'multiple_datasets', 'vis',
    'mds_1_Overview')
path_peptidereactor_vis_data = os.path.join(
    path_root_data, 'Visualization_data', 'peptidereactor_vis_data')

path_peptidereactor_vega_lite_spec = os.path.join(
    path_cmangoes_vis_data, 'mds_1_Overview.json')

path_cmangoes_source_data = os.path.join(
    path_cmangoes_vis_data, 'hm_source_data.json')
path_cmangoes_imb_data = os.path.join(
    path_cmangoes_vis_data, 'hm_imb_data.json')
path_cmangoes_bio_data = os.path.join(
    path_cmangoes_vis_data, 'hm_bio_data.json')

path_peptidereactor_imb_data = os.path.join(
    path_peptidereactor_vis_data, 'hm_imb_data.json')
path_peptidereactor_bio_data = os.path.join(
    path_peptidereactor_vis_data, 'hm_bio_data.json')

path_cmangoes_performance_data = os.path.join(
    path_root_data, 'Performance_experiments', 'results.csv')

list_of_datasets = [
    'ace_vaxinpad',
    'acp_anticp',
    'acp_iacp',
    'acp_mlacp',
    'afp_amppred',
    'afp_antifp',
    'aip_aippred',
    'aip_antiinflam',
    'amp_antibp',
    'amp_antibp2',
    'amp_csamp',
    'amp_fernandes',
    'amp_gonzales',
    'amp_iamp2l',
    'amp_modlamp',
    'atb_antitbp',
    'atb_iantitb',
    'avp_amppred',
    'avp_avppred',
    'bce_ibce',
    'cpp_cellppd',
    'cpp_cellppdmod',
    'cpp_cppredfl',
    'cpp_kelmcpp',
    'cpp_mixed',
    'cpp_mlcpp',
    'cpp_mlcppue',
    'cpp_sanders',
    'hem_hemopi'
]

# 1. Import data

In [5]:
df_cmangoes_source_data = pd.read_json(path_cmangoes_source_data)
df_cmangoes_imb_data = pd.read_json(path_cmangoes_imb_data)
df_cmangoes_bio_data = pd.read_json(path_cmangoes_bio_data)

df_peptidereactor_imb_data = pd.read_json(path_peptidereactor_imb_data)
df_peptidereactor_bio_data = pd.read_json(path_peptidereactor_bio_data)

# 2. Clean the data

In [6]:
def df_fix_f1_scores_and_more(df_input):
    df_output = df_input.copy(deep=True)
    
    df_output = df_output[df_output['F1'] != 'separator']
    df_output['F1'] = pd.to_numeric(df_output['F1'])
    
    # We want to drop rows that contain 'zzz' at the end of the necoding
    # This flags were used to seperate heatmap
    df_output.drop(df_output[
        df_output['Encoding'].str.endswith('zzz')].index,
        inplace=True)
    df_output.drop(df_output[
        df_output['Dataset'].str.endswith('zzz')].index,
        inplace=True)
    
    df_output = df_output[df_output['Dataset'].isin(list_of_datasets)]
    
    return df_output

def df_clean_cmangoes_data(df_input):
    df_output = df_input.copy(deep=True)
    df_output = df_fix_f1_scores_and_more(df_output)
    
    df_output.drop(
        df_output[df_output['Encoding'].str.startswith('str')].index,
        inplace=True)
    
    df_output['Encoding_max'] = df_output[
        'Encoding_max'].str.replace('_levels_1_and_2_encoding','')
    df_output['Encoding_max'] = df_output[
        'Encoding_max'].str.replace('seq','CMANGOES_seq_')
    
    list_tmp = []
    for string_encoding in df_output['Encoding_max']:
        string_tmp = 'param_' + '_'.join(
            [string_chunk[:3]
             for string_chunk in string_encoding.split('_')[2:]])
        list_tmp.append(string_tmp)
    
    df_output['Encoding'] = list_tmp
    
    return df_output.reset_index(drop=True)
    

In [7]:
df_cmangoes_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,strdis,strdiscretized_shifted_levels_1_and_2_encoding,0.571429,sequence based,0.209302,amp,1,False
2,amp_gonzales,strbin,strbinary_shifted_levels_1_and_2_encoding,0.5,sequence based,0.209302,amp,1,False
3,amp_gonzales,seqbin,seqbinary_shifted_levels_1_and_2_encoding,0.5,sequence based,0.209302,amp,1,False
4,amp_iamp2l,strbin,strbinary_shifted_levels_1_and_2_encoding,0.708928,sequence based,0.267661,amp,1,False
...,...,...,...,...,...,...,...,...,...
115,cpp_mixed,seqbin,seqbinary_shifted_levels_1_and_2_encoding,0.863636,sequence based,0.757812,cpp,1,False
116,cpp_sanders,seqdis,seqdiscretized_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
117,cpp_sanders,strbin,strbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False
118,cpp_sanders,seqbin,seqbinary_shifted_levels_1_and_2_encoding,0.862745,sequence based,0.765517,cpp,1,False


In [8]:
df_cmangoes_source_data = df_clean_cmangoes_data(df_cmangoes_source_data)
df_cmangoes_imb_data = df_clean_cmangoes_data(df_cmangoes_imb_data)
df_cmangoes_bio_data = df_clean_cmangoes_data(df_cmangoes_bio_data)
df_peptidereactor_imb_data = df_fix_f1_scores_and_more(df_peptidereactor_imb_data)
df_peptidereactor_bio_data = df_fix_f1_scores_and_more(df_peptidereactor_bio_data)

df_cmangoes_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_dis_shi,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_bin_shi,CMANGOES_seq_binary_shifted,0.5,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_bin_shi,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_dis_shi,CMANGOES_seq_discretized_shifted,0.40339,sequence based,0.319658,acp,1,False
5,acp_mlacp,param_bin_shi,CMANGOES_seq_binary_shifted,0.353913,sequence based,0.319658,acp,1,False
6,cpp_mlcpp,param_bin_shi,CMANGOES_seq_binary_shifted,0.379744,sequence based,0.387809,cpp,1,False
7,cpp_mlcpp,param_dis_shi,CMANGOES_seq_discretized_shifted,0.40197,sequence based,0.387809,cpp,1,False
8,aip_aippred,param_dis_shi,CMANGOES_seq_discretized_shifted,0.483782,sequence based,0.400381,aip,1,False
9,aip_aippred,param_bin_shi,CMANGOES_seq_binary_shifted,0.476557,sequence based,0.400381,aip,1,False


In [9]:
df_peptidereactor_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
138,amp_gonzales,geary_,geary_nlag_9,0.750000,sequence based,0.209302,amp,1,False
139,amp_gonzales,flgc_a,flgc_aaindex_ZIMJ680104,0.888889,sequence based,0.209302,amp,1,False
140,amp_gonzales,dist_f,dist_freq_dn_5_dc_50,0.888889,sequence based,0.209302,amp,1,False
141,amp_gonzales,ngram_,ngram_s3_5,0.816667,sequence based,0.209302,amp,1,False
142,amp_gonzales,fldpc_,fldpc_aaindex_ZIMJ680104,0.888889,sequence based,0.209302,amp,1,False
...,...,...,...,...,...,...,...,...,...
2339,cpp_sanders,ctdt,ctdt,0.872283,sequence based,0.765517,cpp,1,False
2340,cpp_sanders,ksctri,ksctriad_gap_1,0.872283,sequence based,0.765517,cpp,1,False
2341,cpp_sanders,flgc_a,flgc_aaindex_ZIMJ680104,0.875000,sequence based,0.765517,cpp,1,False
2342,cpp_sanders,ctriad,ctriad,0.875000,sequence based,0.765517,cpp,1,False


In [10]:
df_combined_imb_data = pd.concat(
    [df_cmangoes_imb_data, df_peptidereactor_imb_data], ignore_index=True)
df_combined_bio_data = pd.concat(
    [df_cmangoes_bio_data, df_peptidereactor_bio_data], ignore_index=True)

df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_dis_shi,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_bin_shi,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_dis_shi,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False
...,...,...,...,...,...,...,...,...,...
1358,cpp_sanders,ctdt,ctdt,0.872283,sequence based,0.765517,cpp,1,False
1359,cpp_sanders,ksctri,ksctriad_gap_1,0.872283,sequence based,0.765517,cpp,1,False
1360,cpp_sanders,flgc_a,flgc_aaindex_ZIMJ680104,0.875000,sequence based,0.765517,cpp,1,False
1361,cpp_sanders,ctriad,ctriad,0.875000,sequence based,0.765517,cpp,1,False


In [11]:
df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing
0,amp_gonzales,param_dis_shi,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False
1,amp_gonzales,param_bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False
2,amp_iamp2l,param_dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False
3,amp_iamp2l,param_bin_shi,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False
4,acp_mlacp,param_dis_shi,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False
...,...,...,...,...,...,...,...,...,...
1358,cpp_sanders,ctdt,ctdt,0.872283,sequence based,0.765517,cpp,1,False
1359,cpp_sanders,ksctri,ksctriad_gap_1,0.872283,sequence based,0.765517,cpp,1,False
1360,cpp_sanders,flgc_a,flgc_aaindex_ZIMJ680104,0.875000,sequence based,0.765517,cpp,1,False
1361,cpp_sanders,ctriad,ctriad,0.875000,sequence based,0.765517,cpp,1,False


In [12]:
# Add one more column to flag missing values
df_combined_imb_data['Value'] = [
    'NaN' if i is True else 'notNaN'
    for i in df_combined_imb_data['F1'].isnull().tolist()]
df_combined_bio_data['Value'] = [
    'NaN' if i is True else 'notNaN'
    for i in df_combined_bio_data['F1'].isnull().tolist()]

In [13]:
df_combined_imb_data

Unnamed: 0,Dataset,Encoding,Encoding_max,F1,type,is_imbalanced,bio_field,type_field,missing,Value
0,amp_gonzales,param_dis_shi,CMANGOES_seq_discretized_shifted,0.571429,sequence based,0.209302,amp,1,False,notNaN
1,amp_gonzales,param_bin_shi,CMANGOES_seq_binary_shifted,0.500000,sequence based,0.209302,amp,1,False,notNaN
2,amp_iamp2l,param_dis_shi,CMANGOES_seq_discretized_shifted,0.716014,sequence based,0.267661,amp,1,False,notNaN
3,amp_iamp2l,param_bin_shi,CMANGOES_seq_binary_shifted,0.708928,sequence based,0.267661,amp,1,False,notNaN
4,acp_mlacp,param_dis_shi,CMANGOES_seq_discretized_shifted,0.403390,sequence based,0.319658,acp,1,False,notNaN
...,...,...,...,...,...,...,...,...,...,...
1358,cpp_sanders,ctdt,ctdt,0.872283,sequence based,0.765517,cpp,1,False,notNaN
1359,cpp_sanders,ksctri,ksctriad_gap_1,0.872283,sequence based,0.765517,cpp,1,False,notNaN
1360,cpp_sanders,flgc_a,flgc_aaindex_ZIMJ680104,0.875000,sequence based,0.765517,cpp,1,False,notNaN
1361,cpp_sanders,ctriad,ctriad,0.875000,sequence based,0.765517,cpp,1,False,notNaN


# 3. Visualize
## 3.1. CMANGOES results

In [14]:
def chart_visualize(df_input, string_title, flag_cmangoes_style):
    
    if flag_cmangoes_style:
        scale_general = alt.Scale(scheme='greys')
        scale_na_values = alt.Scale(domain=['NaN'], range=['#CD7DA9'])
    else:
        scale_general = alt.Scale(range=["#a6bddb", "#023858"])
        scale_na_values = alt.Scale(domain=['NaN'], range=['#a6611a'])
    
    
    chart_non_null = alt.Chart(
        df_input, title=string_title + " multi-dataset results"
    ).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets)),
        color=alt.Color('F1', type='quantitative', scale=scale_general),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    )
    
    chart_null = alt.Chart(
        df_input, title=string_title + " multi-dataset results"
    ).mark_rect().encode(
        x=alt.X('Encoding', type='nominal', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('Dataset', type='nominal',
                scale=alt.Scale(domain=list_of_datasets)),
        color=alt.Color('Value', type='nominal', scale=scale_na_values),
        tooltip=['Dataset', 'Encoding_max', 'F1', 'is_imbalanced', 'bio_field']
    )
    
    chart_result = chart_non_null + chart_null
    
    return chart_result


def replicate_visualizations(flag_visualization_style):
    chart_cmangoes_source_result = chart_visualize(
        df_cmangoes_source_data, 'CMANGOES source', flag_visualization_style)
    chart_peptidereactor_imb_result = chart_visualize(
        df_peptidereactor_imb_data, 'Peptidereactor imb',
        flag_visualization_style)
    chart_peptidereactor_bio_result = chart_visualize(
        df_peptidereactor_bio_data, 'Peptidereactor bio',
        flag_visualization_style)
    chart_combined_imb_result = chart_visualize(
        df_combined_imb_data, 'Combined imb', flag_visualization_style)
    chart_combined_bio_result = chart_visualize(
        df_combined_bio_data, 'Combined bio', flag_visualization_style)

    chart_result = chart_cmangoes_source_result &\
                   chart_peptidereactor_imb_result &\
                   chart_peptidereactor_bio_result &\
                   chart_combined_imb_result &\
                   chart_combined_bio_result

    return chart_result


def filter_and_visualize_datasets():
    
    # df_combined_bio and df_combined_imb
    # Filter according to F1 values and imbalance ration
    
    return None


## 3.1. Peptidereactor-styled results

In [15]:
flag_cmangoes_style = True
replicate_visualizations(not flag_cmangoes_style)

## 3.2. Our own visualizations

In [16]:
replicate_visualizations(flag_cmangoes_style)

## 3.3 Further analysis of encodings

In [34]:
df_combined_imb_data_upper = df_combined_bio_data[
    df_combined_bio_data['F1'] > 0.8]
df_combined_imb_data_upper.fillna('NaN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined_imb_data_upper.fillna('NaN', inplace=True)


In [35]:
chart_visualize(df_combined_imb_data_upper, 'Combined F1 > 90 percent', True)

## 3.4 Performance analysis

In [19]:
df_performance_results = pd.read_csv(path_cmangoes_performance_data)
df_performance_results

Unnamed: 0,Encodings,Run_0,Run_1,Run_2
0,ace_vaxinpad_binary_centered_levels_1_and_2_,20.885754,20.847980,21.150071
1,ace_vaxinpad_binary_shifted_levels_1_and_2_,21.003920,20.989860,21.287949
2,ace_vaxinpad_discretized_centered_levels_1_and_2_,28.740059,28.816950,29.282086
3,ace_vaxinpad_discretized_shifted_levels_1_and_2_,28.779231,28.861472,29.302293
4,acp_anticp_binary_centered_levels_1_and_2_,30.269795,30.303933,30.795589
...,...,...,...,...
111,cpp_sanders_discretized_shifted_levels_1_and_2_,9.578461,9.707204,9.721294
112,hem_hemopi_binary_centered_levels_1_and_2_,57.135355,58.004068,58.125769
113,hem_hemopi_binary_shifted_levels_1_and_2_,58.395762,59.240168,59.277407
114,hem_hemopi_discretized_centered_levels_1_and_2_,79.766179,81.008070,81.112379


In [20]:
df_performance_results['Dataset'] = [
    string_name.split('_')[0] + '_' + string_name.split('_')[1]
    for string_name in df_performance_results['Encodings']]
df_performance_results

Unnamed: 0,Encodings,Run_0,Run_1,Run_2,Dataset
0,ace_vaxinpad_binary_centered_levels_1_and_2_,20.885754,20.847980,21.150071,ace_vaxinpad
1,ace_vaxinpad_binary_shifted_levels_1_and_2_,21.003920,20.989860,21.287949,ace_vaxinpad
2,ace_vaxinpad_discretized_centered_levels_1_and_2_,28.740059,28.816950,29.282086,ace_vaxinpad
3,ace_vaxinpad_discretized_shifted_levels_1_and_2_,28.779231,28.861472,29.302293,ace_vaxinpad
4,acp_anticp_binary_centered_levels_1_and_2_,30.269795,30.303933,30.795589,acp_anticp
...,...,...,...,...,...
111,cpp_sanders_discretized_shifted_levels_1_and_2_,9.578461,9.707204,9.721294,cpp_sanders
112,hem_hemopi_binary_centered_levels_1_and_2_,57.135355,58.004068,58.125769,hem_hemopi
113,hem_hemopi_binary_shifted_levels_1_and_2_,58.395762,59.240168,59.277407,hem_hemopi
114,hem_hemopi_discretized_centered_levels_1_and_2_,79.766179,81.008070,81.112379,hem_hemopi


In [21]:
df_performance_results_new = df_performance_results.drop(
    'Encodings', axis=1).melt('Dataset')
df_performance_results_new['TMP_LEGEND'] = len(df_performance_results_new['Dataset']) * ['Median']
df_performance_results_new

Unnamed: 0,Dataset,variable,value,TMP_LEGEND
0,ace_vaxinpad,Run_0,20.885754,Median
1,ace_vaxinpad,Run_0,21.003920,Median
2,ace_vaxinpad,Run_0,28.740059,Median
3,ace_vaxinpad,Run_0,28.779231,Median
4,acp_anticp,Run_0,30.269795,Median
...,...,...,...,...
343,cpp_sanders,Run_2,9.721294,Median
344,hem_hemopi,Run_2,58.125769,Median
345,hem_hemopi,Run_2,59.277407,Median
346,hem_hemopi,Run_2,81.112379,Median


In [22]:
chart_performance_points = alt.Chart(
    df_performance_results_new).mark_point().encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    alt.Y('value', type='quantitative'),
    alt.Color('variable', type='nominal'),
    alt.Tooltip(['Dataset', 'value'])
)

chart_performance_medians = alt.Chart(
    df_performance_results_new).mark_tick(filled=True, color='white').encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    alt.Y('median(value)', type='quantitative'),
    alt.Opacity('TMP_LEGEND', type='nominal', legend=alt.Legend(title='')),
    alt.Tooltip(['Dataset', 'median(value)'])
)

chart_performance_result = alt.layer(
    chart_performance_points, chart_performance_medians).resolve_scale(
    shape='independent'
).interactive()

chart_performance_result


In [23]:
# Calculate dataset sizes
# os.path.getsize("/path/to/file.mp3")
df_dataset_sizes = pd.DataFrame()
df_dataset_sizes['Dataset'] = list_of_datasets
path_original_data = os.path.join(path_root_data, 'Original_datasets')

list_sizes = []

for string_one_dataset in list_of_datasets:
    path_one_dataset = os.path.join(path_original_data, string_one_dataset)
    list_sizes.append(os.path.getsize(
        os.path.join(path_one_dataset, 'classes.txt')) + os.path.getsize(
        os.path.join(path_one_dataset, 'seqs.fasta')))

df_dataset_sizes['Size'] = list_sizes
df_dataset_sizes


Unnamed: 0,Dataset,Size
0,ace_vaxinpad,16346
1,acp_anticp,17075
2,acp_iacp,12788
3,acp_mlacp,23927
4,afp_amppred,131427
5,afp_antifp,197437
6,aip_aippred,34784
7,aip_antiinflam,59822
8,amp_antibp,36054
9,amp_antibp2,84034


In [24]:
chart_size_bars = alt.Chart(
    df_dataset_sizes).mark_bar().encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    alt.Y('Size', type='quantitative'),
    alt.Tooltip(['Dataset', 'Size'])
)
chart_size_bars

In [25]:
chart_imbalance_bars = alt.Chart(
    df_cmangoes_source_data).mark_bar().encode(
    alt.X('Dataset', type='nominal', axis=alt.Axis(labelAngle=-45)),
    alt.Y('is_imbalanced', type='quantitative'),
    alt.Tooltip(['Dataset', 'is_imbalanced'])
)
chart_imbalance_bars
