### This notebook creates YAML specification files for testing previously trained immuneML models on different test datasets.

In [255]:
import re
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import NegativeClassOptimization.config as config
import glob
import yaml
from pathlib import Path
from itertools import combinations

In [2]:
# Antigen names that antigens_from_path() looks for in '_'-separated
# model-folder and dataset file names.
antigens = [
    '3VRL',
    '1NSN',
    '3RAJ',
    '5E94',
    '1H0D',
    '1WEJ',
    '1ADQ',
    '1FBI',
    '2YPV',
    '1OB1',
]

In [136]:
class NoAliasDumper(yaml.SafeDumper):
    """
    Yaml dumper without yaml references.

    Subclass of yaml.SafeDumper whose ignore_aliases() answers True for every
    object. It is passed as ``Dumper`` to yaml.dump() in yaml_dumping().
    """
    def ignore_aliases(self, data):
        # Always True: no object is ever treated as a candidate for an
        # anchor/alias, regardless of what `data` is.
        return True

In [101]:
def antigens_from_path(folder_name):
    """
    Collect the antigen names that occur in a file or folder name.

    The name is split on '_' and every part that is an element of the global
    list `antigens` is kept, in order of appearance (duplicates included).

    Arguments:
        folder_name: (str) file or folder name whose words are separated with '_'.
    Returns:
        list of the antigen names found; empty if none of the parts is an antigen.
    """
    found = []
    for token in folder_name.split('_'):
        if token in antigens:
            found.append(token)
    return found

In [123]:
#creating dict config:dataset_paths
def get_config_path_dict(models_root, data_folder_path, file_pattern_format):
    """
    Creates a dictionary of ML models and their corresponding test datasets depending on antigen(s) used for training.
    Antigen labeled as positive should be the same in training and testing datasets.

    Every folder matching 'fit_*' under config.IMMUNE_ML_BASE_PATH / models_root is treated as one trained model.
    The antigens in its folder name (see antigens_from_path) are exposed as `conf_ag` to `file_pattern_format`,
    which is evaluated to obtain the glob pattern of the test files for that model.

    Arguments:
        models_root: folder (relative to config.IMMUNE_ML_BASE_PATH) where models are located
        data_folder_path: folder (relative to config.DATA_BASE_PATH) where test datasets are located
        file_pattern_format: (str) Python expression evaluating to the glob pattern (starting with '/') of the
                             test data files. Ex: "'/*{0}*_test*'.format(conf_ag[0])", where conf_ag denotes
                             the list of antigens from the name of the model.
                             WARNING: the expression is run with eval(); pass only trusted, hard-coded strings.
    Returns:
        dict mapping (path to 'optimal_binder/zip/ml_settings_binder.zip' of the model as str,
                      model antigens joined with '_') -> sorted list of matching test dataset paths.
    """
    models_glob = str(Path(config.IMMUNE_ML_BASE_PATH / models_root)) + '/fit_*'
    # The data folder does not depend on the model, so resolve it once.
    data_folder = str(Path(config.DATA_BASE_PATH / f'{data_folder_path}'))

    config_path_dict = dict()
    # glob returns entries in arbitrary order; sorting keeps the generated specification reproducible.
    for config_path in sorted(glob.glob(models_glob)):
        conf_ag = antigens_from_path(Path(config_path).name)

        # Evaluate the caller-supplied expression with an explicit namespace instead of exec-ing an
        # 'append' statement. Module globals stay visible, as they were for the exec-based version.
        namespace = {
            'conf_ag': conf_ag,
            'config_path': config_path,
            'models_root': models_root,
            'data_folder_path': data_folder_path,
        }
        data_file_pattern = eval(file_pattern_format, globals(), namespace)

        settings_path = str(Path(config_path) / 'optimal_binder/zip/ml_settings_binder.zip')
        paths = sorted(glob.glob(data_folder + data_file_pattern))
        # The antigen list is joined into a string so it can be part of a hashable dict key.
        config_path_dict[(settings_path, '_'.join(conf_ag))] = paths
    return config_path_dict

In [209]:
#{models_root}_{conf_ag[0]}_{path_ag[0]}
def create_test_yaml(models_root, data_folder_path, data_file_str_pattern, spec_name, test_run = False,
                     number_of_processes = 4):
    """
    Creates yaml for immuneML application. Returns datasets and instruction dictionaries for subsequent dumping to yaml.

    For every (model, test file) pair returned by get_config_path_dict one "Generic" dataset definition and one
    "MLApplication" instruction (named '<dataset name>_instruction') are created. If `spec_name` evaluates to the
    same name for two pairs, the later pair overwrites the earlier one.

    Arguments:
        models_root: folder where models are located
        data_folder_path: folder where test datasets are located
        data_file_str_pattern: (str) Python expression evaluating to the glob pattern of the test data files.
                               Ex: "'/*{0}*_test*'.format(conf_ag[0])", where conf_ag denotes the list of antigens
                               from the name of the model.
        spec_name: (str) Python expression evaluating to the specification name.
                   Ex: "'high_low_svm_{0}_{1}_{2}'.format(conf_ag[0],path_ag[0],path_ag[1])", where conf_ag/path_ag
                   denote the lists of antigens from the name of the model or of the test dataset correspondingly.
                   WARNING: the expression is run with eval(); pass only trusted, hard-coded strings.
        test_run: (boolean) if True datasets and instruction dictionaries will consist of only one key-value pair.
        number_of_processes: (int) value written to "number_of_processes" of every instruction (default 4).
    Returns:
        (datasets, instructions): two dicts keyed by dataset name and instruction name respectively.
    """
    config_path_dict = get_config_path_dict(models_root, data_folder_path, data_file_str_pattern)
    datasets = dict()
    instructions = dict()
    for (config_path, joined_antigens), paths in config_path_dict.items():
        conf_ag = joined_antigens.split('_')
        for path in paths:
            path_ag = antigens_from_path(Path(path).name)

            # Evaluate the caller-supplied name expression with an explicit namespace instead of
            # exec-ing an 'append' statement. Module globals stay visible, as before.
            namespace = {
                'conf_ag': conf_ag,
                'path_ag': path_ag,
                'path': path,
                'config_path': config_path,
                'models_root': models_root,
                'data_folder_path': data_folder_path,
            }
            spec_name_ag = eval(spec_name, globals(), namespace)

            datasets[spec_name_ag] = {
                "format": "Generic",
                "params": {
                    "path": path,
                    "is_repertoire": False,
                    "region_type": "FULL_SEQUENCE",
                    "column_mapping": {
                        "Slide": "sequence_aas",
                        "example_id": "sequence_identifiers",
                    },
                    "metadata_column_mapping": {
                        "example_id": "sequence_identifiers",
                        "Antigen": "Antigen",
                        # can be compared later, no need to create an alt_binder column now
                        "binder": "binder",
                    },
                },
            }
            instructions[spec_name_ag + '_instruction'] = {
                "type": "MLApplication",
                "dataset": spec_name_ag,
                "config_path": config_path,
                "number_of_processes": number_of_processes,
            }
            if test_run:
                # A test run stops after the first (model, dataset) pair.
                return datasets, instructions

    return datasets, instructions

In [137]:
def yaml_dumping(yaml_file_name, datasets, instructions):
    """
    Write the datasets and instructions dictionaries to a yaml file for immuneML.

    The file is opened for writing at '../immuneML/yaml_specifications/<yaml_file_name>'.
    Its top level holds 'definitions' (containing 'datasets') followed by 'instructions';
    keys are not sorted and the output uses block style.

    Arguments:
        yaml_file_name: (str) name of the yaml file to write.
        datasets: dict stored under definitions -> datasets.
        instructions: dict stored under instructions.
    """
    specification = {
        'definitions': {'datasets': datasets},
        'instructions': instructions,
    }
    with open(f'../immuneML/yaml_specifications/{yaml_file_name}', 'w') as out_file:
        yaml.dump(
            specification,
            out_file,
            default_flow_style=False,
            sort_keys=False,
            Dumper=NoAliasDumper,
        )

# There is a problem; maybe I need to use tsv, not csv.

In [231]:
def metadata_to_tsv(files_test_csv):
    """
    Converts metadata test files from csv to tsv and saves them to the folder test_tsv in the same root folder as metadata.

    A file '<root>/<folder>/<name>.csv' is written to '<root>/test_tsv/<name>.tsv'. The test_tsv folder is
    created when it does not exist yet. '<name>' is the file name up to its first '.'. The DataFrame index is
    written as the first column of the tsv file (pandas default).

    Arguments:
        files_test_csv: list of paths of all test files in the metadata folder.
    """
    for test_path in files_test_csv:
        source = Path(test_path)
        df_i = pd.read_csv(source)

        # Sibling folder of the one holding the csv file.
        out_dir = source.parent.with_name('test_tsv')
        # Without this, to_csv fails on the first run because the folder is missing.
        out_dir.mkdir(parents=True, exist_ok=True)

        file_name = source.name.split('.')[0] + '.tsv'
        df_i.to_csv(out_dir / file_name, sep='\t')

### High-low against pw-high

In [None]:
# Collect every '*_test*' file of the high_pairwise metadata folder and convert it to tsv.
files_test_csv = glob.glob(
    str(Path(config.DATA_BASE_PATH / 'full_data/high_pairwise/metadata/')) + '/*_test*'
)
metadata_to_tsv(files_test_csv=files_test_csv)

In [225]:
# Models from 'high_low_svm_out' applied to the test files in 'full_data/high_pairwise/test_tsv/'.
datasets_high_low_vs_high, instructions_high_low_vs_high = create_test_yaml(
    models_root='high_low_svm_out',
    data_folder_path='full_data/high_pairwise/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'high_low_svm_{0}_{1}_{2}'.format(conf_ag[0],path_ag[0],path_ag[1])",
    test_run=False,
)

In [226]:
# Write the specification to 'high_low_vs_high_svm_test.yaml'.
yaml_dumping(
    yaml_file_name='high_low_vs_high_svm_test.yaml',
    datasets=datasets_high_low_vs_high,
    instructions=instructions_high_low_vs_high,
)

In [227]:
# Models from 'high_low_knn_out' applied to the test files in 'full_data/high_pairwise/test_tsv/'.
datasets_high_low_vs_high_knn, instructions_high_low_vs_high_knn = create_test_yaml(
    models_root='high_low_knn_out',
    data_folder_path='full_data/high_pairwise/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'high_low_knn_{0}_{1}_{2}'.format(conf_ag[0],path_ag[0],path_ag[1])",
)

In [228]:
# Write the specification to 'high_low_vs_high_knn_test.yaml'.
yaml_dumping(
    yaml_file_name='high_low_vs_high_knn_test.yaml',
    datasets=datasets_high_low_vs_high_knn,
    instructions=instructions_high_low_vs_high_knn,
)

In [229]:
# Models from 'high_low_rf_out' applied to the test files in 'full_data/high_pairwise/test_tsv/'.
datasets_high_low_vs_high_rf, instructions_high_low_vs_high_rf = create_test_yaml(
    models_root='high_low_rf_out',
    data_folder_path='full_data/high_pairwise/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'high_low_rf_{0}_{1}_{2}'.format(conf_ag[0],path_ag[0],path_ag[1])",
)

In [230]:
# Write the specification to 'high_low_vs_high_rf_test.yaml'.
yaml_dumping(
    yaml_file_name='high_low_vs_high_rf_test.yaml',
    datasets=datasets_high_low_vs_high_rf,
    instructions=instructions_high_low_vs_high_rf,
)

### Test pw_high against low binders

In [240]:
# Collect every '*_test*' file of the high_low_concat metadata folder.
files_pw_high_test_csv = glob.glob(
    str(Path(config.DATA_BASE_PATH / 'full_data/high_low_concat/metadata/')) + '/*_test*'
)

In [241]:
# Convert the collected high_low_concat test files from csv to tsv.
metadata_to_tsv(files_test_csv=files_pw_high_test_csv)

In [None]:
# The inline csv -> tsv conversion loop that was kept here as an inert string
# literal duplicated metadata_to_tsv(); call metadata_to_tsv(files_pw_high_test_csv) instead.

In [242]:
# Models from 'pw_high_knn_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_pw_high_vs_low_knn, instructions_pw_high_vs_low_knn = create_test_yaml(
    models_root='pw_high_knn_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_knn_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[0])",
)

In [243]:
# Write the specification to 'pw_high_vs_low_knn_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_vs_low_knn_test.yaml',
    datasets=datasets_pw_high_vs_low_knn,
    instructions=instructions_pw_high_vs_low_knn,
)

In [244]:
# Models from 'pw_high_svm_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_pw_high_vs_low_svm, instructions_pw_high_vs_low_svm = create_test_yaml(
    models_root='pw_high_svm_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_svm_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[0])",
)

In [245]:
# Write the specification to 'pw_high_vs_low_svm_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_vs_low_svm_test.yaml',
    datasets=datasets_pw_high_vs_low_svm,
    instructions=instructions_pw_high_vs_low_svm,
)

In [246]:
# Models from 'pw_high_rf_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_pw_high_vs_low_rf, instructions_pw_high_vs_low_rf = create_test_yaml(
    models_root='pw_high_rf_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_rf_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[0])",
)

In [247]:
# Write the specification to 'pw_high_vs_low_rf_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_vs_low_rf_test.yaml',
    datasets=datasets_pw_high_vs_low_rf,
    instructions=instructions_pw_high_vs_low_rf,
)

### 1_vs_all against high-low

In [248]:
# Models from '1_vs_all_knn_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_1all_vs_low_knn, instructions_1all_vs_low_knn = create_test_yaml(
    models_root='1_vs_all_knn_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'1_vs_all_knn_{0}_vs_high_low_{1}'.format(conf_ag[0], path_ag[0])",
)

In [249]:
# Write the specification to '1all_vs_high-low_knn_test.yaml'.
yaml_dumping(
    yaml_file_name='1all_vs_high-low_knn_test.yaml',
    datasets=datasets_1all_vs_low_knn,
    instructions=instructions_1all_vs_low_knn,
)

In [250]:
# Models from '1_vs_all_svm_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_1all_vs_low_svm, instructions_1all_vs_low_svm = create_test_yaml(
    models_root='1_vs_all_svm_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'1_vs_all_svm_{0}_vs_high_low_{1}'.format(conf_ag[0], path_ag[0])",
)

In [251]:
# Write the specification to '1all_vs_high-low_svm_test.yaml'.
yaml_dumping(
    yaml_file_name='1all_vs_high-low_svm_test.yaml',
    datasets=datasets_1all_vs_low_svm,
    instructions=instructions_1all_vs_low_svm,
)

In [252]:
# Models from '1_vs_all_rf_out' applied to the test files in 'full_data/high_low_concat/test_tsv/'.
datasets_1all_vs_low_rf, instructions_1all_vs_low_rf = create_test_yaml(
    models_root='1_vs_all_rf_out',
    data_folder_path='full_data/high_low_concat/test_tsv/',
    data_file_str_pattern="'/*{0}*_test*'.format(conf_ag[0])",
    spec_name="'1_vs_all_rf_{0}_vs_high_low_{1}'.format(conf_ag[0], path_ag[0])",
)

In [253]:
# Write the specification to '1all_vs_high-low_rf_test.yaml'.
yaml_dumping(
    yaml_file_name='1all_vs_high-low_rf_test.yaml',
    datasets=datasets_1all_vs_low_rf,
    instructions=instructions_1all_vs_low_rf,
)

### Creating cross pairwise high-binder test datasets
such that the negative dataset was not used for training the model

In [268]:
# Models from 'pw_high_rf_out' applied to the test files in 'full_data/high_pairwise/test_tsv/cross_pw_high'.
datasets_pw_high_cross_rf, instructions_pw_high_cross_rf = create_test_yaml(
    models_root='pw_high_rf_out',
    data_folder_path='full_data/high_pairwise/test_tsv/cross_pw_high',
    data_file_str_pattern="'/{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_cross_rf_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[1])",
)


In [269]:
# Write the specification to 'pw_high_cross_rf_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_cross_rf_test.yaml',
    datasets=datasets_pw_high_cross_rf,
    instructions=instructions_pw_high_cross_rf,
)

In [270]:
# Models from 'pw_high_knn_out' applied to the test files in 'full_data/high_pairwise/test_tsv/cross_pw_high'.
datasets_pw_high_cross_knn, instructions_pw_high_cross_knn = create_test_yaml(
    models_root='pw_high_knn_out',
    data_folder_path='full_data/high_pairwise/test_tsv/cross_pw_high',
    data_file_str_pattern="'/{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_cross_knn_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[1])",
)


In [271]:
# Write the specification to 'pw_high_cross_knn_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_cross_knn_test.yaml',
    datasets=datasets_pw_high_cross_knn,
    instructions=instructions_pw_high_cross_knn,
)

In [272]:
# Models from 'pw_high_svm_out' applied to the test files in 'full_data/high_pairwise/test_tsv/cross_pw_high'.
datasets_pw_high_cross_svm, instructions_pw_high_cross_svm = create_test_yaml(
    models_root='pw_high_svm_out',
    data_folder_path='full_data/high_pairwise/test_tsv/cross_pw_high',
    data_file_str_pattern="'/{0}*_test*'.format(conf_ag[0])",
    spec_name="'pw_high_cross_svm_{0}_{1}_vs_{2}'.format(conf_ag[0], conf_ag[1], path_ag[1])",
)


In [273]:
# Write the specification to 'pw_high_cross_svm_test.yaml'.
yaml_dumping(
    yaml_file_name='pw_high_cross_svm_test.yaml',
    datasets=datasets_pw_high_cross_svm,
    instructions=instructions_pw_high_cross_svm,
)