In [1]:
from titrato.reports import SAMPL6DataProvider, SAMPL6ReportGenerator
import pandas as pd
import os
import io

In [2]:
# =============================================================================
# UTILITY CLASSES
# =============================================================================

class IgnoredSubmissionError(Exception):
    """Raised for a submission that has to be skipped instead of analyzed."""


class BadFormatError(Exception):
    """Raised when a submission file does not follow the expected format."""


class SamplSubmission:
    """A generic SAMPL submission.

    The submission file name (without its extension) is split on ``-``:
    the first field is used as the receipt ID and the second field as the
    integer challenge ID. A first field equal to ``DELETED`` marks a deleted
    submission.

    Parameters
    ----------
    file_path : str
        The path to the submission file.
    user_map : pandas.DataFrame
        Table of submissions. The columns ``receipt_id``, ``id``,
        ``firstname``, ``lastname``, ``uid`` and ``component`` are read, and
        exactly one row must match the receipt ID of the file.

    Raises
    ------
    IgnoredSubmissionError
        If the submission was deleted or its receipt ID is listed in
        ``TEST_SUBMISSIONS``.
    AssertionError
        If the challenge ID is not handled by this class, or the user map
        does not contain exactly one consistent record for the submission.
    """
    # The D3R challenge IDs that are handled by this class.
    CHALLENGE_IDS = {976}

    # The IDs of the submissions used for testing the validation.
    # An empty *set* (``{}`` would be a dict), to be overridden by subclasses.
    TEST_SUBMISSIONS = set()

    # Names of the sections expected in the submission file.
    SECTIONS = set()

    # Sections in CSV format, mapped to their column names.
    CSV_SECTIONS = {}

    def __init__(self, file_path, user_map):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        print(file_name)
        file_data = file_name.split('-')

        # Check if this is a deleted submission.
        if file_data[0] == 'DELETED':
            raise IgnoredSubmissionError('This submission was deleted.')

        # Check if this is a test submission.
        self.receipt_id = file_data[0]
        if self.receipt_id in self.TEST_SUBMISSIONS:
            raise IgnoredSubmissionError('This submission has been used for tests.')

        # Check this is the correct challenge.
        self.challenge_id = int(file_data[1])
        assert self.challenge_id in self.CHALLENGE_IDS, \
            'Unexpected challenge ID {}.'.format(self.challenge_id)

        # Store user map information.
        user_map_record = user_map[user_map.receipt_id == self.receipt_id]
        assert len(user_map_record) == 1, \
            'Expected exactly one user map record for receipt ID {}, found {}.'.format(
                self.receipt_id, len(user_map_record))
        user_map_record = user_map_record.iloc[0]

        self.id = user_map_record.id
        self.participant = user_map_record.firstname + ' ' + user_map_record.lastname
        self.participant_id = user_map_record.uid
        assert self.challenge_id == user_map_record.component, \
            'Challenge ID in the file name does not match the user map record.'

    @classmethod
    def _read_lines(cls, file_path):
        """Generator yielding the stripped lines of the file.

        Blank lines and lines whose first non-blank character is ``#`` are
        skipped. The file is decoded as ``utf-8-sig`` so that a leading
        byte-order mark does not end up in the first line.
        """
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#'):
                    yield line

    @classmethod
    def _load_sections(cls, file_path):
        """Load the data in the file and separate it by sections.

        Parameters
        ----------
        file_path : str
            The path to the submission file.

        Returns
        -------
        sections : dict
            Maps each section name to the list of its lines, or, for the
            sections listed in ``CSV_SECTIONS``, to a pandas DataFrame
            indexed by the first configured column.

        Raises
        ------
        BadFormatError
            If content appears before the first section header or if any
            of the expected sections has no content.
        """
        sections = {}
        current_section = None
        for line in cls._read_lines(file_path):
            # A header is a section name followed by one extra character.
            # NOTE(review): presumably a trailing ':' - confirm against the
            # submission template.
            header = line[:-1]
            if header in cls.SECTIONS:
                current_section = header
                continue
            if current_section is None:
                # Fail loudly instead of dropping into an interactive debugger.
                raise BadFormatError(
                    'Found content before the first section header: {!r}.'.format(line))
            sections.setdefault(current_section, []).append(line)

        # Check that all the sections have been loaded.
        missing_sections = set(cls.SECTIONS) - set(sections)
        if missing_sections:
            raise BadFormatError(
                'Missing sections: {}.'.format(', '.join(sorted(missing_sections))))

        # Create a Pandas dataframe from the CSV format.
        for section_name, columns in cls.CSV_SECTIONS.items():
            csv_str = io.StringIO('\n'.join(sections[section_name]))
            id_column = columns[0]
            sections[section_name] = pd.read_csv(
                csv_str, index_col=id_column, names=columns, skipinitialspace=True)
        return sections

    @classmethod
    def _create_comparison_dataframe(cls, column_name, submission_data, experimental_data):
        """Create a single dataframe with submission and experimental data.

        Parameters
        ----------
        column_name : str
            The column to extract from both dataframes.
        submission_data : pandas.DataFrame
            The submitted values.
        experimental_data : pandas.DataFrame
            The experimental values. Only rows whose index label also
            appears in ``submission_data`` are kept.

        Returns
        -------
        pandas.DataFrame
            Two columns, ``'<column_name> (expt)'`` and
            ``'<column_name> (calc)'``, in this order.
        """
        # Keep only the experimental rows whose index is in the submission.
        in_submission = experimental_data.index.isin(submission_data.index)
        experimental_data = experimental_data[in_submission]

        # rename() returns new Series, so the input frames keep their labels
        # and repeated calls do not stack up suffixes.
        submission_series = submission_data[column_name].rename(column_name + ' (calc)')
        experimental_series = experimental_data[column_name].rename(column_name + ' (expt)')

        # Concatenate the two columns into a single dataframe.
        return pd.concat([experimental_series, submission_series], axis=1)

    
# =============================================================================
# PKA PREDICTION CHALLENGE
# =============================================================================

class pKaTypeIISubmission(SamplSubmission):
    """A submission for pKa challenge with type II format (relative population of microscopic species).

    The file name (without extension) is expected to look like
    ``<receipt_id>-<challenge_id>-typeII-<name>-<index>``.

    Parameters
    ----------
    file_path : str
        The path to the submission file
    user_map : pandas.DataFrame
        Table of submissions, forwarded to the parent class.

    Raises
    ------
    IgnoredSubmissionError
        If the submission ID is among the ignored submissions.
    ValueError
        If the file name does not contain a name and an integer index
        after the submission type.

    """

    # The D3R challenge IDs that are handled by this class.
    CHALLENGE_IDS = {976}
    # Misspelled name used previously; kept as an alias for existing references.
    CHALLANGE_IDS = CHALLENGE_IDS

    # The IDs of the submissions that will be ignored in the analysis.
    TEST_SUBMISSIONS = set()

    # Section of the submission file.
    SECTIONS = {'Predictions', 'Name', 'Software', 'Method'}

    # Sections in CSV format with columns names: the microstate ID followed by
    # the labels '2.00', '2.10', ..., '12.00' (101 labels in steps of 0.10).
    CSV_SECTIONS = {
        'Predictions': ("Microstate ID",) + tuple(
            '{:.2f}'.format(tenths / 10) for tenths in range(20, 121))
    }

    def __init__(self, file_path, user_map):
        super().__init__(file_path, user_map)

        file_name = os.path.splitext(os.path.basename(file_path))[0]
        file_data = file_name.split('-')

        # Check that this is a type II submission.
        if len(file_data) < 5:
            raise ValueError(
                'Expected a file name of the form '
                '<receipt>-<challenge>-typeII-<name>-<index>, got {!r}.'.format(file_name))
        self.submission_type = file_data[2]
        assert self.submission_type in ['typeII'], \
            'Unexpected submission type {!r}.'.format(self.submission_type)

        # The last field is the index; everything between the type and the
        # index is the name, so a name containing '-' does not break parsing.
        self.file_name = '-'.join(file_data[3:-1])
        self.index = int(file_data[-1])

        # Load predictions.
        sections = self._load_sections(file_path)  # From parent-class.
        self.data = sections['Predictions']  # This is a pandas DataFrame.
        self.name = sections['Name'][0]

        print("\npKaTypeIISubmission.name: \n", self.name)
        print("\npKaTypeIISubmission.data: \n", self.data)

In [3]:
# Read the user map table used to look up the submission by receipt ID.
user_map_path = 'SAMPL6_user_map_pKa.csv'
with open(user_map_path, 'r') as user_map_file:
    user_map = pd.read_csv(user_map_file)

# Parse one type II submission file.
file_path = "../typeII_predictions/74mf6-976-typeII-Iorga-1.csv"
submission = pKaTypeIISubmission(file_path, user_map)

74mf6-976-typeII-Iorga-1

pKaTypeIISubmission.name: 
 microscopic populations from pKa prediction with Gaussian and global fitting

pKaTypeIISubmission.data: 
                        2.00          2.10          2.20          2.30  \
Microstate ID                                                           
SM15_micro003 -6.160000e-04 -7.760000e-04 -9.770000e-04 -1.230000e-03   
SM15_micro002 -7.390000e+00 -7.160000e+00 -6.930000e+00 -6.700000e+00   
SM15_micro001 -2.380000e+01 -2.350000e+01 -2.330000e+01 -2.310000e+01   
SM15_micro004 -2.910000e+01 -2.860000e+01 -2.810000e+01 -2.770000e+01   
SM20_micro004 -1.460000e+01 -1.440000e+01 -1.410000e+01 -1.390000e+01   
SM20_micro003 -4.570000e-07 -5.750000e-07 -7.240000e-07 -9.120000e-07   
SM22_micro002 -1.770000e+00 -1.960000e+00 -2.160000e+00 -2.370000e+00   
SM22_micro001 -2.030000e-01 -1.670000e-01 -1.380000e-01 -1.140000e-01   
SM22_micro003 -4.350000e+00 -4.310000e+00 -4.280000e+00 -4.260000e+00   
SM22_micro004 -1.380000e+01 -1.360000

In [4]:
# Save predictions section as a separate CSV file as required for titrato code
csv_dir = "./csv_predictions"
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists on a fresh checkout.
os.makedirs(csv_dir, exist_ok=True)
csv_path = csv_dir + "/" + submission.receipt_id + ".csv"
submission.data.to_csv(csv_path)
print("Output file created: ", csv_path)
submission.data

Output file created:  ./csv_predictions/74mf6.csv


Unnamed: 0_level_0,2.00,2.10,2.20,2.30,2.40,2.50,2.60,2.70,2.80,2.90,...,11.10,11.20,11.30,11.40,11.50,11.60,11.70,11.80,11.90,12.00
Microstate ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SM15_micro003,-0.000616,-0.000776,-0.000977,-0.00123,-0.00155,-0.00195,-0.00245,-0.00309,-0.00388,-0.00489,...,-14.0,-14.3,-14.6,-14.9,-15.3,-15.6,-16.0,-16.4,-16.8,-17.2
SM15_micro002,-7.39,-7.16,-6.93,-6.7,-6.47,-6.24,-6.01,-5.78,-5.55,-5.32,...,-0.399,-0.48,-0.575,-0.682,-0.802,-0.936,-1.08,-1.24,-1.41,-1.59
SM15_micro001,-23.8,-23.5,-23.3,-23.1,-22.8,-22.6,-22.4,-22.2,-21.9,-21.7,...,-16.8,-16.9,-16.9,-17.1,-17.2,-17.3,-17.5,-17.6,-17.8,-18.0
SM15_micro004,-29.1,-28.6,-28.1,-27.7,-27.2,-26.8,-26.3,-25.8,-25.4,-24.9,...,-1.11,-0.964,-0.828,-0.705,-0.595,-0.498,-0.414,-0.342,-0.28,-0.229
SM20_micro004,-14.6,-14.4,-14.1,-13.9,-13.7,-13.4,-13.2,-13.0,-12.8,-12.5,...,-0.00174,-0.00138,-0.0011,-0.000871,-0.000692,-0.000549,-0.000436,-0.000347,-0.000275,-0.000219
SM20_micro003,-4.57e-07,-5.75e-07,-7.24e-07,-9.12e-07,-1e-06,-1e-06,-2e-06,-2e-06,-3e-06,-4e-06,...,-6.36,-6.59,-6.82,-7.05,-7.28,-7.51,-7.74,-7.97,-8.2,-8.43
SM22_micro002,-1.77,-1.96,-2.16,-2.37,-2.58,-2.8,-3.01,-3.23,-3.46,-3.68,...,-29.8,-30.3,-30.8,-31.2,-31.7,-32.1,-32.6,-33.1,-33.5,-34.0
SM22_micro001,-0.203,-0.167,-0.138,-0.114,-0.0944,-0.0787,-0.0661,-0.0559,-0.0478,-0.0413,...,-7.32,-7.55,-7.78,-8.01,-8.24,-8.47,-8.7,-8.93,-9.16,-9.39
SM22_micro003,-4.35,-4.31,-4.28,-4.26,-4.24,-4.22,-4.21,-4.2,-4.19,-4.19,...,-11.5,-11.7,-11.9,-12.2,-12.4,-12.6,-12.8,-13.1,-13.3,-13.5
SM22_micro004,-13.8,-13.6,-13.3,-13.1,-12.8,-12.6,-12.3,-12.1,-11.8,-11.6,...,-0.000671,-0.000533,-0.000423,-0.000336,-0.000267,-0.000212,-0.000169,-0.000134,-0.000106,-8.4e-05


In [5]:
# Experimental data
# The third positional argument is the description echoed by `method_desc` below.
# NOTE(review): n_samples=3 looks like a quick-run setting for the bootstrap - confirm
# the value wanted for a final report.
exp = SAMPL6DataProvider("SAMPL6_experimental_pkas.csv", "exp", "Experiment", 
                              bootstrap_options={"n_samples": 3})
print(exp.method_desc)

Experiment


In [6]:
# Type II prediction
# Reads the CSV written from the submission's Predictions section (csv_path).
# NOTE(review): "method-name" looks like a placeholder description - confirm
# whether the submission's own name should be used instead.
typeII_pred = SAMPL6DataProvider(csv_path, "typeii", "method-name")
typeII_pred

<titrato.sampl.SAMPL6DataProvider at 0x11ce85e48>

In [7]:
# Show the path stored by the provider (displayed as an absolute path in the recorded run).
typeII_pred.file_path

'/Users/isikm/lab/SAMPL6-repos/sampl6-physicochemical-properties/analysis_of_pKa_predictions/analysis_of_typeII_predictions/linear_free_energy_vs_pH_plots/csv_predictions/74mf6.csv'

In [8]:
# Show the data type tag the provider was created with.
typeII_pred.data_type

'typeii'

In [9]:
# Show the method description the provider was created with.
typeII_pred.method_desc

'method-name'

In [10]:
# Inspect a private attribute of the provider; the recorded run displays no output for it.
# NOTE(review): relies on a non-public name - presumably exploratory, consider removing.
typeII_pred._typeiii_charge_file

In [11]:
# Inputs for the report: the ID used for the report and image file names,
# and the list of prediction providers to include.
testmol = "SM15"
predictions = [typeII_pred]

In [12]:
# Build the report for `testmol` from the experimental data and the predictions.
reporter = SAMPL6ReportGenerator(testmol, exp, predictions, "./Images/Molecules/{}.pdf".format(testmol))
try:
    reporter.make_all_plots()
    reporter.generate_latex()
    reporter.save_all("Reports", ext="pdf")
finally:
    # Always release the reporter, even if plotting or saving fails.
    reporter.close()