In [243]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn import metrics

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [247]:
class theData:
    """
        theData class implements all required methods to read the input data and perform any processing methods.
        
        Class parameters:
            xRaw: Actual inputs
            xTitles: Label per input dimension
            
            xParsed: Covnert non numerical input to numerical (index of related label)
            xLabels: The unique labels used in the input features (xRaw) - if xParced[i] = j, then xRaw[i] = Labels[j]

            cutoff: Threshold used to convert output data to categorical (1/0)

            tTitles: Label of each output dimension
            tRaw: Raw output vector (MFIs)
            tParsed: Output coverted to categorical (1/0) - tParsed[i] = tRaw[i]>=cutoff ? 1 : 0
            
        Supported methods (public):
            readData: Import all the data (X and y)
            setNewCutoff: Recompute tParsed with a new cutoff
            
    """
    def __init__(self, _defaultCutoff = 1200):
        #initialize properties
        self.xRaw = []
        self.xTitles = []
        self.xLabels = []
        self.xParsed = []

        self.cutoff = _defaultCutoff

        self.tTitles = []
        self.tRaw = []
        self.tParsed = []

    """
        Method: readData
    """ 
    def readData(self, _path,  _inputCols, _outputCols, _sheetToRead=0):
        """
            Read an excel/csv file to extract the table data and produce the features.
            Parse data and convert non numerical data to numerical. Patients' records are stored horizontaly (Rows)
            
            _path (str): FULL path of the excel file to read - Read permission is assumed
            _inputCols (str): Columns to read and produce the input vector, eg. D:AF meaning read cols from D to AF
            _outputCols (str): Columns to read and produce the output vector. 
                               IMPORTANT: Each column will be a independent categorization -> experiment
            _sheetToRead (int): Which sheet to read (1 for the first sheet, etc.)
        """
        #
        # INPUT
        #
        df = pd.read_excel(io=_path, header=0, usecols=_inputCols, sheet_name=_sheetToRead, dtype=str)
        df.fillna('0', inplace=True)
        self.xRaw = df.values  # input
        self.xTitles = df.columns
        # Convert input to numerical data (-> replace with index of the labels matrix)
        T = np.concatenate(self.xRaw, axis=0)
        self.xLabels = np.unique(T)
        self.xParsed = np.copy(self.xRaw)
        for i in range(0,self.xLabels.shape[0]):
            self.xParsed[self.xParsed == self.xLabels[i]] = i

        #
        # OUTPUT
        #
        df = pd.read_excel(io=_path, header=0, usecols=_outputCols, sheet_name=_sheetToRead, dtype=int)
        df.fillna(-1, inplace=True)
        self.tRaw = df.values  # input
        self.tTitles = df.columns
        # Convert continues data to categorical (TRUE/FALSE)
        self.tParsed = np.copy(self.tRaw)
        self.tParsed[self.tRaw >= self.cutoff] = 1
        self.tParsed[self.tRaw <  self.cutoff] = 0
    
    """
        Method: setNewCutoff
    """ 
    def setNewCutoff(self, _newCutoff):
        """
            Set a new cutoff and reassign categories
        """
        self.cutoff = _newCutoff
        self.tParsed = np.copy(self.tRaw)
        self.tParsed[self.tRaw >= self.cutoff] = 1
        self.tParsed[self.tRaw <  self.cutoff] = 0

In [253]:
dt = theData(1200)
dt.readData("./data/LSA1_DATA.xlsx", "G:L", "S:DK")

In [309]:
class theStudy:
    """
        theStudy class implements all the analysis proposed in the related paper.
        
        Class parameters:
            data: The input data (observations + labels)
            cvTestSize: Percentage of test size in a fold of a k-fold cross validation
            cvIterations: How many iterations to perform of a k-fold cross validation
            
        Supported methods (public):
            prepareCrossValidation: Prepare the Cross Validation Tests
            doAnalyze: Perform the prediction analysis
    """
    def __init__(self, _data, _cvTestSize=.3, _cvIterations=10):
        #initialize dataset
        self.data = _data        
        #initialize cross validation
        self.prepareCrossValidation(_cvTestSize, _cvIterations)
        
    """
        Method: prepareCrossValidation
    """ 
    def prepareCrossValidation(self, _cvTestSize=.3, _cvIterations=10):
        """
            Prepare the Cross Validation Tests
            
            _cvTestSize (float in (0,1)): Percentage of test size in a fold of a k-fold cross validation
            _cvIterations (int in [1,10]): How many iterations to perform of a k-fold cross validation
        """
        #initialize cross validation configuration
        if _cvTestSize<=0 or _cvTestSize >=1:
            _cvTestSize = .3
        _cvIterations = int(_cvIterations)
        if _cvIterations<=0 or _cvTestSize >=10:
            _cvTestSize = 10
        self.cvTestSize = _cvTestSize
        self.cvIterations = _cvIterations
        self.cv = ShuffleSplit(n_splits=_cvIterations, test_size=_cvTestSize, random_state=0)
    """
        Method: doAnalyze
    """ 
    def doAnalyze(self, _mfiCol, _data=None, _clf=None):
        """
            Perform the prediction analysis
            
            _mfiCol (int): Which column to analyze
            _data (class theData): Reassign input data.
        """
        if not (_data is None):
            self.data = _data
        # initialize classifier
        if _clf is None:
            clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 3), max_iter=5000, random_state=0)
        else:
            clf = _clf
        # Cross Validation -> compute fitting scores
        precission = np.zeros((len(_mfiCol), self.cvIterations))
        recall = np.zeros((len(_mfiCol), self.cvIterations))
        f1 = np.zeros((len(_mfiCol), self.cvIterations))
        for i in range(0,len(_mfiCol)):
            precission[i,:] = cross_val_score(clf, self.data.xParsed, self.data.tParsed[:,_mfiCol[i]], cv=self.cv, scoring='accuracy')
            recall[i,:] = cross_val_score(clf, self.data.xParsed, self.data.tParsed[:,_mfiCol[i]], cv=self.cv, scoring='recall')
            f1[i,:] = cross_val_score(clf, self.data.xParsed, self.data.tParsed[:,_mfiCol[i]], cv=self.cv, scoring='f1')
        return(precission, recall, f1)

In [310]:
cutoffs = [1000, 1100, 1200, 1300, 1400, 1500, 1700, 2000]

mfis = [0,1,2,3,4,5]

for j in range(0,len(cutoffs)):
    dt.setNewCutoff(cutoffs[j])
    study = theStudy(dt)
    (P, R, F1) = study.doAnalyze(mfis)
    for i in range(0,len(mfis)):
        m_max = np.max(F1[i,:])
        i_max = np.where(F1[i,:] == m_max)
        i_max = i_max[0][0]

        print('%d: %s - P/R/F1: %.2f/%.2f/%.2f' % (cutoffs[j], dt.tTitles[i], P[i,i_max], R[i,i_max], m_max))
    print('-------------------------------------------')

1000: A*01:01 - P/R/F1: 0.81/0.00/0.00
1000: A*02:01 - P/R/F1: 0.85/0.00/0.00
1000: A*02:03 - P/R/F1: 0.83/0.00/0.00
1000: A*02:06 - P/R/F1: 0.84/0.00/0.00
1000: A*03:01 - P/R/F1: 0.87/0.00/0.00
1000: A*11:01 - P/R/F1: 0.88/0.00/0.00
-------------------------------------------
1100: A*01:01 - P/R/F1: 0.81/0.00/0.00
1100: A*02:01 - P/R/F1: 0.80/0.03/0.06
1100: A*02:03 - P/R/F1: 0.85/0.00/0.00
1100: A*02:06 - P/R/F1: 0.85/0.00/0.00
1100: A*03:01 - P/R/F1: 0.88/0.00/0.00
1100: A*11:01 - P/R/F1: 0.88/0.00/0.00
-------------------------------------------
1200: A*01:01 - P/R/F1: 0.81/0.00/0.00
1200: A*02:01 - P/R/F1: 0.86/0.00/0.00
1200: A*02:03 - P/R/F1: 0.85/0.00/0.00
1200: A*02:06 - P/R/F1: 0.86/0.00/0.00
1200: A*03:01 - P/R/F1: 0.88/0.00/0.00
1200: A*11:01 - P/R/F1: 0.88/0.00/0.00
-------------------------------------------
1300: A*01:01 - P/R/F1: 0.82/0.00/0.00
1300: A*02:01 - P/R/F1: 0.86/0.00/0.00
1300: A*02:03 - P/R/F1: 0.86/0.00/0.00
1300: A*02:06 - P/R/F1: 0.87/0.00/0.00
1300: A*03