In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [4]:
class ML(object):
    def load_ml_params(self, fl):
        """Build an unfitted RandomForestRegressor from a saved parameter file.

        The file is scanned line by line.  Blank lines and lines starting
        with ``#`` are skipped.  Of the remaining lines only those whose
        first TAB-separated field is ``rf_best-params`` are used; the rest
        of such a line must have the form ``key|value``.  The value of each
        recognised key is converted from text to a Python object and passed
        to the estimator as a keyword argument.  If a key occurs more than
        once, the last occurrence wins.

        ``random_state`` and ``warm_start`` entries, like any unrecognised
        key, are skipped and never forwarded to the estimator.  Recognised
        keys that are absent from the file are not passed at all, so the
        estimator's own defaults apply to them.

        Parameters
        ----------
        fl : str or path-like
            Path of the text file holding the parameters.

        Returns
        -------
        RandomForestRegressor
            A new, unfitted estimator configured from the file.

        Raises
        ------
        ValueError
            If an ``rf_best-params`` line lacks the ``|`` separator, if a
            value cannot be converted, or if the file contains no usable
            ``rf_best-params`` entry at all.
        """

        def as_bool(text):
            # bool("False") is True because every non-empty string is truthy,
            # so the flag has to be decoded from its spelling instead.
            lowered = text.strip().lower()
            if lowered in ('true', '1'):
                return True
            if lowered in ('false', '0'):
                return False
            raise ValueError('expected True or False, got {!r}'.format(text))

        def as_number(text):
            # Text made only of digits becomes an int, anything else a float.
            return int(text) if text.isdecimal() else float(text)

        def or_none(convert):
            # Wrap a converter so that the literal text "None" maps to None.
            def wrapped(text):
                return None if text == 'None' else convert(text)
            return wrapped

        def as_max_features(text):
            if text in ('sqrt', 'log2', 'auto'):
                return text
            return or_none(as_number)(text)

        # Keys that are forwarded to the estimator, with their text decoders.
        # 'random_state' and 'warm_start' are left out on purpose: they are
        # not passed to the estimator.
        converters = {
            'bootstrap': as_bool,
            'ccp_alpha': float,
            'criterion': str,
            'max_depth': or_none(int),
            'max_features': as_max_features,
            'max_leaf_nodes': or_none(int),
            'max_samples': or_none(as_number),
            'min_impurity_decrease': float,
            'min_samples_leaf': as_number,
            'min_samples_split': as_number,
            'min_weight_fraction_leaf': float,
            'n_estimators': int,
            'n_jobs': or_none(int),
            'oob_score': as_bool,
            'verbose': int,
        }

        params = {}
        # The context manager closes the file even if a line fails to parse.
        with open(fl, 'r') as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip()
                if not line or line.startswith('#'):
                    continue
                # partition() never raises, so unrelated lines with zero or
                # several TABs are skipped instead of aborting the load.
                tag, _, payload = line.partition('\t')
                if tag != 'rf_best-params':
                    continue
                key, sep, val = payload.partition('|')
                if not sep:
                    raise ValueError(
                        "{}:{}: expected 'key|value' after the tag, got {!r}"
                        .format(fl, lineno, payload))
                convert = converters.get(key)
                if convert is None:
                    continue
                try:
                    params[key] = convert(val)
                except ValueError as err:
                    raise ValueError(
                        '{}:{}: bad value {!r} for parameter {!r}: {}'
                        .format(fl, lineno, val, key, err)) from err

        if not params:
            raise ValueError(
                "no usable 'rf_best-params' entries found in {!r}".format(fl))

        mdl = RandomForestRegressor(**params)
        return mdl
    
    def load_dataset(self, fl):
        """Read an Excel sheet and split it into features, target and labels.

        The sheet must contain a ``Label`` column, which becomes the row
        index, and an ``AUC`` column, which is returned separately as the
        target.

        Parameters
        ----------
        fl : str, path-like or file-like
            Passed unchanged to ``pandas.read_excel``.

        Returns
        -------
        tuple
            ``(X, y, z)`` where ``X`` is the DataFrame without the ``AUC``
            column, ``y`` is the ``AUC`` column and ``z`` is the ``Label``
            index as a NumPy array of strings.
        """
        table = pd.read_excel(fl).set_index("Label")

        # Row labels as a plain string array.
        labels = table.index.to_numpy(dtype=str)

        # Separate the target column from the remaining feature columns.
        target = table['AUC']
        features = table.drop(columns='AUC')

        return features, target, labels
    
    def plot_obs_pred(self, obs, pred, title, ymax, linear_reg, f):
        """Plot predicted against observed binding AUC values.

        Uses the module-level ``plt`` interface: sets the global font size,
        fixes both axes to ``0 .. ymax + 50``, draws a dashed red identity
        line, then adds either a plain scatter (``linear_reg == False``) or a
        seaborn regression plot (``linear_reg == True``), followed by axis
        labels, the title and tick marks every 50 units.

        NOTE(review): ``plt`` is not imported in the visible import cell; it
        is presumably ``matplotlib.pyplot`` imported elsewhere -- confirm.

        Parameters
        ----------
        obs, pred
            Observed and predicted values, plotted on x and y respectively.
        title
            Title handed to ``plt.title``.
        ymax
            Largest value expected on the axes; ``0`` selects 450.
        linear_reg
            ``True`` for a regression plot, ``False`` for a plain scatter.
        f
            Returned unchanged; this method does not use it otherwise.

        Returns
        -------
        The ``f`` argument, untouched.
        """
        plt.rcParams['font.size'] = 16

        # A ymax of 0 stands for the default of 450; the axes always get 50
        # units of head room on top of that.
        upper = (450 if ymax == 0 else ymax) + 50

        plt.xlim(0, upper)
        plt.ylim(0, upper)
        plt.plot([0, upper], [0, upper], '--', linewidth=3, color='red')

        # Kept as two explicit comparisons: a value equal to neither True nor
        # False draws no data points at all.
        if linear_reg == False:
            plt.plot(obs, pred, 'o', alpha=0.5, markersize=10)
        if linear_reg == True:
            scatter_style = {'alpha': 0.5, 's': 100}
            line_style = {'linewidth': 4}
            sns.regplot(x=obs, y=pred, scatter_kws=scatter_style,
                        line_kws=line_style, truncate=False)

        plt.xlabel('Binding AUC observed')
        plt.ylabel('Binding AUC predicted')
        plt.title(title)

        ticks = np.arange(0, upper, 50)
        plt.xticks(ticks, rotation=90, fontsize=12)
        plt.yticks(ticks, fontsize=12)
        return f