In [1]:
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, f1_score, auc, roc_auc_score, precision_score, balanced_accuracy_score, fbeta_score, make_scorer
from sklearn.inspection import permutation_importance

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC


In [3]:
def scoring(clf, X, y):
    scores = {}
    y_pred = clf.predict(X)
    scores['recall'] = recall_score(y, y_pred)
    scores['accuracy'] = accuracy_score(y, y_pred)
    scores['f1'] = f1_score(y, y_pred)
    scores['precision'] = precision_score(y, y_pred)
    scores['balanced_accuracy'] = balanced_accuracy_score(y, y_pred)
    
    return scores

sub_folders = [f for f in os.listdir('.') if os.path.isdir(f) and 'no_' in f]

model_scores = []
for dataset in ['CATH', 'COMA', 'ENSEMBL', 'SCOP', 'TCPH', 'TESMA']:
    dataset_scores = {}
    
    data_file = f'data_{dataset}.json'
    model_folder = f'no_{dataset}'
    
    with open(f'./{model_folder}/forest_model.pkl', 'rb') as model_file:
        forest_model = pickle.load(model_file)
    with open(f'./{model_folder}/logreg_model.pkl', 'rb') as model_file:
        logreg_model = pickle.load(model_file)
    with open(f'./{model_folder}/svc_model.pkl', 'rb') as model_file:
        svc_model = pickle.load(model_file)
    with open(f'./{model_folder}/tree_model.pkl', 'rb') as model_file:
        tree_model = pickle.load(model_file)
    
    features = pd.read_csv(f'./features_{dataset}.csv', index_col=0)
    labels = pd.read_csv(f'./labels_{dataset}.csv', index_col=0).to_numpy().ravel()
    
    dataset_scores['DecisionTree'] = scoring(tree_model, features, labels)
    model_scores.append([dataset, 'DecisionTree', 
                        dataset_scores['DecisionTree']['recall'], dataset_scores['DecisionTree']['accuracy'], 
                        dataset_scores['DecisionTree']['precision'], dataset_scores['DecisionTree']['balanced_accuracy'], 
                        dataset_scores['DecisionTree']['f1']])
    
    dataset_scores['RandomForest'] = scoring(tree_model, features, labels)
    model_scores.append([dataset, 'RandomForest', 
                        dataset_scores['RandomForest']['recall'], dataset_scores['RandomForest']['accuracy'], 
                        dataset_scores['RandomForest']['precision'], dataset_scores['RandomForest']['balanced_accuracy'], 
                        dataset_scores['RandomForest']['f1']])
    
    dataset_scores['SVC'] = scoring(tree_model, features, labels)
    model_scores.append([dataset, 'SVC', 
                        dataset_scores['SVC']['recall'], dataset_scores['SVC']['accuracy'], 
                        dataset_scores['SVC']['precision'], dataset_scores['SVC']['balanced_accuracy'], 
                        dataset_scores['SVC']['f1']])
    
    dataset_scores['LogisticRegression'] = scoring(tree_model, features, labels) 
    model_scores.append([dataset, 'LogisticRegression', 
                        dataset_scores['LogisticRegression']['recall'], dataset_scores['LogisticRegression']['accuracy'], 
                        dataset_scores['LogisticRegression']['precision'], dataset_scores['LogisticRegression']['balanced_accuracy'], 
                        dataset_scores['LogisticRegression']['f1']])   
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [9]:
scores_df = pd.DataFrame(model_scores, columns=['DataSet', 'ModelType', 'Recall', 'Accuracy', 'Precision', 'BalancedAccuracy', 'F1'])

In [10]:
scores_df

Unnamed: 0,DataSet,ModelType,Recall,Accuracy,Precision,BalancedAccuracy,F1
0,CATH,DecisionTree,0.0,0.579274,0.0,0.579274,0.0
1,CATH,RandomForest,0.0,0.579274,0.0,0.579274,0.0
2,CATH,SVC,0.0,0.579274,0.0,0.579274,0.0
3,CATH,LogisticRegression,0.0,0.579274,0.0,0.579274,0.0
4,COMA,DecisionTree,0.0,0.25,0.0,0.25,0.0
5,COMA,RandomForest,0.0,0.25,0.0,0.25,0.0
6,COMA,SVC,0.0,0.25,0.0,0.25,0.0
7,COMA,LogisticRegression,0.0,0.25,0.0,0.25,0.0
8,ENSEMBL,DecisionTree,0.467,0.562598,0.078313,0.518538,0.134133
9,ENSEMBL,RandomForest,0.467,0.562598,0.078313,0.518538,0.134133


In [6]:
def probabilistic_scoring(clf, X, y, threshold=0.5):
    y_pred = clf.predict_proba(X)
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    
    for predicted, actual in zip(y_pred, y):
        if predicted[1] >= threshold and actual == 1:
            tp += 1
        elif predicted[1] < threshold and actual == 1:
            fn += 1
        elif predicted[1] >= threshold and actual == 0:
            fp += 1
        elif predicted[1] < threshold and actual == 0:
            tn += 1
        else:
            print('WTF')
    
    precision = tp / (tp + fp) if (tp + fp)  != 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0.0
    f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0.0
    
    scores = {
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1': f1
    }
    return scores


sub_folders = [f for f in os.listdir('.') if os.path.isdir(f) and 'no_' in f]

model_scores = []
# for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
for threshold in [0.5]:
    for dataset in ['CATH', 'COMA', 'ENSEMBL', 'SCOP', 'TCPH']:
        dataset_scores = {}
        
        data_file = f'data_{dataset}.json'
        model_folder = f'no_{dataset}'
        
        with open(f'./{model_folder}/forest_model.pkl', 'rb') as model_file:
            forest_model = pickle.load(model_file)
        with open(f'./{model_folder}/logreg_model.pkl', 'rb') as model_file:
            logreg_model = pickle.load(model_file)
        with open(f'./{model_folder}/svc_model.pkl', 'rb') as model_file:
            svc_model = pickle.load(model_file)
        with open(f'./{model_folder}/tree_model.pkl', 'rb') as model_file:
            tree_model = pickle.load(model_file)
        
        features = pd.read_csv(f'./features_{dataset}.csv', index_col=0)
        labels = pd.read_csv(f'./labels_{dataset}.csv', index_col=0).to_numpy().ravel()
        
        dataset_scores['DecisionTree'] = probabilistic_scoring(tree_model, features, labels, threshold=threshold)
        model_scores.append([dataset, 'DecisionTree', threshold,
                            dataset_scores['DecisionTree']['recall'], dataset_scores['DecisionTree']['accuracy'], 
                            dataset_scores['DecisionTree']['precision'], dataset_scores['DecisionTree']['f1']])
        
        dataset_scores['RandomForest'] = probabilistic_scoring(forest_model, features, labels, threshold=threshold)
        model_scores.append([dataset, 'RandomForest', threshold,
                            dataset_scores['RandomForest']['recall'], dataset_scores['RandomForest']['accuracy'], 
                            dataset_scores['RandomForest']['precision'], dataset_scores['RandomForest']['f1']])
        
        dataset_scores['SVC'] = probabilistic_scoring(svc_model, features, labels, threshold=threshold)
        model_scores.append([dataset, 'SVC', threshold,
                            dataset_scores['SVC']['recall'], dataset_scores['SVC']['accuracy'], 
                            dataset_scores['SVC']['precision'], dataset_scores['SVC']['f1']])
        
        dataset_scores['LogisticRegression'] = probabilistic_scoring(logreg_model, features, labels, threshold=threshold) 
        model_scores.append([dataset, 'LogisticRegression', threshold,
                            dataset_scores['LogisticRegression']['recall'], dataset_scores['LogisticRegression']['accuracy'], 
                            dataset_scores['LogisticRegression']['precision'], dataset_scores['LogisticRegression']['f1']])   
    
prob_scores_df = pd.DataFrame(model_scores, columns=['DataSet', 'ModelType', 'Threshold', 'Recall', 'Accuracy', 'Precision', 'F1'])


In [7]:
prob_scores_df.to_csv('prob_scores.csv')

In [9]:
def probabilistic_scoring(clf, X, y, threshold=0.5):
    y_pred = clf.predict_proba(X)
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    
    for predicted, actual in zip(y_pred, y):
        if predicted[1] >= threshold and actual == 1:
            tp += 1
        elif predicted[1] < threshold and actual == 1:
            fn += 1
        elif predicted[1] >= threshold and actual == 0:
            fp += 1
        elif predicted[1] < threshold and actual == 0:
            tn += 1
        else:
            print('WTF')
    
    precision = tp / (tp + fp) if (tp + fp)  != 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0.0
    f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0.0
    
    scores = {
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1': f1
    }
    return scores


sub_folders = [f for f in os.listdir('.') if os.path.isdir(f) and 'no_' in f]

model_scores = []
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    for dataset in ['CATH', 'CENSUS', 'ENSEMBL', 'SCOP', 'TCPH']:
        dataset_scores = {}
        
        data_file = f'data_{dataset}.json'
        model_folder = f'no_{dataset}'
        
        features = pd.read_csv(f'./features_{dataset}.csv', index_col=0)
        labels = pd.read_csv(f'./labels_{dataset}.csv', index_col=0).to_numpy().ravel()
        
        with open(f'./{model_folder}/tree_model_exclusioncriteria.pkl', 'rb') as model_file:
            tree_model = pickle.load(model_file)
        dataset_scores['DecisionTree'] = probabilistic_scoring(tree_model, features, labels, threshold=threshold)
        model_scores.append([dataset, 'DecisionTree', threshold,
                            dataset_scores['DecisionTree']['recall'], dataset_scores['DecisionTree']['accuracy'], 
                            dataset_scores['DecisionTree']['precision'], dataset_scores['DecisionTree']['f1']])

        # with open(f'./{model_folder}/forest_model_smallestvalues.pkl', 'rb') as model_file:
        #     forest_model = pickle.load(model_file)        
        # dataset_scores['RandomForest'] = probabilistic_scoring(tree_model, features, labels, threshold=threshold)
        # model_scores.append([dataset, 'RandomForest', threshold,
        #                     dataset_scores['RandomForest']['recall'], dataset_scores['RandomForest']['accuracy'], 
        #                     dataset_scores['RandomForest']['precision'], dataset_scores['RandomForest']['f1']])

        # with open(f'./{model_folder}/forest_model_smallestvalues.pkl', 'rb') as model_file:
        #     forest_model = pickle.load(model_file)        
        # dataset_scores['SVC'] = probabilistic_scoring(tree_model, features, labels, threshold=threshold)
        # model_scores.append([dataset, 'SVC', threshold,
        #                     dataset_scores['SVC']['recall'], dataset_scores['SVC']['accuracy'], 
        #                     dataset_scores['SVC']['precision'], dataset_scores['SVC']['f1']])
        
        with open(f'./{model_folder}/logreg_model_exclusioncriteria.pkl', 'rb') as model_file:
            logreg_model = pickle.load(model_file)  
        dataset_scores['LogisticRegression'] = probabilistic_scoring(logreg_model, features, labels, threshold=threshold) 
        model_scores.append([dataset, 'LogisticRegression', threshold,
                            dataset_scores['LogisticRegression']['recall'], dataset_scores['LogisticRegression']['accuracy'], 
                            dataset_scores['LogisticRegression']['precision'], dataset_scores['LogisticRegression']['f1']])   
    
prob_scores_df = pd.DataFrame(model_scores, columns=['DataSet', 'ModelType', 'Threshold', 'Recall', 'Accuracy', 'Precision', 'F1'])
prob_scores_df.to_csv('prob_scores_exclusioncriteria.csv')