In [1]:
#basic imports - numpy, pandas, matplotlib, etc.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from math import exp
from tableone import TableOne

#sklearn packages for support vector machine
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

#import joblib to save sklearn models
from joblib import dump, load

#plotly for figures
import plotly.express as px

In [2]:
#define function for converting string embedding back to np array
def embedding_to_array(embedding_string):
    """Convert the text form of an embedding back into a 1-D float array.

    This function:
         -accepts a string embedding, e.g. "[0.1 0.2 0.3]" or "[0.1, 0.2, 0.3]"
         -treats square brackets, commas and line breaks as separators
         -converts the remaining whitespace-separated numbers to an np array
    Numpy imported as np required.

    Parameters
    ----------
    embedding_string : str
        Bracketed list of numbers separated by spaces, commas and/or
        line breaks.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype float; empty when the string holds no numbers.

    Raises
    ------
    ValueError
        If any token left after removing the separators is not a number.
    """

    #replace every structural character with a space so that values split by
    #only a line break or a comma are still seen as separate tokens
    cleaned = embedding_string
    for separator in ("[", "]", ",", "\n"):
        cleaned = cleaned.replace(separator, " ")

    #str.split() with no argument collapses runs of whitespace
    return np.array(cleaned.split(), dtype = float)

In [3]:
#load the full embedded criteria table; the 'Embedding' column is stored as
#text in the csv, so it is parsed back into np arrays while reading
criteria_csv_path = "/Users/Sam/Dropbox/Capstone/jupyter_notebooks/criteria_final_embedded.csv"
criteria_master = pd.read_csv(
    criteria_csv_path,
    converters = {'Embedding': embedding_to_array},
)

#preview the first rows
criteria_master.head()

Unnamed: 0,Criteria,Label,Embedding
0,active bacterial fungal infec unresponsive med...,Active Infection,"[0.02945438, 0.10728448, 0.16200222, 0.1811302..."
1,active bacterial fungal infec unresponsive med...,Active Infection,"[0.02945438, 0.10728448, 0.16200222, 0.1811302..."
2,active bacterial fungal infec unresponsive med...,Active Infection,"[0.02945438, 0.10728448, 0.16200222, 0.1811302..."
3,active bacterial fungal infec unresponsive med...,Active Infection,"[0.02945438, 0.10728448, 0.16200222, 0.1811302..."
4,active bacterial fungal infec unresponsive med...,Active Infection,"[0.02945438, 0.10728448, 0.16200222, 0.1811302..."


In [5]:
#model inputs (one embedding per row) and targets (the class label per row)
X, y = criteria_master['Embedding'], criteria_master['Label']

In [9]:
%%time
#iterate through different classes
for label in pd.unique(criteria_master.Label):
    
    #if not 'Other'
    if label not in ['Active Infection', 'Age', 'Blood Counts', 'Cardiovascular Function', 'CNS Involvement', 'Marrow Blast Percentage', 'Prior Antileukemic Therapy', 'Diagnosis', 'Patient Comorbidities', 'Molecular Features', 'Prior SCT', 'Allergy/Hypersensitivity', 'Informed Consent', 'Resistance/Relapse/Remission', 'SCT Eligible', 'Other']:
        
        #print message
        print(f"Working on class: {label}")
        
        #start df for specified label by copying given data df
        binary_y = pd.DataFrame({'Label':list(y)})

        #set up binary dataframe
        binary_y['Label'] = np.where(binary_y['Label'] == label, 1, 0)

        #search across different cost values - weighted based on differing scales of inverse class frequencies
        class_freq = Counter(criteria_master['Label'])[label] / len(criteria_master)
        class_odds = class_freq / (1 - class_freq)
        balance = [{0:(class_odds*100),1:1}, {0:(class_odds*10),1:1}, {0:1,1:1}, {0:1,1:(class_odds*10)}, {0:1,1:(class_odds*100)}]
        param_grid = dict(class_weight = balance)
        grid_centered = False
        grid_search_counter = 1
        print(f"Grid search {grid_search_counter}...")
        
        #instantiate model w/ linear kernel, probability estimation, and class balancing
        svm = SVC(kernel = 'linear', probability = True)

        #perform grid search
        clf = GridSearchCV(svm, param_grid, n_jobs = -1, cv = 5, scoring = 'roc_auc')
        clf.fit(list(X), binary_y.Label)

        #extend grid search to left or right if necessary
        while grid_centered == False:
            
            #if left most value found, check next left value
            if clf.best_params_['class_weight'] == balance[0]:

                #add to counter
                grid_search_counter += 1
                print(f"Grid search {grid_search_counter} leftward...")
                
                #reset balance
                balance = [{0:(balance[0][0]*10),1:1}, balance[0]]
                param_grid = dict(class_weight = balance)

                #instantiate model w/ linear kernel, probability estimation, and class balancing
                svm = SVC(kernel = 'linear', probability = True)

                #perform grid search
                clf = GridSearchCV(svm, param_grid, n_jobs = -1, cv = 5, scoring = 'roc_auc')
                clf.fit(list(X), binary_y.Label)

                #check if grid result is same as before
                if (clf.best_params_['class_weight'] == balance[-1]) or (grid_search_counter > 4):

                    #for testing
                    print(balance)
                    print(clf.best_params_)
                    
                    #end loop
                    grid_centered = True

                    #save model as .joblib
                    if "/" in label:
                        new_label = label.replace("/", "")
                        model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + new_label + ".joblib"
                        dump(clf, model_path)
                    else:
                        model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + label + ".joblib"
                        dump(clf, model_path)                    
                    
            #if right most value found, check next left value
            elif clf.best_params_['class_weight'] == balance[-1]:

                #add to counter
                grid_search_counter += 1
                print(f"Grid search {grid_search_counter} rightward...")
                
                #reset balance
                balance = [balance[-1], {0:1,1:(balance[-1][1]*10)}]
                param_grid = dict(class_weight = balance)

                #instantiate model w/ linear kernel, probability estimation, and class balancing
                svm = SVC(kernel = 'linear', probability = True)

                #perform grid search
                clf = GridSearchCV(svm, param_grid, n_jobs = -1, cv = 5, scoring = 'roc_auc')
                clf.fit(list(X), binary_y.Label)

                #check if grid result is same as before
                if (clf.best_params_['class_weight'] == balance[0]) or (grid_search_counter > 4):
                    
                    #for testing
                    print(balance)
                    print(clf.best_params_)
                    
                    #end loop
                    grid_centered = True

                    #save model as .joblib
                    if "/" in label:
                        new_label = label.replace("/", "")
                        model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + new_label + ".joblib"
                        dump(clf, model_path)
                    else:
                        model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + label + ".joblib"
                        dump(clf, model_path)  
                    
                                        
            #if grid is centered already, end loop
            else:
                
                #for testing
                print(balance)
                print(clf.best_params_)
                
                #end loop
                grid_centered = True

                #save model as .joblib
                if "/" in label:
                    new_label = label.replace("/", "")
                    model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + new_label + ".joblib"
                    dump(clf, model_path)
                else:
                    model_path = "/Users/Sam/Dropbox/Capstone/classifier_models/" + label + ".joblib"
                    dump(clf, model_path)                 
                
        #print message
        print("Grid search and evaluation complete.\n")

Working on class: Hepatic Function
Grid search 1...
Grid search 2 leftward...
Grid search 3 leftward...
Grid search 4 leftward...
Grid search 5 leftward...
[{0: 50410.08201640326, 1: 1}, {0: 5041.008201640327, 1: 1}]
{'class_weight': {0: 50410.08201640326, 1: 1}}
Grid search and evaluation complete.

Working on class: Life Expectancy
Grid search 1...
Grid search 2 rightward...
[{0: 1, 1: 1.8622696411251212}, {0: 1, 1: 18.622696411251212}]
{'class_weight': {0: 1, 1: 1.8622696411251212}}
Grid search and evaluation complete.

Working on class: Concurrent Medications
Grid search 1...
Grid search 2 rightward...
Grid search 3 rightward...
[{0: 1, 1: 48.941270475429484}, {0: 1, 1: 489.41270475429485}]
{'class_weight': {0: 1, 1: 48.941270475429484}}
Grid search and evaluation complete.

Working on class: Performance Status
Grid search 1...
Grid search 2 leftward...
[{0: 43.31412676336181, 1: 1}, {0: 4.331412676336181, 1: 1}]
{'class_weight': {0: 4.331412676336181, 1: 1}}
Grid search and evalua