In [None]:
def svmModel(X, y, svm):
    """Cross-validate ``svm`` on ``(X, y)`` with a shuffled 5-fold split.

    On every fold the classifier is refit on the training rows; its hard
    predictions and decision-function values on the held-out rows are then
    turned into per-fold metrics through ``appendScores``.

    Parameters
    ----------
    X : feature matrix; must support row selection with the index arrays
        produced by ``KFold.split``.
    y : labels; indexed with the same index arrays.
    svm : classifier exposing ``fit``, ``predict`` and ``decision_function``.
        It is refit in place on every fold.

    Returns
    -------
    summary_df : the table built by ``getScores`` from the per-fold scores.
    predictions : held-out predictions of all folds, appended fold after fold
        (i.e. in fold order rather than in the row order of ``X``).
    """
    # fixed seed, so every call splits the rows in the same way
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)

    # six running score arrays, in the order appendScores / getScores expect:
    # accuracy, precision, recall, f1, specificity, auc
    fold_scores = tuple(np.array([]) for _ in range(6))
    predictions = np.array([])

    # iterate over the 5 folds
    for fit_rows, held_out_rows in splitter.split(X):

        # refit on this fold's training rows
        svm.fit(X[fit_rows], y[fit_rows])

        # hard predictions on the held-out rows
        X_held_out = X[held_out_rows]
        y_held_out = y[held_out_rows]
        fold_predictions = svm.predict(X_held_out)
        predictions = np.append(predictions, fold_predictions)

        # continuous decision values feed the AUC computation
        fold_margins = svm.decision_function(X_held_out)
        fold_scores = appendScores(y_held_out, fold_predictions, fold_margins, *fold_scores)

    # summarise the per-fold scores for this model
    return getScores(*fold_scores), predictions

In [None]:
def appendScores(y_test, y_predicted, y_scores, accuracies, precisions, recalls, f1s, specificities, aucs):
    """Score one fold and append each metric to its running array.

    ``calculateScores`` is evaluated on the fold's true labels, predicted
    labels and decision scores; each of the six resulting values is appended
    to the matching history array.

    Returns
    -------
    tuple
        The six extended arrays, in the order
        (accuracies, precisions, recalls, f1s, specificities, aucs).
        ``np.append`` builds new arrays, so the inputs are not modified.
    """
    fold_accuracy, fold_precision, fold_recall, fold_f1, fold_specificity, fold_auc = calculateScores(
        y_test, y_predicted, y_scores)

    # pair every history with the score that belongs to it
    history_and_score = (
        (accuracies, fold_accuracy),
        (precisions, fold_precision),
        (recalls, fold_recall),
        (f1s, fold_f1),
        (specificities, fold_specificity),
        (aucs, fold_auc),
    )

    return tuple(np.append(history, score) for history, score in history_and_score)

In [None]:
def calculateScores(y_true, y_predicted, y_scores):
    """Compute the six evaluation metrics for one set of predictions.

    Parameters
    ----------
    y_true : true labels.
    y_predicted : predicted labels, compared with ``y_true`` through the
        confusion matrix.
    y_scores : continuous scores handed to ``roc_auc_score``.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1, specificity, auc). Precision,
        recall, F1 and specificity are reported as 0 when their denominator
        is 0.

    Notes
    -----
    The confusion matrix is unpacked into exactly four cells, so it has to be
    2x2; any other shape fails at the unpacking step.
    """
    def _ratio(numerator, denominator):
        # an undefined ratio (empty denominator) is reported as 0
        return 0 if denominator == 0 else numerator / denominator

    # true-negative, false-positive, false-negative and true-positive counts
    tn, fp, fn, tp = confusion_matrix(y_true, y_predicted).ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)

    # harmonic mean of precision and recall
    f1 = _ratio(2 * precision * recall, precision + recall)

    # area under the ROC curve, from the continuous scores
    auc = roc_auc_score(y_true, y_scores)

    return accuracy, precision, recall, f1, specificity, auc

In [None]:
def getScores(accuracies, precisions, recalls, f1s, specificities, aucs):
    """Summarise per-fold scores as a mean / standard-deviation table.

    Each argument is a collection of per-fold values for one metric.

    Returns
    -------
    pandas.DataFrame
        One row per metric ("accuracy", "precision", "recall", "F1",
        "specificity", "AUC") with the columns "mean" and "STD"
        (``np.mean`` and ``np.std`` of the corresponding input).
    """
    # row label -> per-fold values; insertion order fixes the row order
    per_metric = {
        "accuracy": accuracies,
        "precision": precisions,
        "recall": recalls,
        "F1": f1s,
        "specificity": specificities,
        "AUC": aucs,
    }

    return pd.DataFrame(
        {
            "mean": [np.mean(values) for values in per_metric.values()],
            "STD": [np.std(values) for values in per_metric.values()],
        },
        index=list(per_metric),
    )

In [None]:
def sanityCheck4(predictions):
    """Print the approximate percentage of '1' and '0' predictions.

    The share of ones is taken as the mean of ``predictions`` and the share
    of zeros as its complement, so the report is only meaningful for
    predictions coded as 0/1.

    Parameters
    ----------
    predictions : sequence or array of predicted labels.

    Returns
    -------
    None
        The two percentages are printed, not returned.

    Raises
    ------
    ValueError
        If ``predictions`` is empty (the mean would be undefined).
    """
    if len(predictions) == 0:
        raise ValueError("predictions is empty; cannot compute prediction ratios")

    # round instead of truncating: 0.29 * 100 is 28.999... in floating point,
    # which int() would report as 28
    ones_percent = int(round(float(np.mean(predictions)) * 100))

    # derive the complement from the rounded value so both lines sum to 100
    zeros_percent = 100 - ones_percent

    print(f"'1' predictions: around {ones_percent}%")
    print(f"'0' predictions: around {zeros_percent}%")

    return None

In [None]:
def featureSelect(processed_X, processed_bi_y, n_features, feature_names=None):
    """Pick ``n_features`` features with sequential feature selection.

    An RBF-kernel SVC (C=1, gamma=1) is wrapped in a
    ``SequentialFeatureSelector`` and fitted on the given data.

    Parameters
    ----------
    processed_X : feature matrix passed to the selector.
    processed_bi_y : labels passed to the selector.
    n_features : number of features to keep (``n_features_to_select``).
    feature_names : optional sequence of names, one per column of
        ``processed_X``. When omitted, the module-level ``feature_list`` is
        used, which is what this function always did before the parameter
        existed.

    Returns
    -------
    feature_selection : array with the names of the selected features.
    selection_mask : the selector's support mask over the columns.
    """
    if feature_names is None:
        # backward-compatible default: names defined at notebook level
        feature_names = feature_list

    # create and fit the feature selector
    svm = SVC(kernel='rbf', C=1, gamma=1)
    sfs = SequentialFeatureSelector(svm, n_features_to_select=n_features)
    sfs.fit(processed_X, processed_bi_y)

    # translate the support mask into feature names
    selection_mask = sfs.get_support()
    feature_selection = np.array(feature_names)[selection_mask]

    return feature_selection, selection_mask

In [None]:
def evaluateFeatureSet(processed_X, processed_bi_y, selection_mask):
    """Cross-validate an RBF SVC on the selected columns only.

    Parameters
    ----------
    processed_X : 2-D feature matrix (indexed as ``[:, selection_mask]``).
    processed_bi_y : labels forwarded to ``svmModel``.
    selection_mask : column selector applied to ``processed_X``.

    Returns
    -------
    tuple
        (summary_df, predictions, new_feature_X): the two results of
        ``svmModel`` followed by the reduced feature matrix.
    """
    # drop every column that was not selected
    reduced_X = processed_X[:, selection_mask]

    # evaluate a fresh classifier on the reduced matrix
    classifier = SVC(kernel='rbf', C=1, gamma=1)
    scores_table, held_out_predictions = svmModel(reduced_X, processed_bi_y, classifier)

    return scores_table, held_out_predictions, reduced_X

In [None]:
def gridSearch(final_X, processed_bi_y):
    """Grid-search RBF SVC hyper-parameters and print the best settings.

    Every (C, gamma) combination of the grid is cross-validated with
    ``svmModel``; the combination with the highest mean accuracy and the one
    with the highest mean AUC are tracked separately.

    Parameters
    ----------
    final_X : feature matrix forwarded to ``svmModel``.
    processed_bi_y : labels forwarded to ``svmModel``.

    Returns
    -------
    None
        The best scores and their parameters are printed, not returned.
    """
    # parameters to grid search
    param_grid = {'C': [0.01, 0.05, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10], 'kernel': ['rbf']}

    # Start below any attainable score and pre-bind the parameter holders, so
    # the report below never hits an unbound name, even if every score is 0.
    best_accuracy = float("-inf")
    best_auc = float("-inf")
    best_parameters_accuracy = None
    best_parameters_auc = None

    # iterate over the parameter grid
    for g in ParameterGrid(param_grid):

        # cross-validate the model and read its mean scores once
        current_model_scores, _ = svmModel(final_X, processed_bi_y, SVC(**g))
        mean_scores = current_model_scores.to_dict()["mean"]
        current_accuracy = mean_scores["accuracy"]
        current_auc = mean_scores["AUC"]

        # keep the first combination reaching the best accuracy
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            best_parameters_accuracy = g

        # keep the first combination reaching the best AUC
        if current_auc > best_auc:
            best_auc = current_auc
            best_parameters_auc = g

    # report scores to user
    print(f'Best accuracy is {best_accuracy} for {best_parameters_accuracy} SVM parameters.')
    print(f'Best AUC is {best_auc} for {best_parameters_auc} SVM parameters.')

    return None

In [None]:
def permutationTest(n_permutations, X, y, model):
    """Estimate permutation p-values for every cross-validated score.

    The model is cross-validated once on the true labels and then
    ``n_permutations`` times on randomly permuted labels. For each score type
    the p-value is ``(k + 1) / (n_permutations + 1)``, where ``k`` is the
    number of permutations whose mean score is at least the true mean score.

    Parameters
    ----------
    n_permutations : number of label permutations to evaluate.
    X : feature matrix forwarded to ``svmModel``.
    y : labels; permuted with ``np.random.permutation`` (not seeded here, so
        results vary between calls unless the caller seeds NumPy).
    model : classifier forwarded to ``svmModel``, which refits it in place;
        after this call it is left fitted on whatever data it saw last.

    Returns
    -------
    pandas.DataFrame
        One row per score type with a single "p value" column.
    """
    score_types = ["accuracy", "precision", "recall", "F1", "specificity", "AUC"]

    # A single cross-validation run already yields every score type, so it is
    # run once rather than once per score type.
    true_scores_df, _ = svmModel(X, y, model)
    true_scores = true_scores_df.to_dict()["mean"]

    # collect the mean scores obtained on permuted labels
    perm_scores = {score_type: [] for score_type in score_types}

    for _ in range(n_permutations):
        permuted_y = np.random.permutation(y)
        perm_scores_df = svmModel(X, permuted_y, model)[0]
        perm_means = perm_scores_df.to_dict()["mean"]

        for score_type in score_types:
            perm_scores[score_type].append(perm_means[score_type])

    # calculate p values for all scores
    p_values = []
    for score_type in score_types:
        at_least_true_count = (np.array(perm_scores[score_type]) >= true_scores[score_type]).sum()
        # the +1 terms count the true labelling as one of the permutations,
        # so the p-value can never be exactly 0
        p_values.append((at_least_true_count + 1) / (n_permutations + 1))

    # create the evaluation dataframe
    summary_df = pd.DataFrame({"p value": p_values}, index=score_types)

    return summary_df

In [None]:
def finalModel(final_X, processed_bi_y, new_C, new_gamma):
    """Fit the final RBF-kernel SVC on all of the given data.

    Parameters
    ----------
    final_X : feature matrix used for fitting.
    processed_bi_y : labels used for fitting.
    new_C : value for the SVC ``C`` parameter.
    new_gamma : value for the SVC ``gamma`` parameter.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    # fit() hands back the estimator itself, so build, fit and return in one go
    return SVC(kernel='rbf', C=new_C, gamma=new_gamma).fit(final_X, processed_bi_y)