In [218]:
import re

from sklearn.svm import SVC
import os
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import KernelPCA, PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, SVMSMOTE
import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import make_scorer, roc_auc_score  # added

In [302]:
# Directories holding the saved embedding files, one per class and split.
base_path_normal_train = r'Saved_Embeddings/Normal/'
base_path_abnormal_train = r'Saved_Embeddings/Abnormal/'
base_path_normal_test = r'Saved_Embeddings/Normal_test/'
base_path_abnormal_test = r'Saved_Embeddings/Abnormal_test/'
# Feature flags read by the pipeline cell; each one switches an optional preprocessing step on or off.
use_dimensionality_reduction = False
use_scaler = True
use_age_gender_data = False
use_oversampling = True

In [303]:
def _load_class_embeddings(directory, label, metadata):
    """Load every embedding file in ``directory`` and tag it with ``label`` plus its age/gender."""
    vectors, labels, ages, genders = [], [], [], []
    # os.listdir() returns entries in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(directory)):
        age, gender = None, None
        # The sample ID is every digit found in the file name, concatenated.
        sample_id = int(''.join(filter(str.isdigit, file_name)))
        matches = metadata.loc[metadata['ID'] == sample_id]
        if len(matches):
            # Rows are read positionally as (ID, gender, age). Only the first match is used so a
            # duplicated ID no longer breaks the three-way unpack.
            _, gender, age = list(matches.values[0])
        # os.path.join works whether or not ``directory`` ends with a path separator.
        vectors.append(np.load(os.path.join(directory, file_name)))
        labels.append(label)
        ages.append(age)
        genders.append(gender)
    return vectors, labels, ages, genders


def load_all_image_embeddings(base_path_normal, base_path_abnormal, age_gender_data):
    """Load normal and abnormal embeddings together with labels, ages and genders.

    Parameters
    ----------
    base_path_normal, base_path_abnormal : str
        Directories containing one ``np.load``-able embedding file per sample.
    age_gender_data : sequence of two DataFrames
        ``(normal_metadata, abnormal_metadata)``. Each frame needs an ``'ID'`` column and its
        rows are unpacked positionally as ``(ID, gender, age)``.

    Returns
    -------
    tuple
        ``(embeddings, labels, age_list, gender_list)``: the embeddings stacked with
        ``np.vstack``, an array of labels (0 for normal, 1 for abnormal) and two lists that hold
        ``None`` for samples whose ID is absent from the metadata. Normal samples come first;
        within each class, samples follow sorted file-name order.

    Raises
    ------
    ValueError
        If a file name contains no digits, or (from ``np.vstack``) if both directories are empty.
    """
    embeddings, labels, age_list, gender_list = [], [], [], []
    class_sources = ((base_path_normal, 0, age_gender_data[0]),
                     (base_path_abnormal, 1, age_gender_data[1]))
    for directory, label, metadata in class_sources:
        vectors, class_labels, ages, genders = _load_class_embeddings(directory, label, metadata)
        embeddings.extend(vectors)
        labels.extend(class_labels)
        age_list.extend(ages)
        gender_list.extend(genders)
    return np.vstack(embeddings), np.array(labels), age_list, gender_list


def load_all_data():
    """Load the train and test embeddings along with their labels, ages and genders.

    Reads the four age/gender sheets, then loads the train and test embedding directories
    named by the module-level ``base_path_*`` settings.

    Returns
    -------
    tuple
        ``(X_train, labels_train, X_test, labels_test, age_list_train, age_list_test,
        gender_list_train, gender_list_test)``.
    """
    meta_normal_train, meta_abnormal_train, meta_normal_test, meta_abnormal_test = add_age_gender_data()

    train_bundle = load_all_image_embeddings(
        base_path_normal_train, base_path_abnormal_train, (meta_normal_train, meta_abnormal_train))
    test_bundle = load_all_image_embeddings(
        base_path_normal_test, base_path_abnormal_test, (meta_normal_test, meta_abnormal_test))

    X_train, labels_train, age_list_train, gender_list_train = train_bundle
    X_test, labels_test, age_list_test, gender_list_test = test_bundle
    return (X_train, labels_train, X_test, labels_test,
            age_list_train, age_list_test, gender_list_train, gender_list_test)


def do_scaling(X_train, X_test, method='standard'):
    """Scale the train and test features with a scaler fitted on the training data only.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices. The scaler is fitted on ``X_train`` and applied to both.
    method : {'standard', 'minmax'}
        ``'standard'`` uses ``StandardScaler``; ``'minmax'`` uses ``MinMaxScaler``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. The previous version returned an error string
        here, which only failed later when the caller unpacked it into two arrays.
    """
    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError(f"Not a valid method: {method!r} (expected 'standard' or 'minmax')")
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


def svm_model(X_train, Y_train, X_test, Y_test, params):
    """Fit an SVC on the training data and print train/test accuracy and F1.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels.
    X_test, Y_test : array-like
        Held-out features and labels.
    params : dict
        Must provide the keys ``'kernel'``, ``'gamma'`` and ``'C'``.

    Returns
    -------
    tuple
        ``(Y_test, predicted_test_labels)``.
    """
    classifier = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'])
    classifier.fit(X_train, Y_train)
    test_predictions = classifier.predict(X_test)
    train_predictions = classifier.predict(X_train)

    # Dump the raw predictions next to the ground truth before the summary block.
    print(test_predictions)
    print(Y_test)
    print('================================= Performance ==========================')
    print(params)

    train_accuracy = accuracy_score(Y_train, train_predictions)
    print("Train Accuracy :- ", str(train_accuracy))
    test_accuracy = accuracy_score(Y_test, test_predictions)
    print("Test Accuracy :- ", str(test_accuracy))

    test_f1 = f1_score(Y_test, test_predictions, average='binary',
                       labels=np.unique(test_predictions))
    print('Test F1Score :- ', str(test_f1))
    train_f1 = f1_score(Y_train, train_predictions, average='binary',
                        labels=np.unique(train_predictions))
    print('Train F1Score :- ', str(train_f1))

    print('=' * 50)
    return Y_test, test_predictions


def grid_search_parameters(Data_X, Data_Y):
    """Exhaustively search SVC hyper-parameters and return the best combination by F1.

    Parameters
    ----------
    Data_X, Data_Y : array-like
        Features and labels used for the cross-validated search.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV`` (keys ``'C'``, ``'gamma'``, ``'kernel'``).
    """
    # Only the sigmoid kernel is searched here; an ['rbf', 'linear', 'poly'] grid was left
    # disabled in the original cell.
    param_grid = {'C': [0.1, 1, 10, 0.01, .001],
                  'gamma': [100, 10, 1, 0.1, 0.01, 0.001],
                  'kernel': ['sigmoid']}
    # custom_scorer wraps f1_score so a fold on which the metric raises scores NaN instead of
    # aborting the whole search.
    fs = make_scorer(custom_scorer, actual_scorer=f1_score)
    # A RandomizedSearchCV used to be built here and immediately overwritten by this
    # GridSearchCV; that unused construction has been removed.
    grid = GridSearchCV(SVC(), param_grid, verbose=3, scoring=fs)
    grid.fit(Data_X, Data_Y)
    return grid.best_params_

# Precision, recall and F1 below all use average='binary'.
def calculate_all_acc_parameters(predicted_labels, y_true):
    """Return ``(accuracy, f1, recall, precision)`` for a set of predictions.

    Note the argument order: predictions first, ground truth second.
    """
    # NOTE(review): scikit-learn documents `labels` as applying when average != 'binary', so
    # this list probably has no effect here - confirm before relying on it.
    lbls = [0, 1]
    accuracy = accuracy_score(y_true, predicted_labels)
    binary_scores = tuple(
        metric(y_true, predicted_labels, average='binary', labels=lbls)
        for metric in (f1_score, recall_score, precision_score)
    )
    return (accuracy,) + binary_scores

def ten_cross_validation(data_X, data_Y, params,model_name='svm', folds=10):
    """Run shuffled K-fold cross-validation of an SVC and report per-fold metrics.

    Parameters
    ----------
    data_X, data_Y : numpy arrays
        Features and labels; both are indexed with the integer index arrays from ``KFold``.
    params : dict
        Keyword parameters applied to a fresh ``SVC`` for every fold.
    model_name : str
        Not used by the function body.
    folds : int
        Number of folds (the split is shuffled with ``random_state=1``).

    Returns
    -------
    list
        ``[fold_names, accuracies, f1_scores, recalls, precisions]``, one entry per fold in each.
    """
    fold_names, accuracies, f1_values, recalls, precisions = [], [], [], [], []
    splitter = KFold(folds, random_state=1, shuffle=True)

    for fold_idx, (train_index, test_index) in enumerate(splitter.split(data_X), start=1):
        fold_X_train, fold_X_test = data_X[train_index], data_X[test_index]
        fold_y_train, fold_y_test = data_Y[train_index], data_Y[test_index]

        classifier = SVC().set_params(**params)
        classifier.fit(fold_X_train, fold_y_train)
        accuracy, f1score, recall, precision = calculate_all_acc_parameters(
            classifier.predict(fold_X_test), fold_y_test)

        accuracies.append(accuracy)
        f1_values.append(f1score)
        recalls.append(recall)
        precisions.append(precision)
        fold_names.append('Fold' + str(fold_idx))

        print(f'Fold{fold_idx}:\t Accuracy: ', accuracy)
        print('\t F1 Score', f1score)
        print('\t Recall', recall)
        print('\t Precision', precision)

    # Averages over all folds, printed two per line.
    print('Accuracy', np.mean(accuracies), end = '\t')
    print('F1 Score', np.mean(f1_values))
    print('Recall', np.mean(recalls), end = '\t')
    print('Precision', np.mean(precisions))
    return [fold_names, accuracies, f1_values, recalls, precisions]


def bbc_model(X_train, Y_train, X_test, Y_test):
    """Fit a BalancedBaggingClassifier (``random_state=42``) and print train/test accuracy.

    Returns
    -------
    tuple
        ``(Y_test, predicted_test_labels)``.
    """
    classifier = BalancedBaggingClassifier(random_state=42)
    classifier.fit(X_train, Y_train)
    test_predictions = classifier.predict(X_test)
    train_predictions = classifier.predict(X_train)

    # Raw predictions next to the ground truth, then the two accuracies.
    print(test_predictions)
    print(Y_test)
    train_accuracy = accuracy_score(Y_train, train_predictions)
    print("Train Accuracy :- ", str(train_accuracy))
    test_accuracy = accuracy_score(Y_test, test_predictions)
    print("Test Accuracy :- ", str(test_accuracy))
    return Y_test, test_predictions


def do_dimensionality_reduction(X_train, X_test, Y_train, Y_test, method='kernelpca'):
    """Project the features into a reduced space with the chosen technique.

    Parameters
    ----------
    X_train, X_test : numpy arrays
        Feature matrices.
    Y_train, Y_test : array-like
        Labels. They are returned unchanged except for ``method='tsne'``.
    method : {'kernelpca', 'pca', 'svd', 'tsne'}
        ``'kernelpca'``, ``'pca'`` and ``'svd'`` fit a default-constructed ``KernelPCA``,
        ``PCA`` or ``TruncatedSVD`` on ``X_train`` and transform both matrices.
        ``'tsne'`` embeds train and test together into 3 components and then draws a NEW
        random 90/10 split, so the returned train/test sets are not the caller's original ones.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. The previous version fell through and returned
        ``None``, which only failed at the caller's four-way unpack.
    """
    if method == 'tsne':
        earlier_dimension = X_train.shape[1]
        # TSNE is used through fit_transform only, so train and test are embedded jointly
        # and split again afterwards.
        X = np.concatenate([X_train, X_test], axis=0)
        Y = np.concatenate([Y_train, Y_test], axis=0)
        X_embedded = TSNE(n_components=3).fit_transform(X)
        X_train, X_test, Y_train, Y_test = train_test_split(X_embedded, Y, test_size=.1)
        print("Dimension Reduced from :- ", str(earlier_dimension), " to :- ", str(X_train.shape[1]), " Using :- ",
              str(method))
        return X_train, X_test, Y_train, Y_test

    if method == 'kernelpca':
        reducer = KernelPCA()
    elif method == 'pca':
        reducer = PCA()
    elif method == 'svd':
        reducer = TruncatedSVD()
    else:
        raise ValueError(
            f"Unknown dimensionality reduction method: {method!r} "
            "(expected 'kernelpca', 'pca', 'svd' or 'tsne')")

    # The three linear/kernel reducers share one fit-on-train, transform-both flow.
    earlier_dimension = X_train.shape[1]
    reducer.fit(X_train)
    X_train = reducer.transform(X_train)
    X_test = reducer.transform(X_test)
    print("Dimenion Reduced from :- ", str(earlier_dimension), " to :- ", str(X_train.shape[1]), " Using :- ",
          str(method))
    return X_train, X_test, Y_train, Y_test
    

def use_oversampling_method(X_train, Y_train, method='smote', random_state=None):
    """Oversample the training data with the chosen imbalanced-learn sampler.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels to resample.
    method : {'smote', 'randomsampler', 'adasyn', 'svmsmote'}
        Selects ``SMOTE``, ``RandomOverSampler``, ``ADASYN`` or ``SVMSMOTE``.
    random_state : int or None, optional
        Forwarded to the sampler. The default ``None`` keeps the previous unseeded behaviour;
        pass an integer to make the resampling reproducible.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. The previous version printed 'Wrong Method' and
        returned ``None``, which only failed at the caller's two-way unpack.
    """
    if method == 'smote':
        sampler = SMOTE(random_state=random_state)
    elif method == 'randomsampler':
        sampler = RandomOverSampler(random_state=random_state)
    elif method == 'adasyn':
        sampler = ADASYN(random_state=random_state)
    elif method == "svmsmote":
        sampler = SVMSMOTE(random_state=random_state)
    else:
        raise ValueError(
            f"Wrong Method: {method!r} "
            "(expected 'smote', 'randomsampler', 'adasyn' or 'svmsmote')")

    X_train, Y_train = sampler.fit_resample(X_train, Y_train)
    print("oversampling used :- ", str(method))
    return X_train, Y_train


def add_age_gender_data(path='age and gender.xlsx'):
    """Read the four age/gender sheets from the metadata workbook.

    Parameters
    ----------
    path : str, optional
        Workbook location. Defaults to the file name that was previously hard-coded.

    Returns
    -------
    tuple of DataFrame
        ``(normal_train, abnormal_train, normal_test, abnormal_test)``, taken from sheet
        positions 0, 1, 2 and 3 respectively.
    """
    # Passing a list of sheet positions makes pandas parse the workbook once and return a dict
    # keyed by those positions, instead of re-opening the file for every sheet.
    sheets = pd.read_excel(path, sheet_name=[0, 1, 2, 3])
    return sheets[0], sheets[1], sheets[2], sheets[3]


def add_age_gender_to_data(data, agedata, gender_data):
    """Append each sample's age and gender as two extra trailing columns.

    ``agedata`` and ``gender_data`` are indexed by row position, so they need at least as many
    entries as ``data`` has rows. The result is a new ``np.array`` built from the extended rows.
    """
    extended_rows = [
        list(row) + [agedata[position], gender_data[position]]
        for position, row in enumerate(data)
    ]
    return np.array(extended_rows)

#added function
def custom_scorer(y_true, y_pred, actual_scorer):
    """Call ``actual_scorer(y_true, y_pred)``, returning NaN instead of raising on failure.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels, passed through unchanged.
    actual_scorer : callable
        Metric function taking ``(y_true, y_pred)``.

    Returns
    -------
    float
        The metric value, or ``np.nan`` if the metric raised.

    Notes
    -----
    The NaN fallback is deliberate best-effort so one bad fold does not abort a search.
    The failure is now reported through a ``RuntimeWarning`` rather than silently discarded.
    """
    import warnings

    try:
        return actual_scorer(y_true, y_pred)
    except Exception as exc:
        scorer_name = getattr(actual_scorer, '__name__', repr(actual_scorer))
        warnings.warn(
            f"custom_scorer: {scorer_name} failed with {type(exc).__name__}: {exc}; returning NaN",
            RuntimeWarning,
        )
        return np.nan

In [304]:
# Load the train/test embeddings, their labels (0 = normal, 1 = abnormal) and the per-sample
# age and gender lists into notebook globals.
X_train, Y_train, X_test, Y_test, age_list_train, age_list_test, gender_list_train, gender_list_test = load_all_data()


In [305]:
# len(Y_test)

In [306]:
# End-to-end experiment: optional preprocessing, a baseline SVM, a grid search, 10-fold
# cross-validation with the tuned parameters, and a final tuned SVM run.
# NOTE(review): X_train / Y_train / X_test / Y_test are reassigned in place below, so running
# this cell twice without re-running the loader cell applies oversampling and scaling again to
# data that was already transformed.

# Baseline hyper-parameters for the first SVM run.
# NOTE(review): gamma is presumably unused by a linear kernel - confirm against the SVC docs.
best_parameters = {'kernel': 'linear', 'gamma': 1, 'C': 1}
print(X_train.shape)

# NOTE(review): oversampling runs before scaling, so the scaler below is fitted on the
# oversampled training set, synthetic rows included.
if use_oversampling:
    print('Using Oversampling with method = "svmsmote"')
    X_train, Y_train = use_oversampling_method(X_train, Y_train, method='svmsmote')

print(X_train.shape)

if use_scaler:
    print('Using Scaler')
    X_train, X_test = do_scaling(X_train, X_test)

if use_dimensionality_reduction:
    print('Using Dimensionality Reduction, PCA')
    X_train, X_test, Y_train, Y_test = do_dimensionality_reduction(X_train, X_test, Y_train, Y_test, method='pca')


# NOTE(review): train_array is never read in the visible notebook.
train_array = []

# NOTE(review): add_age_gender_to_data indexes the age/gender lists by row position. After
# oversampling X_train has more rows than those lists (982 -> 1314 in the recorded output), so
# enabling this flag together with use_oversampling would index past the end of the lists.
if use_age_gender_data:
    print('Using Age and Gender data')
    X_train = add_age_gender_to_data(X_train, age_list_train, gender_list_train)
    X_test = add_age_gender_to_data(X_test, age_list_test, gender_list_test)
    print("After Age Data added shape of Train Data ", X_train.shape)
    print("After Age Data added shape of Test Data ", X_test.shape)

# Baseline run; svm_model returns the (Y_test, predicted test labels) pair that is saved below.
result =svm_model(X_train, Y_train, X_test, Y_test, best_parameters)
# print(result)
pd.DataFrame(result).to_csv('imbalanced_svm_Model.csv', index=False)

# NOTE(review): the grid search is fitted on the oversampled training set, so its internal CV
# folds contain rows produced by the oversampler.
best_parameters = grid_search_parameters(X_train, Y_train)
# NOTE(review): the 10-fold validation runs on the oversampled training rows concatenated with
# the test rows, so its folds mix oversampled rows and held-out test samples.
X = np.concatenate([X_train, X_test], axis=0)
Y = np.concatenate([Y_train, Y_test], axis=0)
print('================================= Validation ===================================')
print('Best parameters: ', best_parameters)
pd.DataFrame(np.array(ten_cross_validation(X, Y,best_parameters, folds=10)).T,
             columns=['Fold_Number', 'Accuracy', 'F1_Score', 'Recall', 'Precision']).to_csv('Results.csv', index=False)
print('================================= Results ===================================')
# Final run on the original train/test split using the tuned parameters.
result_refined =svm_model(X_train, Y_train, X_test, Y_test, best_parameters)
pd.DataFrame(result_refined).to_csv('imbalanced_svm_Model_refined.csv', index=False)


(982, 2048)
Using Oversampling with method = "svmsmote"
oversampling used :-  svmsmote
(1314, 2048)
Using Scaler
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
{'kernel': 'linear', 'gamma': 1, 'C': 1}
Train Accuracy :-  1.0
Test Accuracy :-  0.73
Test F1Score :-  0.2285714285714286
Train F1Score :-  1.0
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ..C=0.1, gamma=100, kernel=sigmoid;, score=0.158 total time=   1.6s
[CV 2/5] END ..C=0.1, gamma=100, kernel=sigmoid;, score=0.084 total time=   1.6s
[CV 3/5] END ..C=0.1, gamma=100, kernel=sigmoid;, score=0.038 total time=   1.7s
[CV 4/5] END ..C=0.1, gamma=1

[CV 2/5] END .C=0.01, gamma=100, kernel=sigmoid;, score=0.000 total time=   1.8s
[CV 3/5] END .C=0.01, gamma=100, kernel=sigmoid;, score=0.000 total time=   1.7s
[CV 4/5] END .C=0.01, gamma=100, kernel=sigmoid;, score=0.000 total time=   1.8s
[CV 5/5] END .C=0.01, gamma=100, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 1/5] END ..C=0.01, gamma=10, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 2/5] END ..C=0.01, gamma=10, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 3/5] END ..C=0.01, gamma=10, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 4/5] END ..C=0.01, gamma=10, kernel=sigmoid;, score=0.000 total time=   1.5s
[CV 5/5] END ..C=0.01, gamma=10, kernel=sigmoid;, score=0.000 total time=   1.5s
[CV 1/5] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 2/5] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.000 total time=   1.6s
[CV 3/5] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.000 total time=   1.7s
[CV 4/5] END ...C=0.01, gamm

In [239]:
# from matplotlib import pyplot as plt

# plt.imshow(img_array, cmap='gray')
# plt.show()
# recall_score?
# Interactive scratch: IPython help lookup (trailing `?`) for f1_score; not part of the pipeline.
f1_score?

In [278]:
# Interactive scratch: IPython help lookup (trailing `?`) for RandomizedSearchCV; not part of the pipeline.
RandomizedSearchCV?

In [292]:
# RandomizedSearchCV().best_params_

In [200]:
# Element-wise difference between the tuned and baseline test predictions (index 1 of each
# svm_model result): 0 = same prediction, +1 = only the tuned model predicted 1,
# -1 = only the baseline predicted 1.
# NOTE(review): this cell's execution count (200) is lower than that of the cell defining
# result / result_refined (306), so the output shown below may come from an earlier run.
result_refined[1] - result[1]

array([ 0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0, -1,  0,  0,  0, -1,  0,  0,
        0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])