__PREPARING DATASET__

In [34]:
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [35]:
FEATURES_DIR = 'F:/src/features/'
# coocc_features_fp = os.path.join(FEATURES_DIR, 'f_coocc_288.txt')
aec_features_fp = os.path.join(FEATURES_DIR, 'f_aec3d_68_256.txt')

In [36]:
#проверяем пересечение множеств id (методы могли упасть на разных данных)
# coocc_features = np.loadtxt(coocc_features_fp,delimiter=',', dtype=str)
# print(coocc_features.shape)
# coocc_ids = set(coocc_features[:,2])

aec_features = np.loadtxt(aec_features_fp,delimiter=',', dtype=str)
print(aec_features.shape)
# aec_ids = set(aec_features[:,2])

#assert coocc_ids == aec_ids, 'patients sets differs!'

(1093, 259)


In [29]:
gender_age = [gender+age for gender, age in zip(aec_features[:,0], aec_features[:,1])]
u,c = np.unique(gender_age, return_counts=True)
hist = np.histogram(c, bins=u)
print(u,c)
GROUP_SIZE = min(c)
print('minimal age-sex group size:', GROUP_SIZE)

['F18' 'F19' 'F20' 'F21' 'F22' 'F23' 'F38' 'F39' 'F40' 'F41' 'F42' 'F43'
 'M18' 'M19' 'M20' 'M21' 'M22' 'M23' 'M38' 'M39' 'M40' 'M41' 'M42' 'M43'] [29 40 37 55 31 45 45 47 42 35 38 44 35 42 49 58 27 72 59 62 71 25 59 46]
minimal age-sex group size: 25


In [37]:
fem_idx = np.argwhere(aec_features[:,0] == 'F')
m_idx = np.argwhere(aec_features[:,0] == 'M')
females = np.squeeze(aec_features[fem_idx,:])
males = np.squeeze(aec_features[m_idx,:])
print('FEMALE:\n', females.shape)
print('\nMALE:\n', males.shape)

N = int(GROUP_SIZE*0.9)
print('train group size', N)
n = GROUP_SIZE - N
train_idx = []
test_idx = []
for age in ['18', '19', '20', '21', '38', '39', '40', '41']:
    for arr in [females, males]:
        tr_f = np.squeeze(arr[np.argwhere(arr[:,1] == age)])
        print(age, tr_f.shape)
        train_idx += list(tr_f[:N, 2])
        test_idx += list(tr_f[N:N+n, 2])
print('train idx', len(train_idx))
print('test idx', len(test_idx))

FEMALE:
 (488, 259)

MALE:
 (605, 259)
train group size 22
18 (29, 259)
18 (35, 259)
19 (40, 259)
19 (42, 259)
20 (37, 259)
20 (49, 259)
21 (55, 259)
21 (58, 259)
38 (45, 259)
38 (59, 259)
39 (47, 259)
39 (62, 259)
40 (42, 259)
40 (71, 259)
41 (35, 259)
41 (25, 259)
train idx 352
test idx 48


In [38]:
def get_experiment_data(features, train_idx, test_idx):
    X_train = np.zeros(shape=(len(train_idx), features.shape[1] - 3))
    print('train shape', X_train.shape)
    y_train = np.concatenate((np.zeros(len(train_idx)//2), np.ones(len(train_idx)//2)))
    for i, pid in enumerate(train_idx):
        k = np.argwhere(features[:,2] == pid)
        X_train[i, :] = np.squeeze(features[k, 3:])
        
    X_test = np.zeros(shape=(len(test_idx), features.shape[1] - 3))
    for i, pid in enumerate(test_idx):
        k = np.argwhere(features[:,2] == pid)
        X_test[i, :] = np.squeeze(features[k, 3:])
        
    print('test shape', X_test.shape)
    y_test = np.concatenate((np.zeros(len(test_idx)//2), np.ones(len(test_idx)//2)))
    
    idx = np.random.permutation(len(y_train))
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    idx = np.random.permutation(len(y_test))
    X_test = X_test[idx]
    y_test = y_test[idx]
    
    return X_train, y_train, X_test, y_test

In [39]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
    return results['mean_test_score'][np.flatnonzero(results['rank_test_score'] == 1)[0]]


def optimize(X, y, clf, param_dist, name, n_iter_search = 20):
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                                       scoring='accuracy', n_iter=n_iter_search, cv=5)
    random_search.fit(X, y)
    best_sol = report(random_search.cv_results_)
    print(name, best_sol)
    

def make_experiment(features, train_idx, test_idx):
    X_train, y_train, X_test, y_test = get_experiment_data(features, train_idx, test_idx)

    #KNN
    clf = KNeighborsClassifier()
    param_dist = {"n_neighbors": np.arange(1, 21, 1)}
    optimize(X_train, y_train, clf, param_dist, 'KNN')
    
    #Random forest
    clf = RandomForestClassifier()
    param_dist = {"n_estimators": np.arange(1, 50, 5),
              "max_depth": np.arange(1, 3, 1)}
    optimize(X_train, y_train, clf, param_dist, 'RF')
    
    #AdaBoost
    clf = AdaBoostClassifier()
    param_dist = {"n_estimators": np.arange(1, 200, 5),
                "learning_rate": np.arange(0.1, 1.1, 0.2)}
    optimize(X_train, y_train, clf, param_dist, 'Ada', 50)
        
    #SVM
    clf = SVC()
    param_dist = {"kernel": ['linear', 'poly', 'rbf'],
              "C": [10**(_) for _ in np.arange(-5, 6, 1, dtype=float)],
              "degree": np.arange(1, 7, 1)}
    optimize(X_train, y_train, clf, param_dist, 'SVM', 30)
    

In [40]:
print("\nWORKING ON AEC FEATURES\n")
make_experiment(aec_features, train_idx, test_idx)


WORKING ON AEC FEATURES

train shape (352, 256)
test shape (48, 256)
Model with rank: 1
Mean validation score: 0.645 (std: 0.058)
Parameters: {'n_neighbors': 14}

Model with rank: 2
Mean validation score: 0.645 (std: 0.066)
Parameters: {'n_neighbors': 10}

Model with rank: 3
Mean validation score: 0.642 (std: 0.073)
Parameters: {'n_neighbors': 12}

KNN 0.6448289738430584
Model with rank: 1
Mean validation score: 0.684 (std: 0.075)
Parameters: {'n_estimators': 46, 'max_depth': 2}

Model with rank: 2
Mean validation score: 0.679 (std: 0.062)
Parameters: {'n_estimators': 46, 'max_depth': 1}

Model with rank: 3
Mean validation score: 0.676 (std: 0.072)
Parameters: {'n_estimators': 31, 'max_depth': 2}

RF 0.6843863179074446
Model with rank: 1
Mean validation score: 0.719 (std: 0.063)
Parameters: {'n_estimators': 161, 'learning_rate': 0.5000000000000001}

Model with rank: 2
Mean validation score: 0.710 (std: 0.051)
Parameters: {'n_estimators': 176, 'learning_rate': 0.5000000000000001}

Mode

In [None]:
print("\nWORKING ON CO-OCCURENCES FEATURES\n")
make_experiment(coocc_features, train_idx, test_idx)