### GENERATING FEATURES

In [None]:
!pip3 install scikit-image

In [None]:
import library
import color_features
import texture_features
import glcm_features
import hair_removal
import pandas as pd
import numpy as np
import cv2
import os

# Extract colour, GLCM and LBP features for every training image and write
# them to a CSV file one row at a time, so rows already written are kept on
# disk if the run stops partway.

# --- configuration --------------------------------------------------------
TRAIN_DIR = "/home/name/Desktop/CAD/challenge1/train"
OUTPUT_DIR = os.path.join('/home', 'name', 'Desktop', 'CAD')
OUTPUT_FILE = 'features_train_bh_3000.csv'

# GLCM settings are the same for every image, so define them once.
angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
distances = [1]
colorspaces = ['rgb', 'hsv', 'lab', 'ycc', 'gray']

# NOTE(review): amount=0 presumably means "all samples" - confirm in library.get_sample.
samples = library.get_sample(path=TRAIN_DIR, amount=0)

# `flag` is True only for the first call to library.writeFeatures.
# NOTE(review): presumably it tells the writer to create the file / emit the
# header row - confirm against library.writeFeatures.
flag = True
count = 0
for sample in samples:
    print('count ', count)
    count += 1

    img = cv2.imread(sample)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        # instead of raising; fail here with a message that names the file.
        raise FileNotFoundError(f"cv2.imread could not read image: {sample}")
    output_bh = library.hair_removal_BH(img)

    # Start a fresh record per image so no value can leak between samples.
    # Label is 0 when the path contains 'nevus', 1 otherwise.
    dictF = {
        'name': sample,
        'label': (0 if 'nevus' in sample else 1),
    }

    # color features
    dictF.update(color_features.extract_color_features(output_bh))

    # glcm features, one set per colour space
    for cs in colorspaces:
        dictF.update(glcm_features.get_glcm(output_bh, angles, distances, cs))

    # lbp features
    dictF.update(texture_features.extract_lbp(output_bh, 1, 8))

    # orb features (disabled)
    # orb = texture_features.extract_orb(output_bh, 64)
    # dictF.update(orb)

    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # directly from the record instead.
    features = pd.DataFrame([dictF])

    library.writeFeatures(features, flag, OUTPUT_DIR, OUTPUT_FILE)
    flag = False

In [None]:
import os
import library
import pandas as pd
from sklearn import tree
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif    

#
# Benchmark several classifiers behind the same preprocessing pipeline
# (standard scaling followed by RFE with a random-forest estimator).

# Names of the classifiers to evaluate, in run order.
classifiers = ["rf", "tree", "svm", "adaboost", "gradboost", "histgradboost", "knn", "lda"]

# Directory holding the pre-computed feature CSVs.
FEATURES_DIR = os.path.join('/home', 'emily', 'Desktop', 'CAD', 'MelanomaChallenge', 'features')

# Alternative inputs kept for reference:
#train = pd.read_csv(os.path.join('/home','emily','Desktop','CAD','features_train_bh_3000.csv'))
#test = pd.read_csv(os.path.join('/home','emily','Desktop','CAD','features_test_bh_3000.csv'))

train1 = pd.read_csv(os.path.join(FEATURES_DIR, 'featuresCh1E_0.csv'))
train2 = pd.read_csv(os.path.join(FEATURES_DIR, 'featuresCh1E_1.csv'))
train3 = pd.read_csv(os.path.join(FEATURES_DIR, 'featuresCh1E_2.csv'))
test = pd.read_csv(os.path.join(FEATURES_DIR, 'featuresCh1E_3.csv'))

train = pd.concat([train1, train2, train3])
print(train.shape)

# 'label' is the target; 'name' is dropped and every remaining column is
# used as a feature.
y = train['label']
X = train.drop(columns=['label', 'name'])

y_test = test['label']
X_test = test.drop(columns=['label', 'name'])

# Carve 20% of the held-out file off as a validation set; the remaining 80%
# stays as the test set passed to library.fit_report.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=1)

# preprocessing options
#('selectFromModel', SelectFromModel(RandomForestClassifier(random_state=42, n_jobs = -1)))
#('selector rfe', RFE(RandomForestClassifier(random_state=42, n_jobs = -1))),
#('reduce_dims', PCA(n_components=150)),   # would need: from sklearn.decomposition import PCA
#('mutual_info_classif', SelectKBest(mutual_info_classif, k=100)),

# Classifiers whose library helper returns (estimator, best_params).
# The lambdas defer the search until the classifier's turn in the loop.
# NOTE(review): the helpers receive the raw validation features, while the
# final pipeline scales and selects features first - confirm the parameters
# found this way are meant to transfer.
tuned_searches = {
    "svm": lambda: library.SVC_linear(X_val, y_val, cv=2),
    "rf": lambda: library.RandomForest(X_val, y_val, cv=2),
    "adaboost": lambda: library.AdaBoost(X_val, y_val),
    "gradboost": lambda: library.GradientBoosting(X_val, y_val),
    "knn": lambda: library.knn(X_val, y_val),
}

# Classifiers used with their default hyper-parameters.
default_models = {
    "tree": tree.DecisionTreeClassifier,
    "histgradboost": HistGradientBoostingClassifier,
    "lda": LinearDiscriminantAnalysis,
}

for classifier in classifiers:

    if classifier in tuned_searches:
        clf, best_params = tuned_searches[classifier]()
        clf.set_params(**best_params)
    elif classifier in default_models:
        clf = default_models[classifier]()
    else:
        # Without this guard an unknown name would silently reuse the
        # estimator left over from the previous iteration.
        raise ValueError(f"Unknown classifier name: {classifier!r}")

    # Banner, e.g. "### RF ###"
    print(f"### {classifier.upper()} ###")

    # preprocessing steps followed by the classifier
    pipe = [('scale', StandardScaler()),
            ('selector rfe', RFE(RandomForestClassifier(random_state=42, n_jobs=-1))),
            ('clf', clf)]

    steps = Pipeline(pipe)

    # pipeline shape
    print("current pipeline")
    print(steps)

    # Fit on the training folds and report on the held-out test split.
    library.fit_report(steps, X, y, X_test, y_test)
    

(2630, 423)
Searching for best hyperparameters
The best parameters for rf are {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 900} with an accuracy of 0.6294
### RF ###
current pipeline
Pipeline(steps=[('scale', StandardScaler()),
                ('selector rfe',
                 RFE(estimator=RandomForestClassifier(n_jobs=-1,
                                                      random_state=42))),
                ('clf',
                 RandomForestClassifier(max_depth=10, n_estimators=900,
                                        n_jobs=-1, random_state=42))])
###############
 ### Report ###
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       323
           1       0.80      0.81      0.80       355

    accuracy                           0.79       678
   macro avg       0.79      0.79      0.79       678
weighted avg       0.79      0.79      0.79       678

 ### score ###
0.7920353982300885
 ### accuracy ###
0.7920353982

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.47640117994100295
 ### accuracy ###
0.47640117994100295
### f1_score ###
0.0
### confusion matrix ###
[[323   0]
 [355   0]]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The best parameters for ab are {'learning_rate': 1, 'n_estimators': 900} with an accuracy of 0.6471
### ADABOOST ###
current pipeline
Pipeline(steps=[('scale', StandardScaler()),
                ('selector rfe',
                 RFE(estimator=RandomForestClassifier(n_jobs=-1,
                                                      random_state=42))),
                ('clf',
                 AdaBoostClassifier(learning_rate=1, n_estimators=900,
                                    random_state=42))])
###############
 ### Report ###
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       323
           1       0.79      0.77      0.78       355

    accuracy                           0.77       678
   macro avg       0.77      0.77      0.77       678
weighted avg 