In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import os

import sys
import warnings

# Mute every warning, but only when no warning options were supplied
# (sys.warnoptions is empty), so explicitly configured filters still apply.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"  # Inherited by subprocesses as well
    warnings.simplefilter("ignore")

def to_str(clf, ext, mag, pca_ratio, report):
    """Format one classifier's metrics as a single ';'-separated results row.

    Args:
        clf: Classifier label written in the first column.
        ext: Feature-extractor name.
        mag: Magnification identifier.
        pca_ratio: PCA ratio used for the run.
        report: Nested mapping with 'benign', 'malignant', 'macro avg' and
            'weighted avg' entries, each holding 'precision', 'recall' and
            'f1-score' values.

    Returns:
        The row text, terminated by a newline.

    Raises:
        KeyError: If ``report`` lacks one of the sections or metrics above.
    """
    sections = ('benign', 'malignant', 'macro avg', 'weighted avg')
    pieces = [f"{clf}; {ext}; {mag}; {pca_ratio};"]
    for section in sections:
        scores = report[section]
        pieces.append(f"{scores['precision']}; {scores['recall']}; {scores['f1-score']};")
    pieces.append("\n")
    return "".join(pieces)

def save(value, filename):
    """Pickle ``value`` to ``./dump/<filename>``, creating the directory if needed.

    Args:
        value: Any picklable object (fitted models and prediction arrays here).
        filename: Name of the file to write inside the dump directory.
    """
    dump_directory = './dump'
    os.makedirs(dump_directory, exist_ok=True)
    path = os.path.join(dump_directory, filename)
    # The context manager closes (and therefore flushes) the handle even if
    # pickling raises, instead of leaving an open file object behind.
    with open(path, 'wb') as handle:
        pickle.dump(value, handle)

def load_data(ext, mag, data_type='train', base_dir='/kaggle/input/ic-features/features'):
    """Load the feature matrix and labels for one extractor/magnification split.

    Files are read from ``<base_dir>/<ext>/<ext>_<mag>_X_<data_type>.npy`` and
    ``<base_dir>/<ext>/<ext>_<mag>_y_<data_type>.npy``.

    Args:
        ext: Feature-extractor name; also the sub-directory name.
        mag: Magnification identifier used in the file names.
        data_type: Split suffix used in the file names (default ``'train'``).
        base_dir: Root directory holding one sub-directory per extractor.
            Defaults to the Kaggle input path this notebook was written for,
            so existing callers are unaffected.

    Returns:
        ``(X_data, y_data)`` as returned by ``np.load``, or ``(None, None)``
        when either file does not exist (the error is printed, not raised).
    """
    feature_dir = os.path.join(base_dir, ext)
    file_path_x = os.path.join(feature_dir, f'{ext}_{mag}_X_{data_type}.npy')
    file_path_y = os.path.join(feature_dir, f'{ext}_{mag}_y_{data_type}.npy')

    try:
        X_data = np.load(file_path_x)
        y_data = np.load(file_path_y)
        return X_data, y_data
    except FileNotFoundError as e:
        # Callers must check for None: a missing file is reported, not raised.
        print(f"Error loading data for {ext}_{mag}_{data_type}: {e}")
        return None, None


def train_and_save_model(model, param_grid, X_train, y_train, X_test, y_test, ext, mag, pca_ratio, clf_name):
    """Tune ``model`` with a 5-fold grid search, predict the test set, pickle both.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid to search.
        X_train, y_train: Data the grid search is fitted on.
        X_test: Data the fitted search predicts.
        y_test: Not used by this function; kept as part of the signature.
        ext, mag, pca_ratio, clf_name: Combined into the pickle file names.

    Returns:
        ``(search, predictions)``: the fitted ``GridSearchCV`` object and its
        predictions for ``X_test``.
    """
    search = GridSearchCV(model, param_grid, cv=5)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)

    # Persist the fitted search first, then the predictions, through save().
    artifacts = (
        (search, f"{clf_name}_grid_{ext}_{mag}_{pca_ratio}.pkl"),
        (predictions, f"ypre_{clf_name}_{ext}_{mag}_{pca_ratio}.pkl"),
    )
    for artifact, target_name in artifacts:
        save(artifact, target_name)

    return search, predictions

def main():
    """Run the whole experiment grid and write one results row per classifier.

    For every feature extractor and magnification, the train/test features are
    loaded with ``load_data`` and standardised. SVM, MLP and random-forest
    classifiers are tuned through ``train_and_save_model``, and their best
    estimators are combined in a hard-voting and a soft-voting ensemble. Each
    fitted model and its test predictions are pickled via ``save``; the metrics
    of every classifier are written to ``./results.csv`` (overwritten per run).

    Datasets whose files cannot be loaded are skipped.
    """
    with open('./results.csv', 'w') as file:
        file.write("clf;ext;mag;pca_ratio;benign-precision; benign-recall; benign-f1-score;malignant-precision;malignant-recall;malignant-f1-score;macro avg-precision;macro avg-recall;macro avg-f1-score;weighted avg-precision;weighted avg-recall;weighted avg-f1-score;\n")

        for ext in ['lbp', 'pftas', 'vgg', 'resnet']:
            for mag in ['40', '100', '200', '400']:
                X_train, y_train = load_data(ext, mag, 'train')
                X_test, y_test = load_data(ext, mag, 'test')
                # load_data signals a missing file by returning (None, None)
                # instead of raising, so the result has to be checked here.
                if X_train is None or X_test is None:
                    print(f"Skipping {ext}_{mag}: train or test data could not be loaded")
                    continue

                # Standardise once per dataset; the scaling does not depend on
                # the PCA ratio, so it stays outside the ratio loop.
                ss = StandardScaler()
                X_train_scaled = ss.fit_transform(X_train)
                X_test_scaled = ss.transform(X_test)

                #for pca_ratio in [1, 0.75, 0.5]:
                for pca_ratio in [1]:
                    # Start every ratio from the scaled arrays. Overwriting
                    # X_train/X_test in place would feed the previous ratio's
                    # projected output into the next iteration.
                    X_train_run, X_test_run = X_train_scaled, X_test_scaled
                    if pca_ratio != 1:
                        pca = PCA(n_components=int(X_train_scaled.shape[1] * pca_ratio))
                        X_train_run = pca.fit_transform(X_train_scaled)
                        X_test_run = pca.transform(X_test_scaled)

                    # SVM
                    print(f"Training and save model to clf SVM feature {ext} mag {mag} pca ratio {pca_ratio}")
                    param_grid_svm = {
                        'kernel': ['linear', 'rbf', 'poly'],
                        'C': [0.0001, 0.001, 1, 100, 1000],
                        'gamma': ['scale', 'auto']
                    }
                    # probability=True is required later by the soft-voting ensemble.
                    svm_model, y_pred_svm = train_and_save_model(svm.SVC(probability=True), param_grid_svm, X_train_run, y_train, X_test_run, y_test, ext, mag, pca_ratio, 'SVM')
                    file.write(to_str('svm', ext, mag, pca_ratio, metrics.classification_report(y_test, y_pred_svm, digits=5, output_dict=True)))

                    # MLP
                    print(f"Training and save model to clf MLP feature {ext} mag {mag} pca ratio {pca_ratio}")
                    param_grid_mlp = {
                        'hidden_layer_sizes': [(10,), (50,), (100,)],
                        'activation': ['relu', 'tanh', 'logistic'],
                        'solver': ['sgd', 'adam'],
                        'alpha': [0.0001, 0.001, 0.01],
                    }
                    mlp_model, y_pred_mlp = train_and_save_model(MLPClassifier(max_iter=200), param_grid_mlp, X_train_run, y_train, X_test_run, y_test, ext, mag, pca_ratio, 'MLP')
                    file.write(to_str('mlp', ext, mag, pca_ratio, metrics.classification_report(y_test, y_pred_mlp, digits=5, output_dict=True)))

                    # RFC
                    print(f"Training and save model to clf RFC feature {ext} mag {mag} pca ratio {pca_ratio}")
                    param_grid_rfc = {
                        'n_estimators': [50, 100, 200],
                        'max_depth': [None, 5, 10, 20],
                        'min_samples_split': [2, 5, 10],
                        'min_samples_leaf': [1, 2, 4],
                    }
                    rfc_model, y_pred_rfc = train_and_save_model(RandomForestClassifier(), param_grid_rfc, X_train_run, y_train, X_test_run, y_test, ext, mag, pca_ratio, 'RFC')
                    file.write(to_str('rfc', ext, mag, pca_ratio, metrics.classification_report(y_test, y_pred_rfc, digits=5, output_dict=True)))

                    # MAIORIA (majority): hard voting over the three tuned estimators
                    print(f"Training and save model to clf majority_model feature {ext} mag {mag} pca ratio {pca_ratio}")
                    majority_model = VotingClassifier(estimators=[
                        ('rfc', rfc_model.best_estimator_),
                        ('svm', svm_model.best_estimator_),
                        ('mlp', mlp_model.best_estimator_)
                    ], voting='hard', n_jobs=-1)
                    majority_model.fit(X_train_run, y_train)
                    y_pred_majority = majority_model.predict(X_test_run)
                    save(majority_model, f"MAIORIA_grid_{ext}_{mag}_{pca_ratio}.pkl")
                    save(y_pred_majority, f"ypre_MAIORIA_{ext}_{mag}_{pca_ratio}.pkl")
                    file.write(to_str('maioria', ext, mag, pca_ratio, metrics.classification_report(y_test, y_pred_majority, digits=5, output_dict=True)))

                    # MAXIMO: soft voting over the same three estimators
                    print(f"Training and save model to clf maximo_model feature {ext} mag {mag} pca ratio {pca_ratio}")
                    maximo_model = VotingClassifier(estimators=[
                        ('rfc', rfc_model.best_estimator_),
                        ('svm', svm_model.best_estimator_),
                        ('mlp', mlp_model.best_estimator_)
                    ], voting='soft', n_jobs=-1)
                    maximo_model.fit(X_train_run, y_train)
                    y_pred_maximo = maximo_model.predict(X_test_run)
                    save(maximo_model, f"MAXIMO_grid_{ext}_{mag}_{pca_ratio}.pkl")
                    save(y_pred_maximo, f"ypre_MAXIMO_{ext}_{mag}_{pca_ratio}.pkl")
                    file.write(to_str('maximo', ext, mag, pca_ratio, metrics.classification_report(y_test, y_pred_maximo, digits=5, output_dict=True)))

                    # Flush so finished rows are on disk if a later, long
                    # training step is interrupted.
                    file.flush()

# Entry point: start the full experiment run only when this code executes as
# the main module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Training and save model to clf SVM feature lbp mag 40 pca ratio 1
Training and save model to clf MLP feature lbp mag 40 pca ratio 1
Training and save model to clf RFC feature lbp mag 40 pca ratio 1
Training and save model to clf majority_model feature lbp mag 40 pca ratio 1
Training and save model to clf maximo_model feature lbp mag 40 pca ratio 1
Training and save model to clf SVM feature lbp mag 100 pca ratio 1
Training and save model to clf MLP feature lbp mag 100 pca ratio 1
Training and save model to clf RFC feature lbp mag 100 pca ratio 1
Training and save model to clf majority_model feature lbp mag 100 pca ratio 1
Training and save model to clf maximo_model feature lbp mag 100 pca ratio 1
Training and save model to clf SVM feature lbp mag 200 pca ratio 1
Training and save model to clf MLP feature lbp mag 200 pca ratio 1
Training and save model to clf RFC feature lbp mag 200 pca ratio 1
Training and save model to clf majority_model feature lbp mag 200 pca ratio 1
Training and sav

### Normalização

In [2]:
# # Normalizando  /kaggle/input/ic-features/features/lbp/lbp_40_test_X.npy 
                  
# ss = StandardScaler()
# X_train = ss.fit_transform(X_train)
# X_test = ss.transform(X_test)

### SVM com Grid Search

In [3]:
# from sklearn import svm

# param_grid = {
#     'kernel': ['linear', 'rbf', 'poly'],
#     'C': [0.0001, 0.001, 1, 100, 1000],
#     'gamma': ['scale', 'auto']
# }

# svm_grid = GridSearchCV(svm.SVC(probability=True), param_grid, cv=5)
# svm_grid.fit(X_train, y_train)

# with open(f'./{ext}_{mag}_svm.pickle', 'wb') as file:
#     pickle.dump(svm_grid, file)

In [4]:
# with open(f'./{ext}_{mag}_svm.pickle', 'rb') as file:
#     svm_grid = pickle.load(file)

# print("Melhores Parâmetros:", svm_grid.best_params_, '\n')

# y_pred = svm_grid.predict(X_test)
# print(metrics.classification_report(y_test, y_pred, digits=5))


# # --------
# with open(f'./{ext}_{mag}_pca{pca_ratio}.txt', 'a+') as file:
#     file.write('SVM\n')
#     file.write(str(svm_grid.best_params_) + '\n')
#     file.write(metrics.classification_report(y_test, y_pred, digits=5) + '\n')

### MLP com Grid Search

In [5]:
# from sklearn.neural_network import MLPClassifier

# parameters = {
#     'hidden_layer_sizes': [(10,), (50,), (100,)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'solver': ['sgd', 'adam'],
#     'alpha': [0.0001, 0.001, 0.01],
# }

# mlp_grid = GridSearchCV(MLPClassifier(max_iter=200), parameters, n_jobs=-1, cv=5)
# mlp_grid.fit(X_train, y_train)

# with open(f'./{ext}_{mag}_mlp.pickle', 'wb') as file:
#     pickle.dump(mlp_grid, file)

In [6]:
# with open(f'./{ext}_{mag}_mlp.pickle', 'rb') as file:
#     mlp_grid = pickle.load(file)

# print("Melhores Parâmetros:", mlp_grid.best_params_, '\n')

# y_pred = mlp_grid.predict(X_test)
# print(metrics.classification_report(y_test, y_pred, digits=5))

# # --------
# with open(f'./{ext}_{mag}_pca{pca_ratio}.txt', 'a+') as file:
#     file.write('MLP\n')
#     file.write(str(mlp_grid.best_params_) + '\n')
#     file.write(metrics.classification_report(y_test, y_pred, digits=5) + '\n')

### RFC com Grid Search

In [7]:
# from sklearn.ensemble import RandomForestClassifier

# parameters = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 5, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
# }

# grid = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, cv=5)
# grid.fit(X_train, y_train)

# with open(f'./{ext}_{mag}_rfc.pickle', 'wb') as file:
#     pickle.dump(grid, file)

In [8]:
# with open(f'./{ext}_{mag}_rfc.pickle', 'rb') as file:
#     rfc_grid = pickle.load(file)

# print("Melhores Parâmetros:", rfc_grid.best_params_, '\n')

# y_pred = rfc_grid.predict(X_test)
# print(metrics.classification_report(y_test, y_pred, digits=5))

# # --------
# with open(f'./{ext}_{mag}_pca{pca_ratio}.txt', 'a+') as file:
#     file.write('RFC\n')
#     file.write(str(rfc_grid.best_params_) + '\n')
#     file.write(metrics.classification_report(y_test, y_pred, digits=5) + '\n')

### Combinação

In [9]:
# from sklearn.ensemble import VotingClassifier

# #MAIORIA

# # voting = hard -> uses predicted class labels for majority rule voting
# clf = VotingClassifier(estimators=[('rfc', rfc_grid.best_estimator_), ('svm', svm_grid.best_estimator_), ('mlp', mlp_grid.best_estimator_)], voting='hard', n_jobs=-1)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(metrics.classification_report(y_test, y_pred, digits=5))

# # --------
# with open(f'./{ext}_{mag}_pca{pca_ratio}.txt', 'a+') as file:
#     file.write('MAIORIA\n')
#     file.write(metrics.classification_report(y_test, y_pred, digits=5) + '\n')

In [10]:
# #MAXIMO

# # voting = soft -> predicts the class label from the argmax of the summed predicted probabilities
# clf = VotingClassifier(estimators=[('rfc', rfc_grid.best_estimator_), ('svm', svm_grid.best_estimator_), ('mlp', mlp_grid.best_estimator_)], voting='soft', n_jobs=-1)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(metrics.classification_report(y_test, y_pred, digits=5))

# # --------
# with open(f'./{ext}_{mag}_pca{pca_ratio}.txt', 'a+') as file:
#     file.write('MAXIMO\n')
#     file.write(metrics.classification_report(y_test, y_pred, digits=5) + '\n')