# Um estudo sobre aprendizado ativo

Nesse noteboook implementei minha própria classe de um algoritmo de aprendizado ativo.

Este algoritmo, *MyActLearning*, usa como base o Logistic Regression(LR) e seu treinamento é feito com 10% dos dados de disponíveis para essa função.

Usei o BreastCancer dataset para uma primeira comparação desse algorimo com o LR. Os resultados foram interessantes.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml, load_breast_cancer

from sklearn.manifold  import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


from sklearn.model_selection import train_test_split,  KFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, f1_score,confusion_matrix

#Logistic Regression:
from sklearn.linear_model import LogisticRegression
#Support Vector Machines (SVM):
from sklearn.svm import SVC
#Random Forest:
from sklearn.ensemble import RandomForestClassifier
#K-Nearest Neighbors (KNN):
from sklearn.neighbors import KNeighborsClassifier

#Decision Tree:
from sklearn.tree import DecisionTreeClassifier
#Gradient Boosting:
from sklearn.ensemble import GradientBoostingClassifier
#Gaussian Naive Bayes:
from sklearn.naive_bayes import GaussianNB
#AdaBoost:
from sklearn.ensemble import AdaBoostClassifier
#ICA
from sklearn.decomposition import FastICA

import math



### 1. Preparação dos dados

In [2]:
X = load_breast_cancer().data
y = load_breast_cancer().target

In [3]:
X_train, X_test, y_train, y_test,  = train_test_split(X, y, random_state=42, test_size=0.2)


In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### 2. Declaração da classe do algoritmo

In [5]:
class MyActLearning():

    def __init__(self, n_iterations=5, k_samples=10):
        self.n_iterations = n_iterations
        self.k_samples = k_samples
        self.classifier = LogisticRegression(random_state=42, max_iter=1000)
    
    def fit(self, X_pool, y_pool):
        test_size_aux = X_pool.shape[0] - (0+1)*10
        X_train, X_val, y_train, y_val = train_test_split(X_pool, y_pool, test_size=test_size_aux, random_state=42)

        for i in range(self.n_iterations):
            model = self.classifier.fit(X_train, y_train)
            y_pred = self.classifier.predict(X_val)

            # Obter as probabilidades das classes para o conjunto de teste
            probabilities = model.predict_proba(X_val)

            # Calcular a diferença entre as probabilidades das classes
            differences = np.abs(probabilities[:, 0] - probabilities[:, 1])

            # Ordenar os exemplos pelo valor absoluto da diferença (do menor para o maior)
            sorted_indices = np.argsort(differences)

            # Exibir os 10 exemplos mais incertos
            most_uncertain_indices = sorted_indices[:self.k_samples]

            X_train = np.concatenate((X_train, X_val[most_uncertain_indices]))
            y_train = np.concatenate((y_train, y_val[most_uncertain_indices]))

            print(f'Shape do X_train: {X_train.shape[0]}')

            X_val = np.delete(X_val, most_uncertain_indices, axis=0)
            y_val = np.delete(y_val, most_uncertain_indices)
    
    def get_params(self, deep=True):
        # Retorna os parâmetros do estimador em um dicionário
        return {"n_iterations": self.n_iterations, "k_samples": self.k_samples}
    
    def predict(self, X):
        y_pred = self.classifier.predict(X)
        return y_pred

### 3. Treinamentos e predições

In [6]:
k_samples = 10
n_iterations = int((X_train.shape[0]*0.1)/k_samples)
my_model = MyActLearning(n_iterations = n_iterations)
my_model.fit(X_train, y_train)

Shape do X_train: 20
Shape do X_train: 30
Shape do X_train: 40
Shape do X_train: 50


In [7]:
y_pred = my_model.predict(X_test)
acc = round(accuracy_score(y_pred, y_test),3)*100

In [8]:
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
y_pred2 = lr.predict(X_test)
acc2 = round(accuracy_score(y_pred2, y_test), 3)*100

In [9]:
print('Comparação dos dois valores calculados:')
print(f'LR: {acc2}\nMyActLearning: {acc}')

Comparação dos dois valores calculados:
LR: 98.2
MyActLearning: 98.2


In [10]:
pca = PCA(n_components=2)
pca.fit(X_train)
Xtrain_pca = pca.transform(X_train)

pca.fit(X_test)
Xtest_pca = pca.transform(X_test)

In [11]:
my_model.fit(Xtrain_pca, y_train)
y_pred = my_model.predict(Xtest_pca)
acc = round(accuracy_score(y_pred, y_test),3)*100

lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(Xtrain_pca, y_train)
y_pred2 = lr.predict(Xtest_pca)
acc2 = round(accuracy_score(y_pred2, y_test), 3)*100

print('Comparação dos dois valores calculados para o PCA:')
print(f'LR: {acc2}\nMyActLearning: {acc}')

Shape do X_train: 20
Shape do X_train: 30
Shape do X_train: 40
Shape do X_train: 50
Comparação dos dois valores calculados para o PCA:
LR: 96.5
MyActLearning: 96.5


### 4. Comentários finais

Os resultados mostraram-se equivalentes para o dataset aplicado. Este fato, provavelmente, deve-se a simplicidade dos dados. Testes posteriores devem ser realizados em dados mais complexos para analisar de uma forma mais rigorosa a eficiência do algoritmo *MyActLearning*.

