In [2]:
### PYMOO
from pymoo.core.problem import Problem, ElementwiseProblem, StarmapParallelization
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.operators.crossover.pntx import TwoPointCrossover
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.util.display.multi import MultiObjectiveOutput
from pymoo.core.sampling import Sampling
from pymoo.core.mutation import Mutation
from pymoo.indicators.hv import HV
from pymoo.termination.max_gen import MaximumGenerationTermination
from pymoo.termination import get_termination

#multiprocessamento e outros
from tqdm.notebook import trange, tqdm
import multiprocessing, requests, sys, time, itertools, dill, random, os, pickle, copy

#Pandas, SKLearn e etc.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics, svm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score

#Goatools e Gontosim
from goatools.obo_parser import GODag
from goatools.base import get_godag
#go = get_godag("go-basic.obo", optional_attrs={'relationship'})
from Similarity import Similarity_of_Two_GOTerms, Similarity_of_Set_of_GOTerms

""" Funções e Variaveis Auxiliares"""
from Constantes import *
from PlotingFunctions import *

from sklearn.ensemble import RandomForestClassifier

  EXISTS: go-basic.obo
go-basic.obo: fmt(1.2) rel(2024-06-17) 51,502 Terms; optional_attrs(relationship)


In [3]:
data = pd.read_pickle('Data/data_Breast_GSE70947_norm.pkl')
colunas = data.columns.drop(['type']).tolist()

affymetrix_similarity = pd.read_pickle('./Data/Breast_GSE70947-gene_symbols.pkl')
go_completness = dict(zip(affymetrix_similarity['feature'], affymetrix_similarity['scores']))
go_completness_breast_cancer = dict(zip(affymetrix_similarity['feature'], affymetrix_similarity['scores_breast_cancer_pathways']))
del go_completness['type']
del go_completness_breast_cancer['type']
X = data[colunas]
y = data['type']

In [4]:
# seleciona as features de acordo com o vetor binário
selected_features = [ 1609,  2369,  9077, 21976, 22587, 23786, 32599, 34806]
features = np.full(len(X.columns),False)
features[selected_features] = True
X_selected = np.array(X)
f_1 = []
n_tests = 1
seed = 48
for i in range(n_tests):
    seed = seed + 1
    # Kfolding usado para separar em treino e teste
    skf = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
    #X_train, X_valid, y_train, y_valid = train_test_split(X_selected, self.y, test_size=0.1, random_state=100)
        	
    # treino usando modelo SVM
    clf = RandomForestClassifier()
    
    # compute f_1 and AUC on validation set
    f_1.append(np.mean(cross_val_score(clf, X_selected, y, cv=skf, scoring='f1_macro')))
        
f_1 = round(np.array(f_1).sum()/n_tests, 3)
n_features = len(selected_features)
(f_1,n_features)

(0.867, 8)

In [28]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X, y)

In [30]:
np.sort(clf.feature_importances_)[::-1][:100]

array([0.00619391, 0.00479471, 0.00467563, 0.00407751, 0.00376029,
       0.00343695, 0.00341625, 0.00334563, 0.00327129, 0.0032149 ,
       0.00302657, 0.0029937 , 0.00295486, 0.00291486, 0.0028788 ,
       0.00287691, 0.00262031, 0.00260271, 0.00256654, 0.00256043,
       0.00255202, 0.00253911, 0.0024505 , 0.00244455, 0.00243452,
       0.00242414, 0.00241163, 0.00232047, 0.00231002, 0.00225614,
       0.00225379, 0.00222898, 0.00217847, 0.00216949, 0.00215268,
       0.00214575, 0.00210361, 0.0020993 , 0.00209246, 0.00209065,
       0.00207737, 0.0020213 , 0.0020123 , 0.00200868, 0.0019966 ,
       0.00198792, 0.00198072, 0.00197429, 0.00196805, 0.00196339,
       0.00196287, 0.00196089, 0.00193177, 0.00192972, 0.00192968,
       0.00192677, 0.00192273, 0.00189271, 0.00187325, 0.00187205,
       0.00184007, 0.00183913, 0.00182943, 0.00182185, 0.00181598,
       0.00179204, 0.00177221, 0.00175575, 0.00174777, 0.00173939,
       0.00173622, 0.00173595, 0.00173116, 0.00171612, 0.00170

In [44]:
clf.feature_names_in_[np.argsort(clf.feature_importances_, kind = 'stable')]

array(['NM_144987', 'ENST00000322831', 'NM_001625', ..., 'NM_203422',
       'NM_006086', 'NM_006101'], dtype=object)

In [45]:
np.array(list(range(len(clf.feature_importances_))))[np.argsort(clf.feature_importances_, kind = 'stable')]

array([    0,     2,     3, ..., 25069,  9077, 10607])

In [42]:
clf.feature_importances_[10607]

0.0061939146555902306