In [10]:
import joblib
import pandas as pd
import numpy as np
import time


In [11]:
class GeneticGroup:
    def __init__(self,vector_reg,vector_clf,vector_v_index,dna_size,pd_variables_range,
                n_pop=500,crossover_rate=0.5,mutation_rate=0.1,
                n_generations=50,model_r=np.max,
                 model_c1=np.min,
                 model_c2=np.min,
                 model_c3=np.min,
                 model_c4=np.min,
                 model_c5=np.min):
        self.v_r=vector_reg
        self.v_g=vector_clf
        self,vvi=vector_v_index
        self.m_reg=model_r
        self.m_c1=model_c1
        self.m_c2=model_c2
        self.m_c3=model_c3
        self.m_c4=model_c4
        self.m_c5=model_c5
        self.ds = dna_size  #用几个位的二进制编码
        self.np = n_pop
        self.cr = crossover_rate
        self.mr = mutation_rate
        self.ng = n_generations
        self.pvr = pd_variables_range#变量范围
        self.nv =len(self.pvr)
        self.pop = np.random.randint(2, size=(self.np, self.ds * self.nv))
        #把数组A组合成长度为种群数量的数组
        self.x=np.tile(vector_reg,self.np).reshape(self.np,-1)
        self.x_clf=np.tile(vector_clf,self.np).reshape(self.np,-1)
    #解码
    def translateDNA(self):
        pop=self.pop#种群数量
        data_x=np.zeros((self.np,self.nv))
        for i in range(self.nv):
            pop_x=pop[:,i::self.nv]#切片  取出每个变量的编码
            data_x[:,i]=float(self.pvr['min'][i])+ \
            float(self.pvr['max'][i]-self.pvr['min'][i])* \
            pop_x.dot(2**np.range(self.ds)[::-1])/float(2**self.ds-1)
        self.x=data_x
        self.x_clf[:,self.vvi]=self.x
        return self.x,self.x_clf
    
                              
    #计算适应度    此处为目标函数  活性值*AMET数                 
    def get_fitness(self):
        data_x,data_xclf=self.translateDNA()
        pred_reg= self.m_reg.predict(data_x)#回归预测
        total_ADMET=self.m_c1.predict(data_xclf)+self.m_c2.predict(data_xclf)+ \
        (self.m_c1.predict(data_xclf)^1)+ \
        self.m_c1.predict(data_xclf)+ \
        (self.m_c1.predict(data_xclf)^1)
        fitness_Reg=pred_reg - np.min(pred_reg) + 1e-4
        fitness_ADMET = (total_ADMET >= 4) * 1 + 1e-4
        return fitness_Reg * fitness 
    
    def select(self):
        idx=np.random.choice(np.arange(self.np),size=self.np,
                             replace=True,p=self.get_fitness()/np.sum(self.get_fitness()))
        self.pop=self.pop[idx]
    def mutation(self,vector):
        if np.random.rand()<self.mr:
            mutation_point=np.random.randint(0,self.ds*self.nv)
            vector[mutation_point]=vector[mutation]^1#按位取反
    def crossover_and_mutation(self):
        pop=self.pop
        for i in range(self.pop):
            child=self.pop[i,:]
            if np.random.rand()<self.cr:
                mother=pop[np.random.randint(self.np)]
                cross_points=np.random.randint(0,self.ds*self.nv)
                child[cross_points:]=mother[cross_points:]
            self.mutation(child)
            pop[i,:]=child
        self.pop=pop
    def optimization(self):
        zy=zeros(self.ng,self.nv)
        for i in range(self.ng):
            self.select()
            self.crossover_and_mutation()
            fitness=self.get_fitness()
            if np.std(fitness)<=1e-5:
                break
            zy[i,:]=self.x[int(np.argmax(fitness)),:]
        return self.x[int(np.argmax(fitness)),:],self.x_clf[int(np.argmax(fitness)),:],zy
        

In [13]:
#回归数据
data_reg=pd.read_excel('./data/Molecular_Descriptor_20.xlsx',header=0)
label_reg=pd.read_excel('./data/ERα_activity.xlsx',header=0)

In [19]:
#分类数据
data_clf=pd.read_excel('./data/Molecular_Descriptor_20.xlsx',header=0)
label_clf=pd.read_excel('./data/ADMET.xlsx',header=0)
Caco_2,CYP3A4,hERG,HOB,MN=label_clf.iloc[:,1],label_clf.iloc[:,2],label_clf.iloc[:,3], \
label_clf.iloc[:,4],label_clf.iloc[:,5]

In [None]:
#加载分类模型
RF_clf_1 = joblib.load('./Classify/forest_clf_y1')


In [None]:
#加载预测模型

In [21]:
var=pd.read_excel("./data/Molecular_Descriptor_20.xlsx")

list_min=var.min()
list_max=var.max()
delta=list_max-list_min
var_data={'min':list_min,'max':list_max,'delta':delta}
var_range=pd.DataFrame(var_data)
dna_size=int(np.ceil(np.log2(np.max((var_range['max']-var_range['min'])))))
dna_size

8

In [29]:
res=np.zeros((len(data_reg),len(data_reg.columns)+2)) #创建已优化的回归特征数据
res_clf=np.zeros((len(data_clf),len(data_clf.columns)))
res[i,:-2].shape

(20,)

In [30]:
t1=time.time()
for i in range(len(data_reg)):
    print('训练进度',i)
    GG=GeneticGroup(np.array(data_reg.iloc[i,:]),np.array(data_clf.iloc[i,:]), \
    vector_v_index,dna_size,var_range,n_generations=500,model_r=RF_reg, \
    model_c1=RF_clf_1,model_c2=RF_clf_2,model_c3=RF_clf_3,model_c4=RF_clf_4, \
    model_c5=RF_clf_5)
    res[i,:-2],res_clf[i,:],zy=GG.optimization()
    res[i,-2]=RF_reg.predict(res[i,:-2].reshape(1,-1))#行变成列
    res[i,-1]=RF_clf_1.predict(res[i,:-2].reshape(1,-1))+ \
               RF_clf_2.predict(res[i,:-2].reshape(1,-1))+ \
                (RF_clf_3.predict(res[i,:-2].reshape(1,-1)))^1+ \
                RF_clf_4.predict(res[i,:-2].reshape(1,-1))+ \
                (RF_clf_5.predict(res[i,:-2].reshape(1,-1)))^1
    

训练进度 0


NameError: name 'vector_v_index' is not defined