In [53]:
import numpy as np
import pandas as pd
import math
import gplearn
import pydotplus
import graphviz
import os     
# Make the Graphviz executables discoverable by appending their install
# directory to PATH. The path must be a raw string: in a normal literal the
# "\b" of "\bin" is a backspace escape, which silently corrupts the entry.
_GRAPHVIZ_BIN = r'C:\Program Files (x86)\Graphviz2.38\bin'
# Only append once so re-running this cell does not keep growing PATH.
if _GRAPHVIZ_BIN not in os.environ.get("PATH", ""):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + _GRAPHVIZ_BIN
from PIL import Image
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from gplearn.genetic import SymbolicRegressor
from gplearn.genetic import SymbolicTransformer
from gplearn.functions import make_function
from gplearn.fitness import make_fitness

In [54]:
# Value written into a cell that contains only spaces, keyed by column name.
# The key order is also the column order of the resulting frame; the last
# column ("newarf") is the one predict() uses as the label.
_BLANK_CELL_DEFAULTS = {
    'cpk_1': 150,
    'k_1': 0,
    'uricac_1': 6,
    'ldh_1': 350,
    'cpk3f': 150,
    'k3f': 0,
    'uric3f': 6,
    'ldh3f': 350,
    'crat3f': 0.4,
    'newarf': 0,
}


def readData():
    """Load "data.csv", fill blank cells with per-column defaults, cast to float32.

    Only the columns listed in ``_BLANK_CELL_DEFAULTS`` are kept. A cell is
    "blank" when it is a string made up solely of spaces (or is empty); such
    cells are replaced by the default for their column. Cells that are not
    strings (already-numeric values, real NaN) are left untouched, so NaN
    survives and can be dropped later with ``dropna``.

    Side effect: the result is stored in the module-level ``df``.

    Returns:
        pandas.DataFrame: the cleaned float32 frame (the same object as ``df``).
    """
    global df

    raw = pd.read_csv("data.csv")
    # .copy() so the column writes below act on an independent frame.
    selected = raw[list(_BLANK_CELL_DEFAULTS)].copy()

    def _is_blank(cell):
        # Non-string cells (numbers, NaN) have no .replace and are never blank.
        return isinstance(cell, str) and cell.replace(" ", "") == ""

    for column, default in _BLANK_CELL_DEFAULTS.items():
        blank_rows = selected[column].map(_is_blank).astype(bool)
        if blank_rows.any():
            selected.loc[blank_rows, column] = default

    # Remaining strings such as "1.5" are parsed to numbers by the cast.
    matrix = selected.values.astype(np.float32)
    df = pd.DataFrame(matrix, columns=selected.columns)
    return df


In [55]:
def fitness(realData, predictData ,w):
    """Classification accuracy of a program's raw output, thresholded at zero.

    Raw outputs ``>= 0`` are read as class 1 and negative outputs as class 0;
    the result is the (weighted) fraction of samples whose thresholded
    prediction equals ``realData``.

    Args:
        realData: array-like of true labels (expected to be 0/1).
        predictData: array-like of raw program outputs, same length.
        w: per-sample weights, or None. Rows with weight 0 do not count.
            With all-ones weights the result equals plain accuracy. If the
            weights are None or sum to zero, plain accuracy is returned.

    Returns:
        float: accuracy in [0, 1].
    """
    predicted_labels = (np.asarray(predictData) >= 0).astype(np.float32)
    hits = predicted_labels == realData

    if w is not None:
        weights = np.asarray(w, dtype=float)
        # Guard against an all-zero weight vector, which np.average rejects.
        if weights.sum() != 0:
            return np.average(hits, weights=weights)

    return np.sum(hits) / predicted_labels.shape[0]

In [56]:
# Wrap the accuracy function as a gplearn fitness metric (higher is better).
# Arguments are passed by keyword because newer gplearn releases declare them
# keyword-only; the keyword form also works on older releases.
f = make_fitness(function=fitness, greater_is_better=True)

In [57]:
def accuracy(realData, predictData):
    """Return the per-class hit rates for a binary (0/1) prediction.

    Builds the 2x2 confusion matrix with rows ordered as labels [0, 1] and
    returns, for each true class, the fraction of its samples that were
    predicted correctly.

    Returns:
        tuple: (healthy, sick) -- the rate for true class 0, then for true
        class 1.
    """
    # Row 0 holds the true-class-0 counts, row 1 the true-class-1 counts.
    (tn, fp), (fn, tp) = metrics.confusion_matrix(realData, predictData, labels=[0, 1])

    healthy = tn / (tn + fp)
    sick = tp / (tp + fn)
    return healthy, sick

In [58]:
def predict(data, train, scaler = None, model = None):
    """Fit (optionally) and evaluate the symbolic-regression classifier.

    The last column of ``data`` is the label; all other columns are features.
    Features are standardised, the model's raw output is thresholded at zero
    to obtain a class, and the per-class rates from ``accuracy`` are returned.

    Args:
        data: 2-D numpy array, features followed by the label column.
        train: if truthy, fit a new scaler and model on ``data`` (any
            ``scaler``/``model`` passed in are replaced). If falsy, reuse
            the supplied fitted ``scaler`` and ``model``.
        scaler: fitted scaler, required when ``train`` is falsy.
        model: fitted model, required when ``train`` is falsy.

    Returns:
        tuple: (healthy, sick, scaler, model).

    Raises:
        ValueError: if ``train`` is falsy and ``scaler`` or ``model`` is None.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if not train and (scaler is None or model is None):
        raise ValueError("predict(train=False) requires a fitted scaler and a fitted model")

    data = data.copy()
    X = data[:, :-1]
    y = data[:, -1]

    if train:
        scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    if train:
        model = SymbolicRegressor(function_set=['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'max' ,'min' ,'neg'],
                                  metric = f, verbose = 0,
                                  population_size=1000, generations=20,
                                  stopping_criteria = 3, random_state = 42, init_depth = (5,15),
                                  tournament_size = 100, const_range=(-3.0, 3.0),p_crossover = 0.85 ,
                                  p_subtree_mutation = 0.01,
                                  p_hoist_mutation = 0.04, p_point_mutation=0.1, p_point_replace=0.1)
        model.fit(X, y)

    # Same zero threshold as the fitness function; cast to the label dtype so
    # the confusion matrix compares like with like (not bool against float).
    y_p = (model.predict(X) >= 0).astype(y.dtype)
    healthy, sick = accuracy(y, y_p)

    return healthy, sick, scaler, model

In [59]:
def removeNan(data_frame,k):
    """Drop rows containing NaN and compute the bookkeeping for a k-way split.

    Args:
        data_frame: pandas DataFrame to clean (not modified).
        k: number of folds the remaining rows will be divided into.

    Returns:
        tuple: (cleaned frame, number of remaining rows,
        ``np.arange`` over those rows, rows per fold rounded up).
    """
    complete_rows = data_frame.dropna()
    n_rows = complete_rows.shape[0]

    positions = np.arange(n_rows)
    rows_per_fold = math.ceil(n_rows / k)

    return complete_rows, n_rows, positions, rows_per_fold

In [60]:
def k_fold(k):
    """Run k-fold cross-validation of the symbolic-regression classifier.

    Reloads the data with ``readData``, drops rows containing NaN, shuffles
    the rows with a fixed seed and splits them into ``k`` contiguous folds.
    For each fold a model is fitted on the other folds and scored on both the
    training part and the held-out fold.

    Side effects: reseeds numpy's global RNG with 42, replaces the
    module-level ``df`` with the NaN-free frame, and prints per-fold scores.

    Args:
        k: number of folds, an integer >= 2.

    Returns:
        tuple: (trs, tss) -- two lists with one ``(healthy, sick)`` pair per
        fold, for the training part and the held-out fold respectively.

    Raises:
        ValueError: if ``k`` is not an integer >= 2, or if there are too few
            rows for every fold to be non-empty.
    """
    global df

    # Validate before any work: k < 2 leaves no training data (or divides by 0).
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 2:
        raise ValueError("k must be an integer >= 2, got {!r}".format(k))

    np.random.seed(42)
    readData()

    trs = []
    tss = []

    df, n_rows, ind, fold_size = removeNan(df, k)

    # The last fold starts at fold_size * (k - 1); if that is past the data,
    # at least one fold would be empty and scoring it would fail.
    if fold_size * (k - 1) >= n_rows:
        raise ValueError("not enough rows ({}) for {} non-empty folds".format(n_rows, k))

    np.random.shuffle(ind)
    data = df.values[ind]

    for i in range(k):
        print("k : ",i)
        start = fold_size * i
        end = min(n_rows, (i + 1) * fold_size)

        data_train = np.concatenate([data[:start], data[end:]], axis = 0)
        data_test  = data[start:end]

        # Fit on the training part; the fitted scaler/model are reused below
        # so the held-out fold is transformed with training statistics only.
        rec0, rec1, scaler, model = predict(data_train, True)
        trs.append((rec0, rec1))

        rec0, rec1, scaler, model = predict(data_test, False, scaler, model)
        tss.append((rec0, rec1))

        print("train", trs[-1])
        print("test", tss[-1])

    return trs, tss

In [62]:
# Run the cross-validation once and collect the per-fold scores.
# `trains` / `tests` are lists of runs; each run is a list of per-fold
# (healthy, sick) pairs as returned by k_fold.
trains, tests = [], []
k = 2

print("model : gp")
trs, tss = k_fold(k)

trains.append(trs)
tests.append(tss)

model : gp
k :  0
train (0.9985185185185185, 0.8)
test (0.9970238095238095, 0.6122448979591837)
k :  1
train (0.9970238095238095, 0.6326530612244898)
test (0.9985185185185185, 0.8)
