In [1]:
import numpy as np
import pandas as pd
import random
# from Config import Config as con
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import sklearn.exceptions
import warnings
import itertools

warnings.filterwarnings("ignore", category=sklearn.exceptions.ConvergenceWarning)

In [3]:
try:
    #read file from csv
	dataset			= pd.read_csv("Stroke_10.csv", sep=',')

except IOError as e:
	raise e
		
name_label_columns	= dataset.columns.values[len(dataset.columns)-1]
#separated feature and label
feature 			= dataset.drop(name_label_columns, axis=1)
label 				= dataset[name_label_columns].copy()

In [13]:
def initPopulation(feature):
	individu = []

	chromosomeGenerator = [list(i) for i in itertools.product([0, 1], repeat=len(feature.columns)) if i.count(1)>=3]
	n_individual = 100
	for i in range(n_individual):
		index = random.randint(0,len(chromosomeGenerator)-1)
		individu.append(chromosomeGenerator[index])
	return individu

def getFitnessValue(individu, feature, label):
	subsetFeature = removeExcludeFeature(individu, feature)
    
	#count error rate by MLPClassifier
	error_rate = []
	for subsetFeature in subsetFeature :
		featureTrain, featureTest, labelTrain, labelTest = train_test_split(subsetFeature, label, test_size=0.2, random_state=42)
		result = fitnessValueModel(featureTrain, featureTest, labelTrain, labelTest)
		error_rate.append(result)

	# get rank of individu by their error_rate
	rank = [0] * len(error_rate)
	for i, x in enumerate(sorted(range(len(rank)), key=lambda y: error_rate[y], reverse=True)):
		rank[x] = i

	# rank based method will be:
    # Φ(i) = κ · R(i)           i = 1,...,N.
    # k is constant between 1 until 2
    # R(i) is rank of individu
	k = 1.5
	fitnessValue = list(k*(1+np.array(rank)))
  
	return fitnessValue

def checkIndividu(individu):
	zeroSubset = []
	# get useless feature on subset
	for i in [i for i,j in enumerate(individu) if j==0]:
		zeroSubset.append(i)

	return zeroSubset

def removeExcludeFeature(individu, feature):
    #get index of exclusion feature (useless feature) represent by 0 and inclusion feature represent by 1 
	uselessFeature = []
	for individu in individu :
		check = checkIndividu(individu)
		uselessFeature.append(check)

	#drop the exclustion feature by their index
	subsetFeature = []
	for uselessFeature in uselessFeature :
		subsetFeature.append(feature.drop(feature.columns[[uselessFeature]], axis=1))
    
	return subsetFeature

def fitnessValueModel(featureTrain, featureTest, labelTrain, labelTest):
	model = MLPClassifier()
	#train model
	model.fit(featureTrain, labelTrain)
	#prediction
	predict = model.predict(featureTest)
	accuracy = accuracy_score(labelTest, predict)
	error_rate = 1-accuracy

	return error_rate


def roulleteProcess(individu, fitnessValue):
	relative_fitness = [f/float(sum(fitnessValue)) for f in fitnessValue]
	prob = [sum(relative_fitness[:i+1]) for i in range(len(relative_fitness))]

	#num_roullete is N / 2, N is population size
	num_roullete = 6
	chosenIndividu = []
	for i in range(num_roullete):
		rand = random.random()
		for (j, ind) in enumerate(individu):
			if rand <= prob[j]:
				chosenIndividu.append(list(ind))
				break

	return chosenIndividu		
		
def cloneIndividu(individu):
	newIndividu = []

	for i in range(len(individu)):
		for j in range(len(individu[0])):
			temp = individu[i]
# 			value = Individu(temp)
		newIndividu.append(temp)

	return newIndividu

def crossoverProcess(individu):
	newIndividu = cloneIndividu(individu)

	for i in range(0,len(individu),2):
		parent1 = newIndividu[i]
		parent2 = newIndividu[i+1]

		if(random.uniform(0,1) < 0.9):
			# TODO kawinkan
			index1=None
			index2=None
			tpoint1 = random.randint(0,len(individu[0]))
			tpoint2 = random.randint(0,len(individu[0]))

			if(tpoint1<=tpoint2):
				index1 = tpoint1
				index2 = tpoint2
			else:
				index1 = tpoint2
				index2 = tpoint1

			for j in range(index1,index2):
				if(j%2 == 0):
					tempParent1 = parent1[int(j/2)]
					parent1[int(j/2)] = parent2[int(j/2)]
					parent2[int(j/2)] = tempParent1
				else:
					tempParent2 = parent2[int(j/2)]
					parent1[int(j/2)] = parent1[int(j/2)]
					parent2[int(j/2)] = tempParent2
	return newIndividu

def mutationProcess(individu):
	newIndividu = cloneIndividu(individu)

	for i in range(len(newIndividu)) :
		randGen = random.randint(0, len(newIndividu[0])-1)
		if (random.uniform(0,1) < 0.2):
			if (newIndividu[i][randGen] == 0):
				newIndividu[i][randGen] = 1
			else :
				newIndividu[i][randGen] = 0
	return newIndividu

def elitismProcess(parent,children,feature,label):
	individu = cloneIndividu(parent)
	individu.extend(cloneIndividu(children))

	fitnessValue = getFitnessValue(individu,feature,label)

	result = pd.DataFrame(individu)
	result['fitnessValue'] = pd.Series(fitnessValue)
	result = result.sort_values(by=['fitnessValue'], ascending=False)
	result = result.drop("fitnessValue", axis=1)
	selected = result.values[0]
	selected = list(selected.astype(int))
	
	return selected

In [14]:
individu = initPopulation(feature)

In [15]:
individu

[[1, 0, 1, 0, 1, 0],
 [1, 1, 1, 1, 1, 1],
 [0, 1, 1, 0, 1, 0],
 [0, 1, 1, 0, 0, 1],
 [1, 1, 0, 1, 1, 0],
 [1, 0, 1, 0, 1, 0],
 [0, 0, 1, 1, 1, 0],
 [0, 1, 0, 1, 0, 1],
 [0, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 0, 1],
 [0, 1, 1, 1, 1, 0],
 [1, 1, 1, 1, 1, 0],
 [1, 0, 1, 1, 0, 0],
 [1, 0, 1, 1, 1, 1],
 [1, 0, 1, 0, 0, 1],
 [1, 1, 0, 0, 1, 1],
 [1, 0, 1, 1, 1, 0],
 [1, 1, 1, 1, 0, 0],
 [1, 1, 1, 1, 1, 1],
 [1, 1, 1, 0, 1, 0],
 [0, 0, 1, 1, 1, 1],
 [1, 0, 0, 1, 1, 1],
 [1, 1, 1, 1, 0, 0],
 [1, 0, 1, 0, 1, 1],
 [1, 1, 0, 0, 1, 0],
 [1, 1, 1, 0, 0, 0],
 [1, 1, 0, 1, 0, 0],
 [0, 1, 0, 1, 1, 0],
 [1, 0, 0, 1, 0, 1],
 [0, 1, 0, 1, 1, 1],
 [1, 1, 1, 0, 1, 1],
 [0, 0, 1, 0, 1, 1],
 [1, 0, 0, 1, 0, 1],
 [0, 0, 1, 1, 1, 0],
 [0, 1, 1, 1, 0, 0],
 [0, 1, 1, 1, 0, 0],
 [0, 0, 0, 1, 1, 1],
 [1, 0, 1, 1, 1, 0],
 [0, 1, 1, 0, 1, 0],
 [1, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 1, 0],
 [0, 1, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 1],
 [0, 0, 1, 1, 1, 0],
 [0, 1, 0, 1, 0, 1],
 [0, 1, 0, 1, 0, 1],
 [1, 0, 0, 1, 1, 0],
 [1, 1, 1, 1,

In [18]:
num_generation = 3
for i in range(num_generation):
    fitnessValue = getFitnessValue(individu, feature, label)
    roullete = roulleteProcess(individu, fitnessValue)
    parent = cloneIndividu(roullete)
    cross = crossoverProcess(roullete)
    mutate = mutationProcess(cross)
    children = cloneIndividu(mutate)
    selected = elitismProcess(parent,children,feature,label)

In [19]:
selected

[0, 1, 0, 1, 1, 1]

In [20]:
def selectedFeature(individu, feature):
    #get index of exclusion feature (useless feature) represent by 0 and inclusion feature represent by 1 
	zeroSubset = []
	# get useless feature on subset
	for i in [i for i,j in enumerate(individu) if j==0]:
		zeroSubset.append(i)

	#drop the exclustion feature by their index
	subsetFeature = feature.drop(feature.columns[[zeroSubset]], axis=1)

	return subsetFeature

In [21]:
selectedFeature(selected, feature)

Unnamed: 0,dissimilarity,asm,energy,correlation
0,9.086207,0.701083,0.837307,0.865392
1,9.086207,0.707436,0.841092,0.862082
2,9.965517,0.689291,0.830235,0.85693
3,9.086207,0.72534,0.851669,0.851811
4,9.672414,0.691132,0.831344,0.860815
5,9.086207,0.707436,0.841092,0.862082
6,8.5,0.69851,0.835769,0.876216
7,9.086207,0.687095,0.828912,0.872084
8,9.086207,0.722037,0.849728,0.853819
9,9.086207,0.707436,0.841092,0.862082


In [114]:
def testAccuracy(feature, label):
	featureTrain, featureTest, labelTrain, labelTest = train_test_split(feature, label, test_size=0.2, random_state=42)
	model = MLPClassifier()
	model.fit(featureTrain, labelTrain)
	predict = model.predict(featureTest)
	accuracy = accuracy_score(labelTest, predict)

	return accuracy

In [115]:
print("Fitur yang digunakan adalah : ", list(dfSubset.columns))
print("\nAccuracy with All Feature using Hold-out Validation Model : ", testAccuracy(feature, label))
print("\nAccuracy with Subset Feature using Hold-out Validation Model : ", testAccuracy(dfSubset, label))

Fitur yang digunakan adalah :  ['b', 'd', 'e']

Accuracy with All Feature using Hold-out Validation Model :  0.651162790698

Accuracy with Subset Feature using Hold-out Validation Model :  0.976744186047
