In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from deap import creator, base, tools, algorithms
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt
import sys
import time


# Read the dataset; the file carries no header row, so columns are labelled 0..N-1
df = pd.read_csv("reduced_dataset.csv", header=None)

# Column 0 is used as the target vector
y = df.iloc[:, 0]
print("Shape of target vector : ",y.shape)

# Every column position except 0 is treated as a feature
column_numbers = list(range(1, df.shape[1]))
Features = df.iloc[:, column_numbers]
X = Features

# First split: hold out 20% as the validation set.
# Second split: carve 20% of the remainder off as the test set.
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(
	X, y, test_size=0.20, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
	X_trainAndTest, y_trainAndTest, test_size=0.20, random_state=42
)

Shape of target vector :  (1500,)


In [2]:
import warnings
# Suppress every Python warning from this point on. NOTE(review): this is a
# blanket filter, so it would also hide any warnings emitted while fitting the
# models below -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report

# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test, evaluation=False):
	"""Score a feature subset by fitting logistic regression on it.

	Parameters
	----------
	individual : sequence of 0/1
		Feature mask. Position ``i`` refers to ``X_train.columns[i]``; a 0 drops
		that column. Columns at positions beyond ``len(individual)`` are kept.
	X_train, X_test : pandas.DataFrame
		Feature frames used for fitting and for scoring respectively.
	y_train, y_test : array-like
		Labels matching ``X_train`` / ``X_test``.
	evaluation : bool, default False
		When true, also compute precision and recall and print a
		classification report.

	Returns
	-------
	tuple
		``(accuracy,)`` when ``evaluation`` is false, otherwise
		``(accuracy, precision, recall)``. If the mask leaves no usable feature
		(everything dropped, or no one-hot column common to both frames) the
		worst score ``0.0`` is returned for every entry instead of raising, so
		a degenerate individual cannot abort a whole GA run.
	"""
	worstScore = (0.0, 0.0, 0.0) if evaluation else (0.0,)

	# Genes equal to 0 mark the column positions to discard
	dropped = [position for position, gene in enumerate(individual) if gene == 0]
	X_trainParsed = X_train.drop(X_train.columns[dropped], axis=1)
	X_testParsed = X_test.drop(X_test.columns[dropped], axis=1)

	# Nothing left to train on: report the worst fitness rather than crash
	if X_trainParsed.shape[1] == 0:
		return worstScore

	X_trainOhFeatures = pd.get_dummies(X_trainParsed)
	X_testOhFeatures = pd.get_dummies(X_testParsed)

	# Remove any columns that aren't in both the training and test sets
	# (one-hot encoding each frame separately can yield different columns)
	sharedFeatures = set(X_trainOhFeatures.columns) & set(X_testOhFeatures.columns)
	if not sharedFeatures:
		return worstScore
	removeFromTrain = set(X_trainOhFeatures.columns) - sharedFeatures
	removeFromTest = set(X_testOhFeatures.columns) - sharedFeatures
	X_trainOhFeatures = X_trainOhFeatures.drop(list(removeFromTrain), axis=1)
	X_testOhFeatures = X_testOhFeatures.drop(list(removeFromTest), axis=1)

	# Apply logistic regression on the data, and calculate accuracy
	clf = LogisticRegression()
	clf.fit(X_trainOhFeatures, y_train)
	predictions = clf.predict(X_testOhFeatures)
	accuracy = accuracy_score(y_test, predictions)

	if not evaluation:
		return (accuracy,)

	# precision_score / recall_score are called with their default arguments
	precision = precision_score(y_test, predictions)
	recall = recall_score(y_test, predictions)

	# Print classification report
	print("Classification Report:")
	print(classification_report(y_test, predictions, digits=4))

	return (accuracy, precision, recall)

# Create Individual/Classes
# Create Individual/Classes
# Single-objective fitness that is maximised (weight +1.0 on the accuracy value)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Create Toolbox/Base Class
toolbox = base.Toolbox()
# Each gene is a random bit: 1 keeps the feature, 0 drops it
toolbox.register("attr_bool", random.randint, 0, 1)
# One gene per feature column. X already excludes the target column, so the
# individual length is len(X.columns); subtracting 1 here would leave the last
# feature without a gene and therefore permanently selected.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Fitness during the search is the accuracy on the test split
toolbox.register("evaluate", getFitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, evaluation=False)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)


def getHof(numPop=10, numGen=10, seed=None):
	"""Run the genetic algorithm and return its hall of fame.

	Parameters
	----------
	numPop : int, default 10
		Number of individuals in the population.
	numGen : int, default 10
		Number of generations passed to ``algorithms.eaSimple``.
	seed : int or None, default None
		When given, ``random.seed(seed)`` is called first so the run can be
		repeated; ``None`` leaves the random state untouched.

	Returns
	-------
	deap.tools.HallOfFame
		Hall of fame with capacity ``numPop * numGen``.
	"""
	if seed is not None:
		random.seed(seed)

	# Initialize variables to use eaSimple
	pop = toolbox.population(n=numPop)
	hof = tools.HallOfFame(numPop * numGen)

	# Per-generation summary statistics over the fitness values
	stats = tools.Statistics(lambda ind: ind.fitness.values)
	stats.register("avg", numpy.mean)
	stats.register("std", numpy.std)
	stats.register("min", numpy.min)
	stats.register("max", numpy.max)

	# The final population and logbook are not needed; only the hall of fame is used
	algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=numGen, stats=stats, halloffame=hof, verbose=False)
	return hof

def getMetrics(hof):
	"""Collect test and validation accuracy for every hall-of-fame individual.

	For each individual the stored fitness value is taken as its test accuracy,
	and a validation accuracy is computed by calling ``getFitness`` with the
	combined train+test data for fitting and the validation split for scoring.

	Parameters
	----------
	hof : iterable of individuals
		Each individual must expose ``fitness.values``.

	Returns
	-------
	tuple of three lists
		``(testAccuracyList, validationAccuracyList, individualList)``. All
		three are in the reverse of the iteration order of ``hof`` and are
		index-aligned, so position ``k`` in each list refers to the same
		individual.
	"""
	testAccuracyList = []
	validationAccuracyList = []
	individualList = []
	for individual in hof:
		testAccuracy = individual.fitness.values
		validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation, evaluation=False)
		testAccuracyList.append(testAccuracy[0])
		validationAccuracyList.append(validationAccuracy[0])
		individualList.append(individual)

	# Reverse all three lists together. Reversing only the accuracy lists would
	# break the index correspondence callers rely on when they look up an
	# individual by the position of its validation accuracy.
	testAccuracyList.reverse()
	validationAccuracyList.reverse()
	individualList.reverse()
	return testAccuracyList, validationAccuracyList, individualList

In [3]:

if __name__ == '__main__':

	# Column labels of the feature frame, computed once and reused below
	feature_names = list(X)

	# Baseline: score the model with every feature switched on.
	# The mask length follows the data instead of a hard-coded column count.
	individual = [1 for i in range(len(X.columns))]
	start = time.time()
	testAccuracy, precision, recall = getFitness(individual, X_train, X_test, y_train, y_test, evaluation=True)
	end = time.time()
	validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation, evaluation=False)
	print('Test accuracy with all features: \t' + str(testAccuracy))
	print('Validation accuracy with all features: \t' + str(validationAccuracy[0]))
	print("Test time : " + str(end-start))
	print('Precision : \t' + str(precision) + '\tRecall : ' + str(recall))
	print("Number of Features : " + str(len(individual)))

	# Run the genetic search and score every hall-of-fame member
	build_start = time.time()
	hof = getHof()
	testAccuracyList, validationAccuracyList, individualList = getMetrics(hof)
	build_end = time.time()

	# Get a list of subsets that performed best on validation data
	bestValidationAccuracy = max(validationAccuracyList, default=None)
	maxValAccSubsetIndicies = [index for index, acc in enumerate(validationAccuracyList) if acc == bestValidationAccuracy]
	maxValIndividuals = [individualList[index] for index in maxValAccSubsetIndicies]
	maxValSubsets = [[feature_names[index] for index, gene in enumerate(member) if gene == 1] for member in maxValIndividuals]


	# WORKING ON HOF
	# How many hall-of-fame members select each feature position
	count = [0 for i in range(len(X.columns))]
	for subset in hof :
		for i, feature in enumerate(subset) :
			if feature :
				count[i] += 1
	hof_feature_count = [[feature_names[index], count[index]] for index in range(len(count))]
	print ("===================================================")

	#Rank Features
	# Sum the genes per position over all individuals, then order positions by
	# that total, highest first. sorted() is stable, so positions with equal
	# totals keep their original column order.
	count = [0 for i in range(len(X.columns))]
	for subset in individualList :
		for i, gene in enumerate(subset) :
			count[i] += gene
	rank = sorted(range(len(count)), key=count.__getitem__, reverse=True)
	count = [count[index] for index in rank]

	best_features = [feature_names[index] for index in rank[:10]]

	#Print Features Subsets
	print('\n---Optimal Feature Subset(s)---\n')
	for index in range(len(maxValAccSubsetIndicies)):

		# Re-score the subset on the test split, timing the evaluation
		start = time.time()
		testAccuracy, precision, recall = getFitness(maxValIndividuals[index], X_train, X_test, y_train, y_test, evaluation=True)
		end = time.time()

		print('Number Features In Subset: \t' + str(len(maxValSubsets[index])))
		print('Test Time: ' + str(end-start))
		print('Test Accuracy: \t\t' + str(testAccuracy))
		print('Validation Accuracy: \t\t' + str(validationAccuracyList[maxValAccSubsetIndicies[index]]))
		print('Precision : \t' + str(precision) + '\tRecall : ' + str(recall))
		print('========================================')
		print('========================================')


	print("---BUILD TIME : " + str(build_end-build_start) + " ---\n")

Classification Report:
              precision    recall  f1-score   support

           0     0.9919    1.0000    0.9960       123
           1     1.0000    0.9915    0.9957       117

    accuracy                         0.9958       240
   macro avg     0.9960    0.9957    0.9958       240
weighted avg     0.9959    0.9958    0.9958       240

Test accuracy with all features: 	0.9958333333333333
Validation accuracy with all features: 	0.9933333333333333
Test time : 14.369956016540527
Precision : 	1.0	Recall : 0.9914529914529915
Number of Features : 5000

---Optimal Feature Subset(s)---

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       123
           1     1.0000    1.0000    1.0000       117

    accuracy                         1.0000       240
   macro avg     1.0000    1.0000    1.0000       240
weighted avg     1.0000    1.0000    1.0000       240

Number Features In Subset: 	2462
Test Time: 4.758516

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       123
           1     1.0000    1.0000    1.0000       117

    accuracy                         1.0000       240
   macro avg     1.0000    1.0000    1.0000       240
weighted avg     1.0000    1.0000    1.0000       240

Number Features In Subset: 	2482
Test Time: 3.8088698387145996
Test Accuracy: 		1.0
Validation Accuracy: 		0.9966666666666667
Precision : 	1.0	Recall : 1.0
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       123
           1     1.0000    1.0000    1.0000       117

    accuracy                         1.0000       240
   macro avg     1.0000    1.0000    1.0000       240
weighted avg     1.0000    1.0000    1.0000       240

Number Features In Subset: 	2480
Test Time: 3.8146204948425293
Test Accuracy: 		1.0
Validation Accuracy: 		0.9966666666666667
Precision : 	1.0	R

Classification Report:
              precision    recall  f1-score   support

           0     0.9840    1.0000    0.9919       123
           1     1.0000    0.9829    0.9914       117

    accuracy                         0.9917       240
   macro avg     0.9920    0.9915    0.9917       240
weighted avg     0.9918    0.9917    0.9917       240

Number Features In Subset: 	2476
Test Time: 2.7129318714141846
Test Accuracy: 		0.9916666666666667
Validation Accuracy: 		0.9966666666666667
Precision : 	1.0	Recall : 0.9829059829059829
Classification Report:
              precision    recall  f1-score   support

           0     0.9919    0.9919    0.9919       123
           1     0.9915    0.9915    0.9915       117

    accuracy                         0.9917       240
   macro avg     0.9917    0.9917    0.9917       240
weighted avg     0.9917    0.9917    0.9917       240

Number Features In Subset: 	2521
Test Time: 3.9513139724731445
Test Accuracy: 		0.9916666666666667
Validation Accu