# AIML426 Project 1 Q2  

## Genetic Algorithm for Feature Selection  

A genetic algorithm can be used for feature selection. Given $N$ features, each feature selection result can be represented as an $N$-dimensional binary list $X=(x_1, \ldots, x_N)$, where $x_i=1$ means that feature $i$ is selected and $x_i=0$ means that it is not.  

### Problem description  

Take a dataset with $N$ features and determine the optimal selection of features for fitting a good predictive model.  

In [1]:
# import the feat_select module (aliased as fs); fs.main is the function run below
import feat_select as fs


# repeat one experiment function n times and gather what every run returns
def multi_run(func, data = None, feat_names = None, params = None, n=1):
    """Call ``func`` ``n`` times, collecting its results and each run's duration.

    ``func`` must return exactly three values; the original code names them
    ``pop``, ``stat`` and ``hof`` (presumably final population, run statistics
    and hall of fame -- confirm against ``func``).

    The positional arguments handed to ``func`` depend on which optional
    inputs are truthy:

    * ``feat_names`` and ``params`` truthy -> ``func(data, feat_names, params)``
    * only ``feat_names`` truthy           -> ``func(data, feat_names)``
    * ``feat_names`` falsy                 -> ``func(data, params)``

    Args:
        func: Callable to run; only a single callable is accepted.
        data: First positional argument passed to ``func``.
        feat_names: Optional second positional argument passed to ``func``.
        params: Optional extra positional argument passed to ``func``.
        n: Number of times ``func`` is called.

    Returns:
        A tuple of four lists, each with one entry per run, in run order:
        the second return values of ``func`` (``stat``), the third return
        values (``hof``), the elapsed ``datetime.timedelta`` of each call, and
        the first return values (``pop``).
    """
    from datetime import datetime

    records = []
    for _ in range(n):
        began = datetime.now()

        # Decide the positional arguments for this run.
        # NOTE(review): without feat_names, params (even None) lands in the
        # second positional slot -- confirm that func expects it there.
        if not feat_names:
            call_args = (data, params)
        elif params:
            call_args = (data, feat_names, params)
        else:
            call_args = (data, feat_names)

        final_pop, run_log, hall = func(*call_args)
        elapsed = datetime.now() - began

        records.append((run_log, hall, elapsed, final_pop))

    # Split the per-run records back into the four parallel result lists.
    run_stats = [rec[0] for rec in records]
    best_indiv = [rec[1] for rec in records]
    time_delta = [rec[2] for rec in records]
    population = [rec[3] for rec in records]

    return run_stats, best_indiv, time_delta, population

# function for running through the different datasets or different functions
def changing_run(paths = None, feat_names = None, func = None, params = None, n = 1):
    """Run ``func`` ``n`` times on every dataset in ``paths`` via ``multi_run``.

    All other settings (``func``, ``params``, ``n``) are kept the same for
    every dataset; only the dataset and its matching feature-names entry
    change between iterations.

    Args:
        paths: Sequence of datasets, each forwarded to ``multi_run`` as
            ``data``. A single string is treated as one dataset. ``None`` or
            an empty sequence produces empty results.
        feat_names: Optional sequence parallel to ``paths``; entry ``i`` is
            forwarded as ``feat_names`` for ``paths[i]``. A single string is
            treated as one entry. ``None`` or an empty sequence means no
            feature names are forwarded.
        func: Callable forwarded to ``multi_run``.
        params: Optional value forwarded unchanged to ``multi_run``.
        n: Number of repetitions per dataset.

    Returns:
        A list of four lists ``[stats, best, times, populations]``. Each
        inner list has one entry per dataset (in ``paths`` order), and every
        entry is the corresponding per-run list returned by ``multi_run``.

    Raises:
        ValueError: If ``feat_names`` is given but has fewer entries than
            ``paths``.
    """
    # Normalise the inputs so the default None and a single dataset both work
    # (previously len(None) raised and a one-element list was skipped).
    if paths is None:
        paths = []
    elif isinstance(paths, str):
        paths = [paths]
    if isinstance(feat_names, str):
        feat_names = [feat_names]

    has_names = feat_names is not None and len(feat_names) > 0

    # Fail before any (potentially long) run starts instead of silently
    # dropping the names or hitting an IndexError halfway through.
    if has_names and len(feat_names) < len(paths):
        raise ValueError(
            "feat_names has {} entries but paths has {}; "
            "provide one feat_names entry per path".format(len(feat_names), len(paths))
        )

    diffs = [[], [], [], []]

    for p, dat in enumerate(paths):
        names = feat_names[p] if has_names else None

        # multi_run treats None for feat_names/params as "not given", so one
        # call covers every combination of optional inputs.
        run, best, time, population = multi_run(func,
                                                data=dat,
                                                feat_names=names,
                                                params=params,
                                                n=n)
        diffs[0].append(run)
        diffs[1].append(best)
        diffs[2].append(time)
        diffs[3].append(population)

    return diffs

In [2]:
# Wrapper experiment: run fs.main 5 times on each of the two datasets.
# The params string is handed on unchanged to fs.main; presumably it
# switches feature selection to the wrapper approach -- confirm in feat_select.
WrapperGA = changing_run(
    paths=['wbcd.data', 'sonar.data'],
    feat_names=['wbcd.names', 'sonar.names'],
    func=fs.main,
    params="FEAT_SEL = 'Wrapper'",
    n=5,
)




In [None]:
# Filter experiment: run fs.main 5 times on each of the two datasets.
# No params are supplied here; presumably fs.main then falls back to the
# filter approach -- confirm in feat_select.
FilterGA = changing_run(
    paths=['wbcd.data', 'sonar.data'],
    feat_names=['wbcd.names', 'sonar.names'],
    func=fs.main,
    n=5,
)





In [86]:
# changing_run returns [stats, hof, time, population]; each is indexed [dataset][run].
# Here: hof (1) -> first dataset, 'wbcd.data' (0) -> fifth run (4) -> entry 0 of
# that run's hof (presumably the best individual as a 0/1 feature mask -- confirm).
FilterGA[1][0][4][0]

[1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0]