# Training and Validating Bug Prediction Models Assignment 


Below are the required imports

In [139]:
# dataset operations (panda dataframes) 
import pandas as pd
import numpy as np


# preprocessing (binarizer)
from sklearn import preprocessing

#data balancing
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# accuracy metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from matplotlib import pyplot
from sklearn.metrics import roc_curve

# feature selection 
from sklearn.datasets import load_digits
from mlxtend.feature_selection import ExhaustiveFeatureSelector 
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2

# single classifiers
from sklearn import svm
from sklearn.svm import SVC

#deap imports
import random
import operator 
from deap import base
from deap import creator
from deap import tools
from deap import gp
import array

from multiprocessing import Pool



# 1 Read in CSV data and preprocess

## 1.1 Reading in the csv file data with pandas into a dataframe


In [140]:
# Paths of the dataset CSV files, relative to the working directory.
camel = "./datasets/camel-1.6.csv"
jedit = "./datasets/jedit-4.3.csv"
ant = "./datasets/ant-1.7.csv"
log4j = "./datasets/log4j-1.2.csv"
lucene = "./datasets/lucene-2.4.csv"

# (name, path) pairs, in the order the datasets are handed to the workers.
data_sets = list(
    zip(
        ("camel", "jedit", "ant", "log4j", "lucene"),
        (camel, jedit, ant, log4j, lucene),
    )
)

def preporcess_data(path):
    """Load a dataset CSV and prepare it for modelling.

    The 'bug' column is passed through a default sklearn ``Binarizer`` so
    that it becomes a 0/1 label, and the 'name', 'version' and 'name.1'
    columns are dropped. The returned DataFrame still contains the 'bug'
    column; callers split features from the target themselves.
    """
    frame = pd.read_csv(path)
    # Binarizer works on 2-D input, hence the single-column reshape.
    bug_counts = frame['bug'].values.reshape(-1, 1)
    frame['bug'] = preprocessing.Binarizer().transform(bug_counts)
    return frame.drop(['name', 'version', 'name.1'], axis=1)

# 2 Data balancing

## SMOTE 
Use the Synthetic Minority Oversampling Technique (SMOTE) to balance the classes, and indicate the categorical columns.


In [141]:
def balance_data(data, random_state=None):
    """Oversample the minority class of ``data`` with SMOTE.

    Parameters:
        data: DataFrame holding the feature columns plus a 'bug' target
            column.
        random_state: optional seed forwarded to SMOTE so that the
            synthetic samples can be reproduced. The default (None) keeps
            the previous unseeded behaviour.

    Returns:
        (X_balanced, y_balanced) as produced by ``SMOTE.fit_resample``.
    """
    x = data.drop(['bug'], axis=1)
    y = data['bug']
    X_balanced, y_balanced = SMOTE(random_state=random_state).fit_resample(x, y)
    return X_balanced, y_balanced

# 3 Feature selection
In this task we aim to select the features that most influence the classification outcome. We aim to remove covariant features to reduce overfitting and the training cost.

## Univariate feature selection
*Univariate feature selection works by selecting the best features based on univariate statistical tests. SelectPercentile (a type of Univariate feature selection) removes all but a user-specified highest scoring percentage of features.* 

In [142]:
def select_fetaures(x, y):
    """Return the names of the columns kept by a chi2 SelectPercentile(80).

    Parameters:
        x: DataFrame of features (its column names are what is returned).
        y: target labels; a Series, 1-D array or single-column DataFrame.

    Returns:
        numpy array with the names of the selected columns of ``x``.
    """
    selection = SelectPercentile(chi2, percentile=80)
    # Only the fitted support mask is needed, so fit() is enough; the
    # transformed matrix was never used. y is flattened because the caller
    # passes a single-column DataFrame.
    selection.fit(x, np.ravel(y))
    columns = np.asarray(x.columns.values)
    return columns[np.asarray(selection.get_support())]

# 4 K-fold validation


In [143]:
# Number of splits used for the stratified k-fold cross-validation.
FOLDS = 10

def split_kfold_data(data):
    """Build the per-fold training and test sets used for cross-validation.

    ``data`` is split into FOLDS stratified folds on its 'bug' column. For
    every fold the training part is balanced with ``balance_data``, the
    feature subset is chosen with ``select_fetaures`` on that balanced
    training data, and the same columns are then taken from the test part
    (which is neither balanced nor used for selection).

    Returns a list with one tuple per fold, ordered as
    (x_test, y_test, x_train, y_train).
    """
    features = data.drop(['bug'], axis=1)
    labels = data.bug
    splitter = StratifiedKFold(n_splits=FOLDS)
    folds = []
    for train_idx, test_idx in splitter.split(features, labels):
        train_part = data.iloc[train_idx]
        test_part = data.iloc[test_idx]
        # Balance the training part only.
        train_x, train_y = balance_data(train_part)
        train_x = pd.DataFrame(train_x, columns=features.columns)
        train_y = pd.DataFrame(train_y, columns=['bug'])
        # Pick the features on the balanced training data and apply the
        # same column subset to the test part.
        chosen = select_fetaures(train_x, train_y)
        folds.append((
            test_part[chosen],
            test_part.bug,
            train_x[chosen],
            train_y.values.ravel(),
        ))
    return folds



 
def cross_validate(clf, train_and_test_folds):
    """Fit ``clf`` on every fold and return the mean Matthews correlation.

    Parameters:
        clf: classifier exposing ``fit`` and ``predict``; it is refitted
            on each fold's training data.
        train_and_test_folds: iterable of
            (x_test, y_test, x_train, y_train) tuples.

    Returns:
        The average matthews_corrcoef over the folds that were evaluated,
        or 0.0 when no folds are given.
    """
    fold_metrics = []
    for x_test_fold, y_test_fold, X_best_feature, y_best_feature in train_and_test_folds:
        clf.fit(X_best_feature, y_best_feature)
        pred = clf.predict(x_test_fold)
        fold_metrics.append(matthews_corrcoef(y_test_fold, pred))
    if not fold_metrics:
        return 0.0
    # Average over the folds actually scored rather than a global constant,
    # so the mean stays correct if the number of folds ever differs.
    return sum(fold_metrics) / len(fold_metrics)


In [144]:
def svm_optimization(params, train_and_test_folds):
    """Fitness function: score an SVC built from ``params``.

    ``params`` is indexed as [gamma, C]. The classifier is scored with
    ``cross_validate`` and the score is returned wrapped in a 1-tuple.
    """
    classifier = svm.SVC(gamma=params[0], C=params[1])
    return (cross_validate(classifier, train_and_test_folds),)

# 5 Genetic algorithm optimization
Here we use the DEAP framework to implement our evolutionary algorithm.
We create a function that builds an individual, a function that produces a population, and the evolution operators.

In [145]:
# Build one random chromosome [gamma, c]: gamma is drawn uniformly between
# gamma_min and gamma_max, c uniformly between c_min and c_max.
# icls is the container class the pair is wrapped in (evolution() passes
# the DEAP "Individual" class it creates).
def initInd(icls, gamma_min, gamma_max, c_min, c_max):
    genes = [
        random.uniform(gamma_min, gamma_max),
        random.uniform(c_min, c_max),
    ]
    return icls(genes)

# Search-space limits for the two SVM hyper-parameters being optimised.
GAMMA_MIN, GAMMA_MAX = 1e-6, 8
C_MIN, C_MAX = 0.01, 32000

# Decorator factory for variation operators: after the wrapped operator
# runs, gene 0 (gamma) and gene 1 (c) of every returned individual are
# pulled back inside their respective [min, max] range.
def checkBounds(gamma_min, gamma_max, c_min, c_max):
    # (low, high) limits, indexed by gene position.
    limits = ((gamma_min, gamma_max), (c_min, c_max))

    def decorator(func):
        def wrapper(*args, **kargs):
            offspring = func(*args, **kargs)
            for child in offspring:
                for gene, (low, high) in enumerate(limits):
                    if child[gene] > high:
                        child[gene] = high
                    elif child[gene] < low:
                        child[gene] = low
            return offspring
        return wrapper
    return decorator



# 6 Conduct the evolution optimization 

In [146]:
# Helper to pick the best individual recorded during the evolution process.
def select_best_from_hall_of_fame(hall_of_fame_lst):
    """Return the (individual, fitness) entry with the highest fitness.

    Parameters:
        hall_of_fame_lst: list of (individual, fitness) pairs.

    Returns:
        The pair with the largest fitness; on ties the earliest entry wins.
        For an empty list the placeholder (0, 0) is returned.
    """
    if not hall_of_fame_lst:
        return 0, 0
    # Compare against real entries instead of a 0 sentinel: fitness values
    # can be zero or negative, and a sentinel would then hide every entry.
    best, best_fitness = max(hall_of_fame_lst, key=lambda entry: entry[1])
    return best, best_fitness

# Silence numpy floating-point error reporting and all Python warnings for
# the rest of the run.
import warnings

np.seterr(all="ignore")
warnings.filterwarnings("ignore")

# Genetic algorithm settings.
CXPB = 0.5             # cross-over probability
MUTPB = 0.1            # mutation probability
NGEN = 300             # maximum number of generations
POPULATION_SIZE = 100  # individuals created for the initial population
MINIMUM_DELTA = 0.001  # smallest best-fitness change that keeps evolution going

def _mut_uniform_float(individual, low, up, indpb):
    """Re-draw each gene, with probability indpb, uniformly from [low[i], up[i]].

    The individual is modified in place and returned inside a 1-tuple, the
    same convention DEAP's built-in mutation operators follow.
    """
    for i, (lower, upper) in enumerate(zip(low, up)):
        if random.random() < indpb:
            individual[i] = random.uniform(lower, upper)
    return (individual,)


def evolution(train_and_test_folds):
    """Evolve [gamma, C] pairs for an SVC and return the best one found.

    Parameters:
        train_and_test_folds: per-fold data passed unchanged to
            ``svm_optimization`` whenever an individual is evaluated.

    Returns:
        (individual, fitness) for the best individual recorded in any
        generation, as chosen by ``select_best_from_hall_of_fame``.

    The loop runs for at most NGEN generations and stops early once more
    than 30 generations have passed and the best fitness changed by less
    than MINIMUM_DELTA between the last two generations.
    """
    # creator.create defines classes on the deap.creator module; create them
    # only once so that calling evolution() repeatedly does not redefine them.
    if not hasattr(creator, "FitnessMax"):
        # Maximisation problem (positive weight).
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        # Chromosome is an array of doubles ([gamma, c]) carrying a FitnessMax.
        creator.create("Individual", array.array, typecode="d", fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("individual", initInd, creator.Individual, GAMMA_MIN, GAMMA_MAX, C_MIN, C_MAX)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    # Single point cross-over operator.
    toolbox.register("mate", tools.cxOnePoint)
    # gamma and C are continuous, so mutate with a uniform *float* draw inside
    # their bounds; tools.mutUniformInt would only ever produce integers.
    toolbox.register("mutate", _mut_uniform_float,
                     low=[GAMMA_MIN, C_MIN], up=[GAMMA_MAX, C_MAX], indpb=MUTPB)
    # Parent selection using tournaments of size 3.
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("evaluate", svm_optimization, train_and_test_folds=train_and_test_folds)
    # Keep gamma and c inside their boundaries after every mutation.
    toolbox.decorate("mutate", checkBounds(GAMMA_MIN, GAMMA_MAX, C_MIN, C_MAX))

    pop = toolbox.population(n=POPULATION_SIZE)
    # Change of the best fitness between the two most recent generations.
    delta = 0
    current_best_fitness = 0
    # Best individual (and its fitness) of every generation.
    Fittest_hall_of_fame = []

    # Evaluate the entire initial population.
    for ind, fit in zip(pop, map(toolbox.evaluate, pop)):
        ind.fitness.values = fit

    for generation in range(NGEN):
        # Stop once the best fitness has stopped moving.
        if generation > 30 and delta < MINIMUM_DELTA:
            break

        # Select the next generation (constant population size) and clone it
        # so the variation operators never touch the parents.
        offspring = [toolbox.clone(ind) for ind in toolbox.select(pop, len(pop))]

        # Cross-over on consecutive pairs; mate() modifies the children in place.
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                # The chromosomes changed, so the stored fitness is stale.
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Re-evaluate only the individuals whose fitness was invalidated.
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        for ind, fit in zip(invalid_ind, map(toolbox.evaluate, invalid_ind)):
            ind.fitness.values = fit

        # Track the true best of this generation. selBest picks the highest
        # fitness (a size-3 tournament would pick among random candidates),
        # and its fitness is already stored, so no second cross-validation
        # run is needed.
        previous_best_fitness = current_best_fitness
        best = tools.selBest(offspring, 1)[0]
        current_best_fitness = best.fitness.values[0]
        delta = abs(current_best_fitness - previous_best_fitness)
        Fittest_hall_of_fame.append((toolbox.clone(best), current_best_fitness))

        # The population is entirely replaced by the offspring.
        pop[:] = offspring

    return select_best_from_hall_of_fame(Fittest_hall_of_fame)

            

SyntaxError: invalid syntax (<ipython-input-146-b6134d3adfa9>, line 11)

In [None]:
def evolution_helper(data_set):
    """Run the whole pipeline for one dataset.

    Parameters:
        data_set: (name, path) pair identifying the dataset CSV.

    Returns:
        (name, (best_individual, best_fitness)) where the second element is
        the result of ``evolution``; an array-based individual is returned
        as a plain list.
    """
    name, path = data_set
    data = preporcess_data(path)
    train_and_test_folds = split_kfold_data(data)
    # Baseline to improve on: the default SVC settings [gamma, c] = ['auto', 1.0].
    default_svm_fitness, = svm_optimization(['auto', 1.0], train_and_test_folds)
    print(name + " default_svm_fitness", default_svm_fitness)
    # Run the (expensive) evolution exactly once and reuse its result.
    best_individual, best_fitness = evolution(train_and_test_folds)
    # Hand back a plain list so the result does not depend on DEAP's
    # dynamically created classes when it is pickled to another process.
    if isinstance(best_individual, array.array):
        best_individual = list(best_individual)
    return name, (best_individual, best_fitness)


# Parallel evolution 

In [None]:
# Run evolution_helper for every dataset in a pool of 5 worker processes.
# For a single-dataset run without the pool, call
# evolution_helper(('camel', camel)) directly instead.
if __name__ == "__main__":
    # The context manager shuts the workers down once map() has returned,
    # so the pool is not left open.
    with Pool(5) as p:
        print(p.map(evolution_helper, data_sets))