# Wykorzystanie biblioteki DEAP w problemie optymalizacji parametrów klasyfikatorów oraz selekcji cech

### I Optymalizacja parametrów klasyfikatorów

In [1]:
# data manipulation tools
import pandas as pd
import numpy as np

# Random Forest model
from sklearn.ensemble import RandomForestRegressor

# Linear Regression model
from sklearn.linear_model import LinearRegression

# Epsilon-Support Vector Regression
from sklearn import svm 

# model selection
from sklearn.model_selection import cross_val_score # perform cross-validation for estimator evaluation
from sklearn.model_selection import cross_val_predict # generate cross-validated estimates for each input
from sklearn.model_selection import train_test_split # splits arrays/matrices into random train and test subsets
from sklearn.model_selection import GridSearchCV # determines estimator paremeters values
from sklearn.model_selection import StratifiedKFold # stratified k-fold cross-validator
from sklearn.model_selection import KFold # k-fold cross-validator

# model evaluation
from sklearn.metrics import mean_absolute_error, median_absolute_error

# feature selection based on weights importance
from sklearn.feature_selection import SelectFromModel

# data preprocessing
from sklearn.preprocessing import MinMaxScaler

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# rest
import random


In [2]:
# loading data
import pandas as pd

# Restore `predictors_x` and `outcomes_y` from IPython's %store database.
# They must have been persisted earlier with `%store` (presumably by a separate
# data-preparation notebook -- verify); this notebook never creates them itself.
%store -r predictors_x
%store -r outcomes_y

# Preview the first rows of the feature frame.
predictors_x.head()

Unnamed: 0_level_0,landAvg_1_PriorYear,landAvg_2_PriorYear,landAvg_3_PriorYear,landAvg_4_PriorYear,landMax_2_PriorYear,landMax_5_PriorYear,landMin_4_PriorYear,land&OceanAvg_1_PriorYear,land&OceanAvg_2_PriorYear,oceanAvg_1_PriorYear,oceanAvg_2_PriorYear,oceanAvg_5_PriorYear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1850-01-01,-1.344498,-1.343748,-1.342076,-1.341202,-1.404913,-1.403688,-1.277205,-1.177569,-1.176354,1.528022,1.52898,1.529537
1851-01-01,-1.344498,-1.343748,-1.342076,-1.341202,-1.404913,-1.403688,-1.277205,-1.177569,-1.176354,1.528022,1.52898,1.529537
1852-01-01,-1.439527,-1.343748,-1.342076,-1.341202,-1.404913,-1.403688,-1.277205,-1.543883,-1.176354,1.233484,1.52898,1.529537
1853-01-01,-1.452432,-1.438803,-1.342076,-1.341202,-1.25094,-1.403688,-1.277205,-1.554889,-1.543128,1.248574,1.234485,1.529537
1854-01-01,-1.59087,-1.451711,-1.437103,-1.341202,-1.361717,-1.403688,-1.277205,-1.624064,-1.554147,1.479957,1.249573,1.529537


In [3]:
# Preview the first rows of the target frame.
outcomes_y.head()

Unnamed: 0_level_0,landAvg
date,Unnamed: 1_level_1
1850-01-01,-1.834912
1851-01-01,-1.440372
1852-01-01,-1.453273
1853-01-01,-1.591667
1854-01-01,-1.710123


In [4]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(predictors_x, outcomes_y, test_size=0.1, random_state=0)

In [5]:
# Sanity check: shapes of the training features and training targets.
x_train.shape, y_train.shape

((1792, 12), (1792, 1))

In [6]:
# Sanity check: shapes of the held-out test features and test targets.
x_test.shape, y_test.shape

((200, 12), (200, 1))

In [7]:
# Number of predictor columns in the training frame; read later as a global
# when the DEAP toolbox is configured.
numberOfAtributtes = x_train.shape[1]
print(numberOfAtributtes)

12


In [8]:
# Baseline regression: SVR with fixed parameters (C=1.0, epsilon=0.2), fitted once on the training split and scored on the held-out test split using all 12 features
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Flatten the single-column target frame to a 1-D array for the estimator.
# `y` and `df` are also read later as globals when the DEAP toolbox is configured.
y = y_train.values.ravel()
df = df_norm = x_train.values

# The scaler lives inside the pipeline, so it is fitted on the training data only.
pipe = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
pipe.fit(df, y)

# `score` returns a single scalar (R^2 for regressors), so there is nothing to
# average; calling `.mean()` on it fails when the scalar is a plain Python float.
scores = pipe.score(x_test.values, y_test.values.ravel())
print(scores)

0.9917577580935905


In [9]:
# Generating a new individual
import random
def parametersSVR(numberFeatures,icls):
    """Draw a random SVR hyper-parameter genome and wrap it in ``icls``.

    Gene order: kernel name, C, degree, gamma, coef0.

    Args:
        numberFeatures: accepted for signature compatibility; not used here.
        icls: callable that receives the genome list and returns the individual.

    Returns:
        Whatever ``icls`` returns for the generated five-gene list.
    """
    kernels = ["linear","rbf", "poly","sigmoid"]
    kernel_gene = kernels[random.randint(0, 3)]

    # (low, high) sampling bounds, listed in gene order: C, degree, gamma, coef0
    bounds = ((0.1, 100), (0.1, 5), (0.001, 5), (0.01, 10))
    numeric_genes = [random.uniform(low, high) for low, high in bounds]

    return icls([kernel_gene] + numeric_genes)

In [10]:
# Mutation
def mutationSVR(individual):
    """Mutate one randomly chosen gene of an SVR genome in place.

    Gene layout: 0 = kernel, 1 = C, 2 = degree, 3 = gamma, 4 = coef0.
    A gene index is drawn uniformly over the whole individual; indices
    above 4 (if the individual is longer) are left unchanged.

    Args:
        individual: mutable sequence holding the genome; modified in place.

    Returns:
        A one-element tuple ``(individual,)``, the shape DEAP mutation
        operators return. Callers that ignore the return value are unaffected.
    """
    numberParamer = random.randint(0, len(individual) - 1)
    if numberParamer == 0:
        # kernel
        listKernel = ["linear", "rbf", "poly", "sigmoid"]
        individual[0] = listKernel[random.randint(0, 3)]
    elif numberParamer == 1:
        # C
        individual[1] = random.uniform(0.1, 100)
    elif numberParamer == 2:
        # degree
        individual[2] = random.uniform(0.1, 5)
    elif numberParamer == 3:
        # gamma
        individual[3] = random.uniform(0.01, 5)
    elif numberParamer == 4:
        # coef0 belongs in slot 4; slot 2 holds the degree and must not be touched here
        individual[4] = random.uniform(0.1, 20)
    return individual,

In [11]:
# Fitness function
import math
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import SVR

def parametersFitnessSVR(y,df,numberOfAtributtes,individual):
    """Fitness of an SVR genome: mean score over a 5-fold cross-validation.

    Args:
        y: 1-D array of targets, indexable by the fold index arrays.
        df: 2-D feature array, indexable by the fold index arrays.
        numberOfAtributtes: accepted for signature compatibility; not used here.
        individual: genome ``[kernel, C, degree, gamma, coef0]``.

    Returns:
        One-element tuple with the mean of ``estimator.score`` over the folds.
    """
    split = 5
    kf = KFold(n_splits=split)

    # The genome stores degree as a float, but SVR expects an integer polynomial
    # degree, so it is truncated here.
    estimator = SVR(kernel=individual[0], C=individual[1], degree=int(individual[2]),
                    gamma=individual[3], coef0=individual[4])

    resultSum = 0.0
    for train, test in kf.split(df):
        estimator.fit(df[train], y[train])
        # Accumulate across folds; plain assignment would keep only the last
        # fold's score and the final division would shrink it by a factor of 5.
        resultSum += estimator.score(df[test], y[test])
    return resultSum / split,

In [12]:
from deap import base, creator, tools
import random
from math import sin
import matplotlib.pyplot as plt
from timeit import default_timer as timer

In [13]:
# Genetic-algorithm settings, read as globals by the helper functions in this notebook.
sizePopulation = 100  # number of individuals created by initPopulation
probabilityMutation = 0.2  # per-individual chance that mutate() applies toolbox.mutate
probabilityCrossover = 0.8  # per-pair chance that crossover() applies toolbox.mate
numberIteration = 100  # number of generations run by startOptimizationLoop
numberElitism = 1  # number of entries returned by getBestsForElitism

In [14]:
def configureDeap(fitness, parameters, mutation,
                  selection_param={'function':tools.selTournament, 'tournsize':3},
                  mate_params={'function':tools.cxTwoPoint},):
    """Build a DEAP toolbox for a single-objective maximisation run.

    Reads the globals ``y``, ``df`` and ``numberOfAtributtes``, which are bound
    into the registered ``evaluate`` and ``individual`` operators.

    Args:
        fitness: callable ``fitness(y, df, numberOfAtributtes, individual)``.
        parameters: generator ``parameters(numberOfAtributtes, icls)`` producing one individual.
        mutation: callable registered as-is under ``mutate``.
        selection_param: keyword arguments for ``toolbox.register('select', ...)``;
            must contain ``'function'``. The default dict is only unpacked, never mutated.
        mate_params: keyword arguments for ``toolbox.register('mate', ...)``;
            must contain ``'function'``. The default dict is only unpacked, never mutated.

    Returns:
        The configured ``base.Toolbox``.
    """
    # This function runs once per scenario. Re-creating an already existing
    # creator class overwrites it and emits a RuntimeWarning, so create each
    # class only the first time.
    if not hasattr(creator, 'FitnessMax'):
        creator.create('FitnessMax', base.Fitness, weights=(1.0,))
    if not hasattr(creator, 'Individual'):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register('individual', parameters, numberOfAtributtes, creator.Individual)
    toolbox.register('evaluate', fitness, y, df, numberOfAtributtes)
    toolbox.register('population', tools.initRepeat, list, toolbox.individual)
    toolbox.register('select', **selection_param)
    toolbox.register('mate', **mate_params)
    toolbox.register('mutate', mutation)

    return toolbox

In [15]:
def mutate(toolbox, offspring):
    """Apply ``toolbox.mutate`` to each individual with probability ``probabilityMutation``.

    Mutated individuals have their fitness values deleted so that they are
    scored again later. The same ``offspring`` list is returned.
    """
    for candidate in offspring:
        if not random.random() < probabilityMutation:
            continue  # this individual is left untouched
        toolbox.mutate(candidate)
        # the stored fitness no longer matches the changed genome
        del candidate.fitness.values
    return offspring

In [16]:
def setIndividualFitnessValue(individuals, fitnesses):
    """Store each fitness on the matching individual.

    Args:
        individuals: sequence of individuals exposing a ``fitness`` attribute.
        fitnesses: one fitness per individual, in the same order; each is the
            tuple returned by the evaluation function.

    Returns:
        The same ``individuals`` object, for chaining.
    """
    for ind, fit in zip(individuals, fitnesses):
        # The attribute DEAP reads is `values` (plural). Writing to `value`
        # only creates an unrelated attribute and leaves the fitness unset.
        ind.fitness.values = fit
    return individuals

In [17]:
def initPopulation(toolbox):
    """Create ``sizePopulation`` individuals, evaluate them and attach the results.

    Returns the population as handed back by ``setIndividualFitnessValue``.
    """
    individuals = toolbox.population(n=sizePopulation)
    scores = [toolbox.evaluate(member) for member in individuals]
    return setIndividualFitnessValue(individuals, scores)

In [18]:
def crossover(toolbox, offspring):
    """Mate neighbouring individuals with probability ``probabilityCrossover``.

    Mated individuals have their fitness values deleted so that they are
    scored again later. The same ``offspring`` list is returned.
    """
    # offspring[::2]  -> positions 0, 2, 4, ...
    # offspring[1::2] -> positions 1, 3, 5, ...
    # zip stops at the shorter slice, so a trailing unpaired individual is skipped
    pairs = zip(offspring[::2], offspring[1::2])
    for first, second in pairs:
        if not random.random() < probabilityCrossover:
            continue  # this pair is left as it is
        toolbox.mate(first, second)
        del first.fitness.values
        del second.fitness.values
    return offspring

In [19]:
def evaluate(toolbox, offspring, verbose=0):
    """Score every individual whose fitness is not valid.

    Individuals with a valid fitness are left alone. When ``verbose`` is
    above 3 the number of scored individuals is printed. The same
    ``offspring`` object is returned.
    """
    pending = [cand for cand in offspring if not cand.fitness.valid]
    for cand in pending:
        cand.fitness.values = toolbox.evaluate(cand)

    if verbose>3:
        print('Ewaluated %i individuals'%len(pending))
    return offspring

In [20]:
def getBestsForElitism(population):
    """Return the ``numberElitism`` best individuals of ``population``.

    The returned list holds references to the individuals in ``population``,
    not copies; callers that go on to modify the population should clone them.

    Args:
        population: sequence of individuals with a ``fitness`` attribute.

    Returns:
        List of at most ``numberElitism`` individuals, best first; empty when
        ``numberElitism`` is zero or negative.
    """
    # Ask selBest for the top-k in one call. Calling selBest(population, 1) in
    # a loop would return the same single best individual k times.
    # Clamp at zero so a negative setting cannot turn into a "[:-k]" slice.
    return tools.selBest(population, max(numberElitism, 0))

In [21]:
def calculateStatistics(population,verbose=0):
    """Return ``(mean, std)`` of the first fitness value across ``population``.

    The standard deviation is computed as sqrt(|E[x^2] - E[x]^2|). When
    ``verbose`` is above 2 the min, max, mean and std are printed as well.
    """
    scores = [member.fitness.values[0] for member in population]
    count = len(population)
    mean = sum(scores) / count
    squares_total = sum(score * score for score in scores)
    # abs() guards against a tiny negative difference caused by rounding
    std = abs(squares_total / count - mean ** 2) ** 0.5
    if verbose > 2:
        print("  Min %s" % min(scores))
        print("  Max %s" % max(scores))
        print("  Avg %s" % mean)
        print("  Std %s" % std)
    return (mean, std)

In [22]:
def pickBestIndividual(population, verbose=0):
    """Return the individual that ``tools.selBest`` ranks first in ``population``.

    When ``verbose`` is above 1 the individual and its fitness values are printed.
    """
    champion = tools.selBest(population, 1)[0]
    if verbose>1:
        print("Best individual is %s, %s" % (champion, champion.fitness.values))
    return champion

In [23]:
def plotSimple(y_vals=[], label_y='y', label_x='x', title='plot', instance_info='no_info', save_location='./plots/'):
    """Show a line plot of ``y_vals`` against their positions 0..len-1.

    ``instance_info`` and ``save_location`` only feed the output path of the
    savefig call, which is currently commented out.
    """
    # Output path for the disabled savefig call below
    filename = save_location + title + '_' + instance_info + '.png'
    positions = list(range(len(y_vals)))
    plt.plot(positions, y_vals)
    plt.title(title)
    plt.xlabel(label_x)
    plt.ylabel(label_y)
    # plt.savefig(filename)
    plt.show()
    plt.close()

In [24]:
def plotResults(stats):
    """Plot the four per-generation series stored in ``stats``.

    Each ``stats`` entry is read as ``(best_individual, mean, std, duration)``.
    One figure is shown per series, in that order.
    """
    # (value extractor, y-axis label, plot title) for every series
    series = (
        (lambda record: record[0].fitness.values[0], 'best', 'fitness(iteration)'),
        (lambda record: record[1], 'mean', 'mean(iteration)'),
        (lambda record: record[2], 'std', 'std(iteration)'),
        (lambda record: record[3], 'duration', 'epoch_duration(iteration)'),
    )
    for extract, label, title in series:
        plotSimple([extract(record) for record in stats], label, 'epoch', title)

In [25]:
def startOptimizationLoop(toolbox, population, elitism=False, verbose=0):
    """Run the generational loop for ``numberIteration`` generations.

    Each generation: select, clone, crossover, mutate, evaluate, then replace
    ``population`` in place with the new generation.

    Args:
        toolbox: DEAP toolbox providing ``select`` and ``clone`` (and the
            operators used by ``crossover``, ``mutate`` and ``evaluate``).
        population: list of individuals; its contents are replaced in place.
        elitism: when true, the individuals returned by ``getBestsForElitism``
            are carried over unchanged into the next generation.
        verbose: verbosity level forwarded to the helper functions.

    Returns:
        List with one ``(best_individual, mean, std, duration)`` tuple per generation.
    """
    stats = []
    for g in range(1, numberIteration + 1):
        if verbose>0:
            print('-- Generation %i --' % g)
        start = timer()

        # Take the elite from the current population and clone them BEFORE any
        # variation. Crossover and mutation work in place, so un-cloned
        # references would be altered and the elite would be lost.
        elite = [toolbox.clone(ind) for ind in getBestsForElitism(population)] if elitism else []

        # Breed fewer offspring when elite are carried over, so the population
        # size stays constant instead of growing every generation.
        offspring = toolbox.select(population, max(len(population) - len(elite), 0))
        offspring = list(map(toolbox.clone, offspring))

        offspring = crossover(toolbox, offspring)
        offspring = mutate(toolbox, offspring)
        # Elite go through evaluate as well: it only scores individuals whose
        # fitness is invalid, so already scored elite cost nothing.
        next_generation = evaluate(toolbox, offspring + elite, verbose)
        duration = timer()-start

        population[:] = next_generation

        mean, std = calculateStatistics(population, verbose)
        best_ind = pickBestIndividual(population, verbose)
        stats.append((best_ind, mean, std, duration))

    if verbose>-1:
        best_overall = pickBestIndividual(population, verbose)
        print('Best found individual: (x1,x2) =', best_overall, ', y =', best_overall.fitness.values[0])
    return stats

In [26]:
def run(fitness, parameters, mutation, selection, mate, elitism, verbose):
    """Configure DEAP, build the initial population, run the loop and plot the results.

    ``selection`` and ``mate`` are passed to ``configureDeap`` as its
    ``selection_param`` and ``mate_params`` arguments.
    """
    toolbox = configureDeap(fitness, parameters, mutation, selection, mate)
    history = startOptimizationLoop(toolbox, initPopulation(toolbox), elitism, verbose)
    plotResults(history)

In [None]:
# scenario 0 (tournament selection, cxTwoPoint crossover, elitism disabled):
selection={'function':tools.selTournament, 'tournsize':3} 
mate={'function':tools.cxTwoPoint}

# SVR hyper-parameter search: genome generator, fitness function and mutation operator
parameters=parametersSVR
fitness=parametersFitnessSVR
mutation=mutationSVR
elitism = False
verbose = 7
run(fitness, parameters, mutation, selection , mate , elitism, verbose)

In [None]:
# Re-score one hand-picked genome (kernel, C, degree, gamma, coef0) with the
# SVR fitness function defined in this notebook.
ind = ['poly', 0.3690320297276768, 1.9084110197644817, 0.1053757953826651, 8.515094980694283]
print(parametersFitnessSVR(y,df,numberOfAtributtes,ind))


### II Selekcja cech

In [None]:
def SVCParametersFeatureFitness(y,df,numberOfAtributtes,individual):
    """Fitness of an SVC genome with feature-selection flags: mean CV accuracy.

    Args:
        y: class labels, indexable by the fold index arrays.
        df: feature table supporting ``.columns`` and ``.drop`` (a pandas DataFrame).
        numberOfAtributtes: index of the first feature flag in ``individual``.
        individual: genome; slots 0-4 are kernel, C, degree, gamma, coef0 and
            slots from ``numberOfAtributtes`` onwards are 0/1 feature flags.

    Returns:
        One-element tuple with the accuracy averaged over 5 stratified folds.
    """
    # SVC is not imported anywhere else in this notebook, so import it here.
    from sklearn.svm import SVC

    split=5
    cv = StratifiedKFold(n_splits=split)

    # Columns to remove: a flag equal to 0 drops the feature at position
    # (i - numberOfAtributtes).
    # NOTE(review): the mutation operator flips flags starting at genome index 5,
    # while this loop starts at `numberOfAtributtes` -- confirm the intended
    # genome layout against the individual generator.
    listColumnsToDrop = [i - numberOfAtributtes
                         for i in range(numberOfAtributtes, len(individual))
                         if individual[i] == 0]

    dfSelectedFeatures = df.drop(df.columns[listColumnsToDrop], axis=1)

    mms = MinMaxScaler()
    df_norm = mms.fit_transform(dfSelectedFeatures)
    # The genome stores degree as a float, but SVC expects an integer degree.
    estimator = SVC(kernel=individual[0],C=individual[1],degree=int(individual[2]),gamma=individual[3],coef0=individual[4],random_state=101)
    resultSum = 0
    for train, test in cv.split(df_norm, y):
        estimator.fit(df_norm[train], y[train])
        predicted = estimator.predict(df_norm[test])
        expected = y[test]
        # Accuracy = correct predictions / all predictions. accuracy_score gives
        # the same ratio as (tp + tn) / (tp + fp + tn + fn) and also works when
        # the confusion matrix is not 2x2.
        resultSum += metrics.accuracy_score(expected, predicted)

    # average over the cross-validation folds
    return resultSum / split,

In [None]:
def mutationFeatureSVC(individual):
    """Mutate one randomly chosen gene of a genome with feature flags, in place.

    Gene layout: 0 = kernel, 1 = C, 2 = degree, 3 = gamma, 4 = coef0; every
    later slot is treated as a 0/1 feature-selection flag and is flipped.

    Args:
        individual: mutable sequence holding the genome; modified in place.

    Returns:
        A one-element tuple ``(individual,)``, the shape DEAP mutation
        operators return. Callers that ignore the return value are unaffected.
    """
    numberParamer = random.randint(0, len(individual) - 1)
    if numberParamer == 0:
        # kernel
        listKernel = ["linear", "rbf", "poly", "sigmoid"]
        individual[0] = listKernel[random.randint(0, 3)]
    elif numberParamer == 1:
        # C
        individual[1] = random.uniform(0.1, 100)
    elif numberParamer == 2:
        # degree
        individual[2] = random.uniform(0.1, 5)
    elif numberParamer == 3:
        # gamma
        individual[3] = random.uniform(0.01, 1)
    elif numberParamer == 4:
        # coef0 belongs in slot 4; slot 2 holds the degree and must not be touched here
        individual[4] = random.uniform(0.1, 1)
    else:
        # genetic feature selection: flip the 0/1 flag
        if individual[numberParamer] == 0:
            individual[numberParamer] = 1
        else:
            individual[numberParamer] = 0
    return individual,


In [None]:
# scenario 0 (tournament selection, custom mutationFeatureSVC mutation, cxTwoPoint crossover, elitism disabled):
# NOTE(review): this cell cannot run as written --
#   * run() is declared as run(fitness, parameters, mutation, selection, mate, elitism, verbose);
#     the call below passes only six arguments, in a different order, and no `parameters` generator.
#   * configureDeap() registers `mutation` directly as the mutate operator, so it needs a callable,
#     whereas `mutation` below is a dict.
selection={'function':tools.selTournament, 'tournsize':3} 
mutation={'function':mutationFeatureSVC}
# Alternative mutation operator, currently disabled:
# mutation={'function':tools.mutGaussian, 'mu':0.0, 'sigma':0.2, 'indpb':0.2}
mate={'function':tools.cxTwoPoint}
fitness=SVCParametersFeatureFitness
elitism = False
verbose = 0
run(selection, mutation, mate, fitness, elitism, verbose)