In [1]:
import random
import pandas as pd

from deap import base
from deap import creator
from deap import tools

In [2]:
from sklearn.preprocessing import LabelEncoder

def convert2binary(number, bitlength=0):
    """Return the binary digits of `number` as a list of one-character strings.

    The digits are ordered most-significant first and left-padded with '0'
    up to `bitlength` characters. With the default `bitlength=0` no padding
    is applied; a number needing more than `bitlength` digits is not truncated.
    """
    spec = "0{}b".format(bitlength)
    digits = format(number, spec)
    return list(digits)

def encode_to_binary_string(sourcedf, column, targetdf):
    """Write a fixed-width binary encoding of `sourcedf[column]` into `targetdf`.

    The column values are label-encoded to integers with LabelEncoder. Every
    label is then written as a binary string padded to the number of digits
    needed by the largest label. One new column per bit position is stored in
    `targetdf`, named '<column><position>' with position 0 holding the most
    significant bit. The stored cell values are the characters '0' / '1'.

    `targetdf` is modified in place; nothing is returned.
    """
    labels = LabelEncoder().fit_transform(sourcedf[column])

    # Width of the widest label decides how many bit columns are created.
    width = len(convert2binary(max(labels)))
    bit_rows = [convert2binary(label, width) for label in labels]

    # Transpose the per-row bit lists into one sequence per bit position.
    for position, bit_column in enumerate(zip(*bit_rows)):
        targetdf[str.format('{0}{1}', column, position)] = list(bit_column)

In [4]:
# Load the original car table and discard the three columns that are not
# used in the rest of the notebook.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
originaldata = pd.read_pickle('../datasets/car_original_dataset.pkl').drop(
    columns=['imagepath', 'DriveTrain', 'AirBags']
)

In [6]:
# Load the feature table that the evolved queries are evaluated against.
# NOTE(review): presumably the normalised, dummy-encoded version of the car
# data — confirm against the script that produced the pickle.
inputdata = pd.read_pickle('../datasets/dummy_cartable.pkl')

In [7]:
# Remove, in place, every column whose name mentions one of the categorical
# attributes below.
inputdata.drop(
    columns=[
        name
        for name in inputdata.columns
        if any(key in name for key in ('manufacturer', 'type', 'airbags', 'drivetrain'))
    ],
    inplace=True,
)

In [8]:
# Binary-encode every object-dtype column of originaldata, except 'make',
# and append the resulting bit columns to inputdata.
for col in originaldata.columns:
    if col == 'make' or originaldata[col].dtype != object:
        continue
    encode_to_binary_string(originaldata, col, inputdata)

In [9]:
# Convert every column to a numeric dtype: the bit columns written by
# encode_to_binary_string hold the characters '0' / '1' up to this point.
inputdata = inputdata.apply(pd.to_numeric)
# Show the first rows as a quick sanity check of the final table.
inputdata.head()

Unnamed: 0,price,mpg,num_of_cylinders,horsepower,fuel_tank_capacity,rpm,weight,automatic_gearbox,passenger_capacity,width,luggage_capacity,origin,manufacturer0,manufacturer1,manufacturer2,manufacturer3,manufacturer4,type0,type1,type2
1,0.256866,0.583333,0.5,0.466667,0.488889,0.969231,0.658952,0.0,0.625,0.871795,0.5,1.0,0,0,0,0,0,0,1,1
2,0.547658,0.447917,0.75,0.666667,0.666667,0.846154,0.867235,0.0,0.625,0.910256,0.681818,1.0,0,0,0,0,0,0,1,0
3,0.470113,0.479167,0.75,0.573333,0.625926,0.846154,0.822168,0.0,0.625,0.858974,0.636364,1.0,0,0,0,0,1,0,0,0
4,0.609047,0.46875,0.75,0.573333,0.781481,0.846154,0.829476,0.0,0.75,0.897436,0.772727,1.0,0,0,0,0,1,0,1,0
5,0.484653,0.541667,0.5,0.693333,0.781481,0.876923,0.886724,0.0,0.5,0.884615,0.590909,1.0,0,0,0,1,0,0,1,0


In [10]:
def _column_span(keyword):
    """Return (lowest, highest) position of the inputdata columns whose name contains `keyword`."""
    positions = [pos for pos, name in enumerate(inputdata.columns) if keyword in name]
    return (min(positions), max(positions))


# Positional range covered by the manufacturer bit columns.
MANUFACTURER_COLS = _column_span('manufacturer')

# Positional range covered by the type bit columns.
TYPE_COLS = _column_span('type')

In [11]:
# Index labels of the example rows the evolved query should return:
# every 'Sporty' car whose Origin is 0.
# NOTE(review): the trailing "[13, 33]" looks like a leftover hand-picked
# alternative — confirm or remove.
example_indices = originaldata.query("type == 'Sporty' and Origin == 0").index #[13, 33]

In [12]:
# Display the rows of originaldata that were selected as examples.
originaldata.loc[example_indices]

Unnamed: 0,make,manufacturer,type,price,mpg,num_of_cylinders,horsepower,fuel_tank_capacity,RPM,Wheelbase,Rear.seat.room,Weight,automatic_gearbox,passenger_capacity,length,width,luggage_capacity,Origin
14,Chevrolet Camaro,Chevrolet,Sporty,7550.0,23.5,6,160,15.5,4600,101,25.0,3240,0,4,193,74,13.0,0
19,Chevrolet Corvette,Chevrolet,Sporty,19000.0,21.0,8,300,20.0,5000,96,30.901059,3380,0,2,179,74,17.203669,0
28,Dodge Stealth,Dodge,Sporty,12900.0,21.0,6,300,19.8,6000,97,20.0,3805,0,4,180,72,11.0,0
34,Ford Mustang,Ford,Sporty,7950.0,25.5,4,105,15.4,4600,101,24.0,2850,0,4,180,68,12.0,0
35,Ford Probe,Ford,Sporty,7000.0,27.0,4,115,15.5,5500,103,23.0,2710,0,4,179,70,18.0,0
60,Mercury Capri,Mercury,Sporty,7050.0,24.5,4,100,11.1,5750,95,19.0,2450,0,4,166,65,6.0,0
72,Plymouth Laser,Plymouth,Sporty,7200.0,26.5,4,92,15.9,5000,97,24.5,2640,0,4,173,67,8.0,0
75,Pontiac Firebird,Pontiac,Sporty,8850.0,23.5,6,160,15.5,4600,101,25.0,3240,0,4,196,75,13.0,0


In [13]:
# The same example rows taken from the encoded table.
# NOTE(review): assumes inputdata and originaldata share index labels, and
# `results` is not referenced again in the visible cells — confirm it is needed.
results = inputdata.loc[example_indices]

In [14]:
# Reference individual used by evalIndividual: one gene per inputdata
# column, -1 everywhere except four hand-set positions.
# NOTE(review): with the column order shown by inputdata.head(), position -9
# is 'origin' and the last three positions are type0..type2 — confirm.
solution = [-1] * inputdata.shape[1]

solution[-9] = 0
solution[-3:] = [1, 0, 0]

print(solution)

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, 1, 0, 0]


In [15]:
# Single-objective fitness class; the positive weight makes DEAP maximise it.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a plain list of genes with a FitnessMax fitness attribute.
creator.create("Individual", list, fitness=creator.FitnessMax)

# Registry that holds the generators and genetic operators registered below.
toolbox = base.Toolbox()

In [16]:
# Attribute generator: each gene is an integer drawn from {-1, 0, 1}.
# evalQuery skips a column whose gene is -1; 0 and 1 become the threshold of
# a "<column> <= gene" predicate. Despite its name, attr_bool is therefore
# three-valued.
toolbox.register("attr_bool", random.randint, -1, 1)

In [17]:
# Structure initializer: an 'individual' is a creator.Individual filled with
# one attr_bool gene per column of inputdata.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, inputdata.shape[1])

In [18]:
# A population is a plain list of individuals; its size is supplied at call
# time, e.g. toolbox.population(n=200).
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [19]:
# the goal ('fitness') function to be maximized
# the goal ('fitness') function to be maximized
def evalIndividual(individual):
    """Count the genes of `individual` that equal the reference `solution`.

    Returns a one-element tuple, as DEAP expects for fitness values. An
    individual whose length differs from `solution` scores 0.
    """
    if len(individual) == len(solution):
        matches = sum(1 for gene, target in zip(individual, solution) if gene == target)
        return (matches,)
    return (0,)
    
def evalQuery(individual, verbose=False):
    """Score an individual by the rows its decoded query selects from inputdata.

    Gene i belongs to column i of inputdata. A gene <= -1 leaves the column
    unconstrained; any other value v contributes the predicate "<col> <= v".
    The predicates are joined with "and" and run through inputdata.query.

    Parameters
    ----------
    individual : sequence of int
        One gene per column of inputdata.
    verbose : bool, optional
        When True, print the query string, the number of rows returned and
        their index labels (plus the error, if the query failed).

    Returns
    -------
    tuple
        One-element tuple holding (number of example rows selected) minus
        (number of non-example rows selected). An individual without any
        predicate, or whose query fails, selects nothing and scores 0.
    """
    predicates = [
        str.format('{0} <= {1}', col, individual[pos])
        for pos, col in enumerate(inputdata.columns)
        if individual[pos] > -1
    ]
    query = ' and '.join(predicates)

    tuples = []
    output = set()
    if predicates:
        # pandas rejects an empty expression, so the all-wildcard individual
        # is handled by the guard above instead of by catching that error.
        try:
            tuples = inputdata.query(query)
            output = set(tuples.index)
        except Exception as err:
            # Best effort: a query pandas cannot evaluate counts as selecting
            # no rows. Exception (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working during long runs.
            tuples = []
            output = set()
            if verbose:
                print('Query failed: ', err)

    if verbose:
        print('Query: ', query)
        print('Num. Results: ', len(tuples))
        print('Tuple ids: ', output)

    wanted = set(example_indices)
    return len(wanted & output) - len(output - wanted),

In [20]:
# Register the goal / fitness function. evalQuery (not evalIndividual) is the
# function the evolution loop calls through toolbox.evaluate.
toolbox.register("evaluate", evalQuery)

In [21]:
# Register the crossover operator: two-point crossover, which modifies the
# two individuals passed to it in place.
toolbox.register("mate", tools.cxTwoPoint)

In [22]:
# Register the mutation operator: each gene is independently replaced, with
# probability indpb=0.3, by an integer drawn uniformly from [low, up] = [-1, 1].
toolbox.register("mutate", tools.mutUniformInt, indpb=0.3, low=-1, up=1)

In [23]:
# Selection operator for breeding the next generation: every slot of the new
# generation is filled with the fittest of three individuals (tournsize=3)
# drawn at random from the current generation.
toolbox.register("select", tools.selTournament, tournsize=3)

In [None]:
# Fix the seed of the random module so repeated runs evolve identically.
random.seed(64)

pop = toolbox.population(n=200)

# CXPB  is the probability with which two individuals
#       are crossed
#
# MUTPB is the probability for mutating an individual
CXPB, MUTPB = 0.8, 0.2

# Hard limit on the number of generations.
MAX_GENERATIONS = 100

# Highest fitness evalQuery (the registered evaluate function) can return:
# the query selects every example row and nothing else. Using
# inputdata.shape[1] here would be the optimum of evalIndividual instead,
# which evalQuery can never reach, so the loop would never stop early and a
# perfect individual could be lost again (the population is fully replaced
# every generation).
TARGET_FITNESS = len(set(example_indices))

print("Start of evolution")

# Evaluate the entire population
fitnesses = list(map(toolbox.evaluate, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

print("  Evaluated %i individuals" % len(pop))

# Extract the fitness of every individual in the population
fits = [ind.fitness.values[0] for ind in pop]

# Variable keeping track of the number of generations
g = 0

# Begin the evolution
while max(fits) < TARGET_FITNESS and g < MAX_GENERATIONS:
    # A new generation
    g = g + 1
    print("-- Generation %i --" % g)

    # Select the next generation individuals
    offspring = toolbox.select(pop, len(pop))
    # Clone the selected individuals so the operators below do not modify
    # members of the current population
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover and mutation on the offspring
    for child1, child2 in zip(offspring[::2], offspring[1::2]):

        # cross two individuals with probability CXPB
        if random.random() < CXPB:
            toolbox.mate(child1, child2)

            # fitness values of the children
            # must be recalculated later
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:

        # mutate an individual with probability MUTPB
        if random.random() < MUTPB:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    print("  Evaluated %i individuals" % len(invalid_ind))

    # The population is entirely replaced by the offspring
    pop[:] = offspring

    # Gather all the fitnesses in one list and print the stats
    fits = [ind.fitness.values[0] for ind in pop]

    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(x*x for x in fits)
    # abs() guards against a tiny negative variance caused by rounding
    std = abs(sum2 / length - mean**2)**0.5

    print("  Min %s" % min(fits))
    print("  Max %s" % max(fits))
    print("  Avg %s" % mean)
    print("  Std %s" % std)
    #print("  Best: ", tools.selBest(pop, 1)[0])

# Only claim success when the target fitness was actually reached.
if max(fits) >= TARGET_FITNESS:
    print("-- End of (successful) evolution --")
else:
    print("-- End of evolution (generation limit reached without a perfect query) --")

best_ind = tools.selBest(pop, 1)[0]
print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

Start of evolution
  Evaluated 200 individuals
-- Generation 1 --
  Evaluated 168 individuals
  Min -35.0
  Max 0.0
  Avg -0.205
  Std 2.492584000590552
-- Generation 2 --
  Evaluated 161 individuals
  Min -2.0
  Max 0.0
  Avg -0.01
  Std 0.14106735979665885
-- Generation 3 --
  Evaluated 166 individuals
  Min 0.0
  Max 0.0
  Avg 0.0
  Std 0.0
-- Generation 4 --
  Evaluated 170 individuals
  Min -5.0
  Max 0.0
  Avg -0.025
  Std 0.35266839949164713
-- Generation 5 --
  Evaluated 173 individuals
  Min 0.0
  Max 0.0
  Avg 0.0
  Std 0.0
-- Generation 6 --
  Evaluated 162 individuals
  Min 0.0
  Max 0.0
  Avg 0.0
  Std 0.0
-- Generation 7 --
  Evaluated 169 individuals
  Min 0.0
  Max 0.0
  Avg 0.0
  Std 0.0
-- Generation 8 --
  Evaluated 158 individuals
  Min 0.0
  Max 0.0
  Avg 0.0
  Std 0.0
-- Generation 9 --
  Evaluated 180 individuals
  Min -23.0
  Max 0.0
  Avg -0.115
  Std 1.6222746376615769
-- Generation 10 --


In [None]:
# Re-evaluate the best individual verbosely to print its query string, the
# number of rows it returns and their index labels.
evalQuery(best_ind, verbose=True)