In [101]:
import pandas as pd
import numpy as np
import os
import random
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt

from minisom import MiniSom
from gpdeap import GeneticProgrammingQBE

In [102]:
original_data = pd.read_pickle(os.path.join('../datasets', 'car_original_dataset.pkl'))
original_data.columns = [str.lower(col.replace('.', '_')) for col in original_data.columns]
original_data['origin'] = original_data['origin'].map({0: False, 1: True})
original_data['automatic_gearbox'] = original_data['automatic_gearbox'].map({0: False, 1: True})

preprocessed_data = pd.read_pickle('..//datasets//1993CarsPrep.pkl')
queries = ["type == 'Sporty' and origin == 0", 
    "type != 'Sporty' and origin == 1",
    "automatic_gearbox == 1 and horsepower >= 150",
    "luggage_capacity >= 18 and passenger_capacity > 5",
    "price <= 7000 and mpg >= 26 and automatic_gearbox == 0",
    "manufacturer == 'Ford' or manufacturer == 'Chevrolet'"]

In [105]:
original_data = pd.read_pickle(os.path.join('../datasets', 'abalone_original.pkl'))
preprocessed_data = pd.read_pickle(os.path.join('../datasets', 'abalone.pkl'))

In [106]:
queries = ['height > 0.13 and rings >= 9', 'sex == "I" and diameter > 0.45 and length <= 0.53', 'rings > 9 or rings <= 7 and sex =="F" and shuckedweight <= 0.08']

In [113]:
data = preprocessed_data.values
concept = original_data.query(queries[2]).index.to_list()
labels = np.array([int(x in concept) for x in range(1, data.shape[0]+1)])

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels)

In [114]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
predicted = dt.predict(X_test)
precision_recall_fscore_support(y_test, predicted, average='binary')[:3]

(0.6982968369829684, 0.6866028708133971, 0.6924004825090471)

In [115]:
gp = GeneticProgrammingQBE(preprocessed_data)

In [None]:
gp.simple_search(population_size=300, crossover_rate=0.8, mutation_rate=0.2, num_generations=100, X_train=X_train, y_train=y_train, verbose=True)
predicted = gp.predict(X_test)

  'precision', 'predicted', average, warn_for)


   	      	              fitness              	        size       
   	      	-----------------------------------	-------------------
gen	nevals	avg     	max     	min	avg    	max	min
0  	300   	0.226581	0.713174	0  	6.10667	33 	3  
1  	258   	0.417365	0.706498	0  	6.18   	19 	3  
2  	244   	0.514645	0.706498	0  	6.55   	29 	3  
3  	252   	0.56591 	0.706498	0  	7.03667	23 	3  
4  	263   	0.537681	0.706498	0  	7.66667	25 	3  
5  	264   	0.546309	0.706498	0  	7.58   	32 	3  
6  	253   	0.550786	0.706498	0  	7.28667	22 	3  
7  	250   	0.58137 	0.699259	0  	7.3    	21 	3  
8  	245   	0.571007	0.707363	0  	7.49   	24 	3  
9  	258   	0.583479	0.707363	0  	7.43667	23 	3  
10 	268   	0.577776	0.707363	0  	7.92667	27 	3  
11 	229   	0.598824	0.707363	0  	8.32333	27 	3  
12 	251   	0.591557	0.707363	0  	8.85667	28 	3  
13 	242   	0.579827	0.707363	0  	9.55667	23 	3  
14 	261   	0.585893	0.707363	0  	10.45  	39 	3  
15 	264   	0.594077	0.703554	0  	11.44  	31 	4  
16 	250   	0.603835	0.703554	0  	

In [None]:
predicted = gp.predict(X_test)

In [None]:
precision_recall_fscore_support(y_test, predicted, average='binary')[:3]

In [9]:
# Initialization and training
som = MiniSom(28, 28, X_train.shape[1], sigma=3, learning_rate=0.5, neighborhood_function='gaussian')

som.pca_weights_init(X_train)
print("Training...")
som.train_batch(data, 2*len(X_train), verbose=True)  # random training
print("\n...ready!")

Training...
 [ 48258 / 48258 ] 100% - 669.14 it/s - 0:00:00 left  - quantization error: 0.3466223518554049

...ready!


In [10]:
print(len(y_train))
print(sum(y_train))

24129
72


In [12]:
def classify(som, data, class_assignments):
    """Classifies each sample in data in one of the classes definited
    using the method labels_map.
    Returns a list of the same length of data where the i-th element
    is the class assigned to data[i].
    """
    winmap = class_assignments
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

class_assignments = som.labels_map(X_train, y_train)
predicted = classify(som, X_test, class_assignments)
precision_recall_fscore_support(y_test, predicted, average='binary')[:3]

  'precision', 'predicted', average, warn_for)


(0.0, 0.0, 0.0)

In [21]:
from sklearn.utils import resample
X = pd
labels = np.array(labels)
pos = data[labels == 1]
unl = data[labels == 0]

In [23]:
res = resample(pos,
    replace=True, # sample with replacement
    n_samples=len(unl), # match number in majority class
    random_state=27) # reproducible results

array([[0.09589041, 0.33333333, 0.18063532, ..., 0.        , 0.47959184,
        0.95      ],
       [0.02739726, 0.33333333, 0.16916576, ..., 0.        , 0.13265306,
        0.95      ],
       [0.16438356, 0.83333333, 0.23272257, ..., 0.        , 0.39795918,
        0.95      ],
       ...,
       [0.31506849, 0.66666667, 0.01328474, ..., 0.53696051, 0.34693878,
        0.95      ],
       [0.43835616, 0.33333333, 0.03577042, ..., 0.        , 0.39795918,
        0.95      ],
       [0.46575342, 0.33333333, 0.01826864, ..., 0.        , 0.60204082,
        0.95      ]])