In [3]:
from paje.base.data import Data
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import random as rd
import numpy as np


In [4]:
def iris_dataset(perc_label=0.3, random_state=42):
    """Load the Iris dataset and split it into an unlabeled and a labeled part.

    Parameters
    ----------
    perc_label : float or int, default 0.3
        Forwarded to ``train_test_split`` as ``test_size``; the "test" side of
        the split is what this function returns as the labeled part.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` as ``random_state``. The default
        reproduces the split that was previously hard-coded.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(unlabeled_features, labeled_features, unlabeled_labels,
        labeled_labels)``. Feature frames use ``dataset.feature_names`` as
        columns; label frames have the single column ``"label"``.
    """
    dataset = datasets.load_iris()
    unlabeled_x, labeled_x, unlabeled_y, labeled_y = train_test_split(
        dataset.data,
        dataset.target,
        test_size=perc_label,
        random_state=random_state,
    )
    return (
        pd.DataFrame(unlabeled_x, columns=dataset.feature_names),
        pd.DataFrame(labeled_x, columns=dataset.feature_names),
        pd.DataFrame(unlabeled_y, columns=["label"]),
        pd.DataFrame(labeled_y, columns=["label"]),
    )
    


In [5]:
# Materialise the four frames (features/labels for each side of the split)
# using the function's default arguments.
(
    unlabeled_features,
    labeled_features,
    unlabeled_labels,
    labeled_labels,
) = iris_dataset()

In [6]:
# Unlabeled split, stored together with its labels.
unlabeled_data = Data(
    name='IrisUnlabeled',
    X=unlabeled_features.to_numpy(),
    Y=unlabeled_labels.to_numpy(),
    columns=[*unlabeled_features.columns, *unlabeled_labels.columns],
    history=None,
)

# Labeled split: features and labels.
labeled_data = Data(
    name='IrisLabeled',
    X=labeled_features.to_numpy(),
    Y=labeled_labels.to_numpy(),
    columns=[*labeled_features.columns, *labeled_labels.columns],
    history=None,
)

# Unlabeled split again, this time with features only (no Y is passed).
unlabeled_iris = Data(
    name='IrisUnlabeled',
    X=unlabeled_features.to_numpy(),
    columns=[*unlabeled_features.columns],
    history=None,
)

In [5]:
# Inspect the dtype of the feature matrix held by the features-only Data object.
unlabeled_iris.X.dtype

dtype('float64')

In [7]:
class Oracle:
    """Empty placeholder class; no behaviour is implemented yet."""

    def __init__(self):
        """Create an instance without setting any attributes."""
    


In [20]:
class Strategy:
    """Base class for query strategies that pick records from a datasource.

    Subclasses are expected to override ``query`` and ``suported_types``.
    """

    def __init__(self, datasource, ds_type, **kargs):
        """Remember the datasource and its type tag.

        Parameters
        ----------
        datasource : object
            Object the strategy reads records from.
        ds_type : str
            Tag naming the kind of datasource; stored lower-cased.
        **kargs
            Accepted and ignored.
        """
        self.datasource = datasource
        self.ds_type = ds_type.lower()

    def query(self, nr_instances):
        """Return ``nr_instances`` records; the base implementation returns ``None``."""
        pass

    @classmethod
    def suported_types(cls):
        """Return the list of datasource type tags this strategy handles."""
        return []


class RandomSampling(Strategy):
    """Query strategy that returns uniformly random rows of the datasource.

    With ``replacement=False`` (default) every row is returned at most once
    over the lifetime of the sampler; once the pool is empty ``query`` returns
    ``None``. With ``replacement=True`` rows handed out by earlier calls may be
    handed out again (rows inside a single call are still distinct).
    """

    def __init__(self, datasource, ds_type, replacement=False, seed=None, **kargs):
        """Set up the sampler.

        Parameters
        ----------
        datasource : object
            For ``ds_type='paje_data'`` it must expose a 2-D array ``X``.
        ds_type : str
            Datasource type tag (case-insensitive).
        replacement : bool, default False
            Whether rows returned by one call stay eligible for later calls.
        seed : int or None, default None
            Seed for this sampler's private random generator.
        **kargs
            Accepted and ignored.

        Raises
        ------
        ValueError
            If ``ds_type`` has no query implementation.
        """
        super().__init__(datasource, ds_type)
        self.set_type()

        self.replacement = replacement
        # Row indices still available; unused (None) when sampling with replacement.
        self.to_query = None if replacement else list(range(self.features_shape[0]))
        self.queried = []
        # Private generator: seeding here must not disturb the global `random`
        # state nor the stream of any other sampler instance.
        self._rng = rd.Random(seed)

    def set_type(self):
        """Bind ``features_shape`` and ``query_function`` for ``self.ds_type``.

        Raises
        ------
        ValueError
            If no query implementation exists for ``self.ds_type``.
        """
        if self.ds_type == 'paje_data':
            self.features_shape = self.datasource.X.shape
            self.query_function = self.query_paje_data
        else:
            # Fail here with a clear message instead of leaving the instance
            # half-initialised and crashing later with an AttributeError.
            raise ValueError(
                "Unsupported datasource type {!r}: only 'paje_data' has a "
                "query implementation.".format(self.ds_type)
            )

    @classmethod
    def suported_types(cls):
        """Return the datasource type tags advertised by this strategy.

        NOTE(review): 'sql' is listed but ``set_type`` has no branch for it.
        """
        return ['paje_data', 'sql']

    def query(self, nr_records):
        """Return up to ``nr_records`` rows via the bound query function."""
        return self.query_function(nr_records)

    def query_paje_data(self, nr_records):
        """Draw random rows from ``self.datasource.X``.

        Parameters
        ----------
        nr_records : int
            Number of rows requested. If fewer rows are available, all the
            available rows are returned.

        Returns
        -------
        numpy.ndarray or None
            Array of shape ``(k, n_features)`` with ``X``'s dtype, where
            ``k = min(nr_records, available rows)``; ``None`` when no row is
            available (empty datasource, or pool exhausted without
            replacement).
        """
        pool = range(self.features_shape[0]) if self.replacement else self.to_query
        if len(pool) == 0:
            return None

        batch_size = min(nr_records, len(pool))
        # Sample the row indices themselves rather than positions in the
        # pool, so removing them afterwards cannot shift what was picked.
        picked = self._rng.sample(pool, batch_size)

        if not self.replacement:
            taken = set(picked)
            # In-place update keeps any outside reference to the list valid.
            self.to_query[:] = [idx for idx in self.to_query if idx not in taken]
        self.queried += picked

        # Explicit integer dtype so an empty selection still yields (0, n_features).
        return self.datasource.X[np.asarray(picked, dtype=np.intp), :]
            
                
                
                


In [21]:
# Fixed seed so the rows drawn from this sampler are the same on every
# Restart & Run All (42 matches the random_state used for the data split).
rdss = RandomSampling(unlabeled_iris, 'paje_data', seed=42)

In [23]:
# Ask the sampler for two records from the unlabeled feature matrix.
rdss.query(2)

array([[5.6, 3. , 4.1, 1.3],
       [4.9, 2.5, 4.5, 1.7]])