In [68]:
import pandas as pd

import importlib

from os import listdir
from os.path import isfile, join


def class_for_name(module_name, class_name):
    """Import a module and return a new instance of one of its classes.

    Despite the name, this returns an *instance*: the attribute looked up on
    the module is called with no arguments.

    Args:
        module_name: Dotted module path, e.g. ``'sklearn.svm'``.
        class_name: Name of the attribute to instantiate, e.g. ``'SVC'``.

    Returns:
        The object produced by calling ``module.class_name()``, or ``-1`` if
        the module cannot be imported, the attribute is missing, or the
        no-argument call raises. A message is printed in the failure case.
    """
    try:
        # Raises ImportError if the module cannot be loaded.
        module = importlib.import_module(module_name)
        # Raises AttributeError if the attribute is missing; the call itself
        # may raise whatever the constructor raises.
        return getattr(module, class_name)()
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        # str.format (not `+`) so a non-string name cannot raise in here.
        print('Could not load module: {}.{}'.format(module_name, class_name))
        return -1

def passTest(data, primitive):
    """Try to fit ``primitive`` on ``data`` and print whether it succeeded.

    The last column of ``data`` is used as the target and all preceding
    columns as the training features. Only ``fit`` is exercised, because not
    every primitive has a ``predict`` method.

    Args:
        data: Dataset to test (one of the test datasets that illustrate
            different requirements). Must carry a string ``name`` attribute,
            which is used in the printed message.
        primitive: Primitive being tested; it is assumed to have a
            ``fit(train, target)`` method.

    Returns:
        None. Prints ``"PASSED: <name>"`` if ``fit`` returns normally and
        ``"NOT PASSED: <name>"`` if it raises an ``Exception``.
    """
    target = data.iloc[:, -1]
    # Positional slice: dropping by label would also remove any feature
    # column that happens to share the target column's label.
    train = data.iloc[:, :-1]
    try:
        primitive.fit(train, target)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not reported as a failed test.
        print("NOT PASSED: " + data.name)
    else:
        print("PASSED: " + data.name)

def getPrimitiveRequirements(path, primitiveModuleName, primitiveName, testPerformance):
    """Run a primitive against the clean and requirement test datasets.

    Loads the primitive, fits it on the two clean datasets, then on every
    file found in the requirement-data folder, printing a PASSED / NOT PASSED
    line for each via ``passTest``.

    Args:
        path: String with the path to the dataset folders. The system assumes
            three sub-folders: ``clean_data``, ``requirement_data`` and
            ``performance_data``.
        primitiveModuleName: String with the module name, e.g.
            ``'sklearn.svm'``.
        primitiveName: String with the name of the primitive to be loaded,
            e.g. ``'SVC'``.
        testPerformance: Boolean that is true if the performance tests should
            run (will require more time). Currently ignored: the performance
            tests are not implemented yet.

    Returns:
        None. Returns early, without testing anything, if the primitive could
        not be loaded.
    """
    # os.path.join works whether or not `path` ends with a separator.
    clean_dir = join(path, "clean_data")
    req_dir = join(path, "requirement_data")

    prim = class_for_name(primitiveModuleName, primitiveName)
    if prim == -1:  # class_for_name signals a load failure with -1
        return

    # Clean data files
    data_clean_int = pd.read_csv(join(clean_dir, 'int_clean_data.csv'))
    data_clean_int.name = "CLEAN DATA INT"
    data_clean_float = pd.read_csv(join(clean_dir, 'float_clean_data.csv'))
    data_clean_float.name = "CLEAN DATA FLOAT"
    passTest(data_clean_int, prim)
    passTest(data_clean_float, prim)

    # Requirement data files. Sorted because os.listdir returns entries in
    # arbitrary order and the report should be reproducible.
    onlyfiles = sorted(f for f in listdir(req_dir) if isfile(join(req_dir, f)))
    for d in onlyfiles:
        data = pd.read_csv(join(req_dir, d))
        data.name = d  # passTest reads this attribute for its message
        passTest(data, prim)

# Root folder holding the clean_data / requirement_data / performance_data
# sub-folders used by getPrimitiveRequirements.
DATADIR = "data_profiler/"

# testPerformance is passed as the boolean False: the string 'false' is
# non-empty and therefore truthy.
print('sklearn.svm.SVC')
getPrimitiveRequirements(DATADIR, 'sklearn.svm', 'SVC', False)
print('sklearn.linear_model.LogisticRegression')
getPrimitiveRequirements(DATADIR, 'sklearn.linear_model', 'LogisticRegression', False)




sklearn.svm.SVC
PASSED: CLEAN DATA INT
PASSED: CLEAN DATA FLOAT
NOT PASSED: all_string_version.csv
PASSED: float_negative_version.csv
PASSED: int_negative_version.csv
NOT PASSED: one_missing_version.csv
NOT PASSED: some_missing_version.csv
NOT PASSED: some_string_version.csv
PASSED: unique_value_version.csv
sklearn.linear_model.LogisticRegression
PASSED: CLEAN DATA INT
PASSED: CLEAN DATA FLOAT
NOT PASSED: all_string_version.csv
PASSED: float_negative_version.csv
PASSED: int_negative_version.csv
NOT PASSED: one_missing_version.csv
NOT PASSED: some_missing_version.csv
NOT PASSED: some_string_version.csv
PASSED: unique_value_version.csv


In [50]:
#Files with different issues: missing values, constant values, etc.
from os import listdir
from os.path import isfile, join
# List and read from one and the same directory, so the two cannot diverge.
req_dir = join(DATADIR, "requirement_data")
# Sorted because os.listdir returns entries in arbitrary order.
onlyfiles = sorted(f for f in listdir(req_dir) if isfile(join(req_dir, f)))
#print(onlyfiles)
# NOTE(review): `prim` is not defined in this cell; presumably it is left over
# from an earlier notebook cell. Confirm before running this cell on its own.
for d in onlyfiles:
    data = pd.read_csv(join(req_dir, d))
    data.name = d  # passTest reads this attribute for its message
    passTest(data, prim)

# TODO: also run the tests on the files in the performance_data folder.


['all_string_version.csv', 'float_negative_version.csv', 'int_negative_version.csv', 'one_missing_version.csv', 'some_missing_version.csv', 'some_string_version.csv', 'unique_value_version.csv']
NOT PASSED: all_string_version.csv
PASSED: float_negative_version.csv
PASSED: int_negative_version.csv
NOT PASSED: one_missing_version.csv
NOT PASSED: some_missing_version.csv
NOT PASSED: some_string_version.csv
PASSED: unique_value_version.csv
