# In [7]: (Jupyter notebook cell marker)
import pandas as pd

import importlib

import numpy as np

import re

from os import listdir
from os.path import isfile, join

import time

def class_for_name(module_name, class_name):
    """Import a module and return a new instance of one of its classes.

    The attribute ``class_name`` is looked up on the imported module and
    called with no arguments.

    Args:
        module_name: Dotted module path, e.g. ``'sklearn.svm'``.
        class_name: Name of the attribute to instantiate, e.g. ``'SVC'``.

    Returns:
        The object produced by calling the attribute, or ``-1`` (after
        printing a message) when the module cannot be imported, the
        attribute does not exist, or calling it raises an exception.
    """
    try:
        # load the module, will raise ImportError if module cannot be loaded
        module = importlib.import_module(module_name)
        # get the class, will raise AttributeError if class cannot be found
        return getattr(module, class_name)()
    except Exception:
        # Broad on purpose (any import/constructor failure means "not
        # loadable"), but not a bare except: KeyboardInterrupt and SystemExit
        # must still be able to stop the run.
        print('Could not load module: '+module_name+'.'+class_name)
        return -1

#Data: dataset to test (from the test datasets that illustrate different requirements)
#Primitive: primitive being tested. We assume it has the fit() method
#The second field returned indicates whether the primitives needs an array or not
def passTest(data, primitive):
    """Check whether ``primitive`` can be executed on ``data``.

    The last column of ``data`` is used as the target and the remaining
    columns as the training input. Three strategies are tried in order:

    1. ``fit(train, target).predict(train)``
    2. ``fit(train).transform(train)``
    3. ``fit(column).transform(column)`` for every training column, for
       primitives that only accept a single column at a time.

    Args:
        data: pandas DataFrame whose last column is the target.
        primitive: Object exposing ``fit`` and ``predict`` or ``transform``.

    Returns:
        A ``(passed, needs_array)`` tuple. ``passed`` is True when one of the
        strategies ran without raising. ``needs_array`` is True only when the
        column-by-column strategy was the one that succeeded.
    """
    target = data.iloc[:, -1]
    train = data.drop(columns=data.columns[[-1]])  # drop target column (the last one)

    # Not all primitives have a predict, but should have fit.
    try:
        primitive.fit(train, target).predict(train)
        return True, False
    except Exception:
        # Expected for transformers; fall through to the next strategy.
        pass

    try:
        primitive.fit(train).transform(train)
        return True, False
    except Exception:
        # Some primitives can only be applied to arrays, not a matrix.
        pass

    # NOTE(review): when ``train`` has no columns this loop does not run and
    # the function reports (True, True) -- confirm that is acceptable.
    try:
        for col in train.columns:
            # Need to do the transform, otherwise exceptions may not be raised
            primitive.fit(train[col]).transform(train[col])
        return True, True
    except Exception:
        return False, False
        


#Path: String with the path to the dataset folders. The system assumes to have three: clean_data, requirement_data and performance_data
#Primitive module name: string with the module name. E.g., 'sklearn.svm'
#Primitive name: string with the name of the primitive to be loaded. E.g., 'SVC'
#testPerformance: boolean that is true if you want to test the performance tests (will require more time)
def getPrimitiveRequirements(path, primitiveModuleName, primitiveName, testPerformance):
    """Profile a primitive against the test datasets found under ``path``.

    Args:
        path: Directory containing the ``clean_data``, ``requirement_data``
            and ``performance_data`` folders (trailing slash optional).
        primitiveModuleName: Module to import, e.g. ``'sklearn.svm'``.
        primitiveName: Class to instantiate, e.g. ``'SVC'``.
        testPerformance: When true, also time the primitive on every file in
            ``performance_data``.

    Returns:
        A dict with ``"Name"``, ``"id"`` and ``"Requirements"``; optionally
        ``"Task"``/``"LearningType"`` (primitive has ``predict``),
        ``"IsArray"`` (a requirement dataset only ran column by column) and
        ``"Performance"`` (``testPerformance`` is true). Returns ``-1`` when
        the primitive cannot be loaded, has no ``fit`` method, or fails on
        either clean dataset.
    """
    clean_dir = join(path, "clean_data")
    req_dir = join(path, "requirement_data")
    perf_dir = join(path, "performance_data")

    primExec = class_for_name(primitiveModuleName, primitiveName)
    if(primExec == -1):
        print("The primitive module could not be loaded.")
        return -1
    if not hasattr(primExec, 'fit'):
        print("Primitive does not have fit method. No requirements considered")
        return -1

    prim = {}
    prim["Name"] = primitiveName
    prim["id"] = primitiveModuleName+"."+primitiveName

    #Clean data files: all primitives should pass these tests
    data_clean_int = pd.read_csv(join(clean_dir, 'int_clean_data.csv'))
    data_clean_float = pd.read_csv(join(clean_dir, 'float_clean_data.csv'))
    # Unpack each result separately: `tuple and tuple` always yields the
    # second tuple, which would discard the outcome of the int test.
    passed_int, _ = passTest(data_clean_int, primExec)
    passed_float, _ = passTest(data_clean_float, primExec)
    if not (passed_int and passed_float):
        print("The primitive "+primitiveName+" cannot execute the clean datasets. No further requirements addressed")
        return -1

    if hasattr(primExec, 'predict'):
        #primitive is a classifier/regression
        target = data_clean_float.iloc[:, -1]
        train = data_clean_float.drop(columns=data_clean_float.columns[[-1]])  # drop target column (the last one)
        y_pred = primExec.fit(train, target).predict(train)
        prim["Task"] = "Modeling"
        # Float predictions are taken to mean regression, anything else classification.
        if issubclass(y_pred.dtype.type, np.floating):
            prim["LearningType"] = "Regression"
        else:
            prim["LearningType"] = "Classification"

    #Rest of the tests: a failure on a dataset whose file name contains the
    #keyword adds the matching requirement (checked in this order).
    requirement_by_keyword = (
        ("missing", "NON-MISSING-VALUES"),
        ("categorical", "NUMERICAL"),
        ("unique", "NOT-UNIQUE"),
        ("negative", "NON-NEGATIVE"),
    )
    requirements = []
    # sorted() makes the result independent of the directory listing order.
    for file_name in sorted(f for f in listdir(req_dir) if isfile(join(req_dir, f))):
        data = pd.read_csv(join(req_dir, file_name))
        passed, needs_array = passTest(data, primExec)
        if not passed:
            for keyword, requirement in requirement_by_keyword:
                if keyword in file_name and requirement not in requirements:
                    requirements.append(requirement)
        if needs_array:
            prim["IsArray"] = True
    prim["Requirements"] = requirements

    if(testPerformance):
        performance = []
        for file_name in sorted(f for f in listdir(perf_dir) if isfile(join(perf_dir, f))):
            data = pd.read_csv(join(perf_dir, file_name))
            start = time.perf_counter()
            passTest(data, primExec)
            elapsed = time.perf_counter() - start
            # The first two numbers in the file name are reported as rows and
            # columns; pad with None so an unexpected name does not abort the run.
            sizes = re.findall(r'\d+', file_name)
            rows, columns = (sizes + [None, None])[:2]
            element = {}
            element["rows"] = rows
            element["columns"] = columns
            element["time"] = elapsed
            performance.append(element)
        prim["Performance"] = performance
    return prim
      
def allPrimitivesToText(jsonFile):
    """Profile every class primitive in a catalog and return the result as JSON.

    Args:
        jsonFile: Parsed JSON catalog (a dict) with a ``'search_primitives'``
            list. Each entry needs an ``"is_class"`` flag and an ``"id"``
            holding the dotted path ``module.ClassName``.

    Returns:
        A JSON string of the form ``{"primitive_catalog": [...]}`` holding
        the profile of every primitive that could be processed. Primitives
        for which ``getPrimitiveRequirements`` returns ``-1`` are skipped.
    """
    output = []
    # Read the catalog that was passed in, not a module-level global.
    for entry in jsonFile['search_primitives']:
        #necessary because otherwise it will attempt to try many primitives that may even download data
        if not entry["is_class"]:
            continue
        primitive = entry["id"]
        print(primitive)
        # Split "package.module.Class" at the last dot.
        moduleName, _, className = primitive.rpartition('.')
        p = getPrimitiveRequirements(DATADIR, moduleName, className, True)
        if p != -1:
            output.append(p)
    return json.dumps({"primitive_catalog": output})
        
#Main script
import json
from pprint import pprint

#Dir with the profiling datasets
DATADIR = "data_profiler/"

#Examples of profiling a single primitive:
#print(json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.svm','SVC',True)))
#print (json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.linear_model','LinearRegression',True)))
#print (json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.preprocessing','LabelEncoder',True)))
#print (json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.feature_extraction.text','TfidfVectorizer',False)))
#print (json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.linear_model.sgd_fast','Regression',False)))
#print (json.dumps(getPrimitiveRequirements(DATADIR,'sklearn.decomposition.dict_learning','DictionaryLearning',False)))
#Other id seen in the catalog: sklearn.metrics.scorer.get_scorer

#JSON with the primitives by Khe-Thia
#The parsed catalog is kept in the module-level name `data`.
with open('sklearn-supersimple.json') as catalog_handle:
    data = json.loads(catalog_handle.read())

#Profile every class primitive of the catalog and print the JSON result
print(allPrimitivesToText(data))



sklearn.svm.classes.SVC
sklearn.preprocessing.label.LabelEncoder
{"primitive_catalog": [{"Name": "SVC", "id": "sklearn.svm.classes.SVC", "Task": "Modeling", "LearningType": "Classification", "Requirements": ["NUMERICAL", "NON-MISSING-VALUES"], "Performance": [{"rows": "100", "columns": "350", "time": 0.015965938568115234}, {"rows": "100", "columns": "50", "time": 0.003998994827270508}, {"rows": "300", "columns": "350", "time": 0.12764334678649902}, {"rows": "300", "columns": "50", "time": 0.024962902069091797}, {"rows": "600", "columns": "350", "time": 0.522212028503418}, {"rows": "600", "columns": "50", "time": 0.08600711822509766}]}, {"Name": "LabelEncoder", "id": "sklearn.preprocessing.label.LabelEncoder", "IsArray": true, "Requirements": ["NON-MISSING-VALUES"], "Performance": [{"rows": "100", "columns": "350", "time": 0.04904294013977051}, {"rows": "100", "columns": "50", "time": 0.010004997253417969}, {"rows": "300", "columns": "350", "time": 0.07406878471374512}, {"rows": "300", 