In [1]:
import json
import operator

import numpy as np
from pandas import read_csv, Series

In [2]:
def clean_and_read_adult(data_file="train"):
    """Normalise an Adult CSV file and load it as a DataFrame.

    Reads ``adult_<data_file>.csv``, falling back to ``adult.<data_file>``
    when the first name cannot be opened.  Every line after the first has
    the whitespace around each comma-separated field stripped; the result is
    written to ``adult_<data_file>_clean.csv`` and that file is loaded with
    ``read_csv``.

    Parameters
    ----------
    data_file : str
        Name fragment used to build the input and output file names.

    Returns
    -------
    DataFrame
        The cleaned data plus a boolean ``class`` column that is True where
        the ``salary`` column equals ``'>50K'``.

    Raises
    ------
    IOError
        If neither input file can be opened.
    """
    try:
        source = open('adult_%s.csv' % data_file)
    except IOError:
        source = open('adult.%s' % data_file)
    with source:
        lines = source.read().split('\n')

    clean_path = 'adult_%s_clean.csv' % data_file
    # Text mode: the lines are str, which a binary-mode handle rejects on
    # Python 3.  ``with`` closes the handle even if a write fails.
    with open(clean_path, 'w') as cleaned:
        # The first line is copied verbatim; only the following lines have
        # their fields stripped.
        cleaned.write(lines[0] + '\n')
        for line in lines[1:]:
            fields = [field.strip() for field in line.split(',')]
            cleaned.write(','.join(fields) + '\n')

    data = read_csv(clean_path)
    # NOTE(review): if labels in some split carry a trailing period
    # (e.g. '>50K.'), this exact match marks every row False - confirm
    # against the actual files.
    data['class'] = data['salary'] == '>50K'
    return data

In [3]:
# Load the cleaned "train" file into the DataFrame used by the cells below.
data = clean_and_read_adult(data_file="train")

In [4]:
class Query:
    """A named row filter built from simple column comparisons.

    ``lookup`` turns each condition into a boolean mask, combines the masks
    with 'and' / 'or' and returns the matching rows together with their count.
    """

    # Operation names accepted in a condition, mapped to the binary
    # comparison that implements each of them.
    _OPERATIONS = {
        'eq': operator.eq,
        'lt': operator.lt,
        'gt': operator.gt,
        'neq': operator.ne,
        'gte': operator.ge,
        'lte': operator.le,
    }

    def __init__(self, queryName):
        self.queryName = queryName

    def mask(self, feature, value, operation='eq'):
        """Compare ``feature`` with ``value`` using the named operation.

        Parameters
        ----------
        feature : the left operand (``lookup`` passes a DataFrame column).
        value : the right operand.
        operation : one of 'eq', 'lt', 'gt', 'neq', 'gte', 'lte'.

        Returns
        -------
        The result of the comparison (element-wise for a column).

        Raises
        ------
        KeyError
            If ``operation`` is not one of the supported names.
        """
        # Only the requested comparison is evaluated, so an ordering
        # comparison that is invalid for the operand types cannot break an
        # unrelated 'eq' / 'neq' lookup.
        compare = self._OPERATIONS[operation]
        return compare(feature, value)

    def lookup(self, dataFrame, query_params):
        """Filter ``dataFrame`` by the conditions in ``query_params``.

        ``query_params`` is a dict with two keys:

        * ``'conditions'``: a list of dicts with ``'feature'`` (column
          name), ``'value'`` and optionally ``'operation'`` (defaults to
          'eq', like ``mask``).
        * ``'conjunctions'``: a list of 'and' / 'or' strings, one fewer than
          the conditions; entry *i* joins condition *i + 1* to the running
          result.

        Conditions are folded strictly left to right with no operator
        precedence: ``a or b and c`` is evaluated as ``(a or b) and c``.

        Returns
        -------
        (DataFrame, int)
            The matching rows and how many there are (0 when none match).

        Raises
        ------
        AssertionError
            If the number of conjunctions is not ``len(conditions) - 1``.
        ValueError
            If a conjunction is neither 'and' nor 'or'.
        KeyError
            If a condition names an unsupported operation.
        """
        self.conjunctions = query_params['conjunctions']
        self.conditions = query_params['conditions']

        assert len(self.conjunctions) == len(self.conditions) - 1, "Mismatch in conditions and conjunctions"

        # Start from an all-True mask and pair the first condition with a
        # dummy 'and' so every condition goes through the same combine step.
        resultMask = np.ones((len(dataFrame),), dtype=bool)
        conjunctions = ['and'] + list(self.conjunctions)

        for condition, conjunction in zip(self.conditions, conjunctions):
            column = dataFrame[condition['feature']]
            operation = condition.get('operation', 'eq')
            # Plain boolean array: masks are combined by row position.
            currentMask = np.asarray(
                self.mask(column, condition['value'], operation), dtype=bool)
            if conjunction == 'and':
                resultMask &= currentMask
            elif conjunction == 'or':
                resultMask |= currentMask
            else:
                # Previously an unrecognised conjunction silently dropped
                # the condition and produced a wrong result set.
                raise ValueError("Unknown conjunction: %r" % (conjunction,))

        # count_nonzero also handles "no row matches" and an empty frame,
        # where indexing np.bincount(...)[1] raised IndexError.
        resultCount = int(np.count_nonzero(resultMask))
        return dataFrame[resultMask], resultCount

In [5]:
class Suite:
    """A named list of queries that are run against a single DataFrame."""

    def __init__(self, dataFrame, suiteName):
        self.dataFrame = dataFrame
        self.suiteName = suiteName
        # (query_params, name) pairs, in the order they were added.
        self.queries = []
        # Not used anywhere in this class; the (misspelled) attribute name is
        # kept unchanged in case outside code reads it.
        self.resulSets = []

    def add_query(self, query, name="query"):
        """Register a query-parameter dict under ``name``.

        Raises AssertionError if ``query`` is not a dict (dict subclasses
        are accepted).
        """
        assert isinstance(query, dict)
        self.queries.append((query, name))

    def run(self):
        """Run every registered query, yielding one result dict per query.

        This is a generator: nothing is printed or evaluated until it is
        iterated.  Each yielded dict is ``DataFrame.to_dict(orient='split')``
        of the matching rows with an extra ``'resultCount'`` entry.
        """
        # Single-argument print calls produce the same text on Python 2 and
        # Python 3, unlike the print statement.
        print("Initiating Run for test suite %s" % (self.suiteName,))
        for query, name in self.queries:
            print("Testing query: %s" % (name,))
            runner = Query(name)
            resultSet, resultCount = runner.lookup(self.dataFrame, query)
            payload = resultSet.to_dict(orient='split')
            payload['resultCount'] = resultCount
            yield payload

In [6]:
# Demo filter: race == 'Black', sex == 'Female', marital-status == 'Divorced'
# and age >= 35, with every condition joined to the previous one by 'and'.
query = {
    'conditions': [
        {'feature': 'race', 'operation': 'eq', 'value': 'Black'},
        {'feature': 'sex', 'operation': 'eq', 'value': 'Female'},
        {'feature': 'marital-status', 'operation': 'eq', 'value': 'Divorced'},
        {'feature': 'age', 'operation': 'gte', 'value': 35},
    ],
    'conjunctions': ['and'] * 3,
}

In [7]:
# Build the demo suite over the loaded DataFrame.
suite = Suite(dataFrame=data, suiteName="demo_suite")

In [8]:
# Bare expression: the notebook echoes the object's repr as the cell output.
suite

<__main__.Suite instance at 0x112468c20>

In [9]:
# Register the demo filter with the suite under the label "demo_query".
suite.add_query(query, name="demo_query")

In [10]:
# Iterate the suite's generator and show the top-level keys of each result
# dict.  list(...) keeps the printed text a plain list on both Python 2 and
# Python 3 (where dict.keys() is a view object).
for out in suite.run():
    print(list(out.keys()))

Initiating Run for test suite demo_suite
Testing query: demo_query
['index', 'resultCount', 'data', 'columns']
