## Challenge

Given the columnar data format vicol1, compare its read performance with that of a CSV table.
Check both the select and the where clauses.

In [1]:
from texttable import Texttable
import json 

class ViCol1Table:
    """Chainable query builder over a file in the columnar ``vicol1`` format.

    As consumed by this reader, a vicol1 file is:

    * one header line containing a JSON object whose ``columns`` entry maps
      every column name to ``{"start": ..., "end": ...}``;
    * the column data, where a column occupies ``[start, end)`` counted from
      the end of the header line and holds comma-separated values, one per
      row.

    NOTE(review): offsets are treated as byte offsets and values are decoded
    as UTF-8; for ASCII data this matches character offsets. Confirm against
    whatever writes these files.

    Usage: chain ``where`` and ``select`` calls, then call ``execute``.
    """

    def __init__(self, filepath):
        """Create a query over the vicol1 file at ``filepath``."""
        self.filepath = filepath
        # column name -> list of predicates; every predicate must hold.
        self.__conditions = {}
        # Selected columns in first-selection order. A list (not a set) so
        # that the displayed column order is the one the caller asked for.
        self.__column_names = []
        self.__meta = None
        self.__f = None
        self.__ref_offset = 0

    def where(self, column_name, condition):
        """Add a filter: keep rows whose ``column_name`` value satisfies ``condition``.

        ``condition`` is called with the raw string value of the cell.
        All registered conditions (on the same or on different columns) are
        combined with logical AND. Returns ``self`` for chaining.
        """
        self.__conditions.setdefault(column_name, []).append(condition)
        return self

    def select(self, column_names):
        """Add ``column_names`` to the columns to output.

        Repeated calls accumulate; a column already selected is not added
        twice and keeps its original position. Returns ``self`` for chaining.
        """
        for name in column_names:
            if name not in self.__column_names:
                self.__column_names.append(name)
        return self

    def __read_column(self, column_name):
        """Read one column from the open file and return its values as strings."""
        column = self.__meta['columns'][column_name]
        start, end = column['start'], column['end']
        self.__f.seek(self.__ref_offset + start)
        raw_values = self.__f.read(end - start).decode('utf-8')
        # An empty column yields no rows rather than a single empty value.
        return raw_values.split(',') if raw_values else []

    def __find_ids(self, column_name, conditions):
        """Return the row indices of ``column_name`` satisfying all ``conditions``."""
        values = self.__read_column(column_name)
        return [i for i, v in enumerate(values) if all(c(v) for c in conditions)]

    def __matching_ids(self):
        """Return the set of row indices passing every filter, or ``None`` if unfiltered."""
        ids = None
        for column, conditions in self.__conditions.items():
            found = set(self.__find_ids(column, conditions))
            # Always intersect: an empty intermediate result must stay empty
            # instead of being refilled by the next column's matches.
            ids = found if ids is None else ids & found
            if not ids:
                break  # nothing can match any more; skip reading further columns
        return ids

    def __select_columns(self, indices):
        """Return one list of values per selected column.

        ``indices`` is an iterable of row indices to keep, or ``None`` to
        keep every row.
        """
        data = []
        for column in self.__column_names:
            values = self.__read_column(column)
            if indices is None:
                data.append(values)
            else:
                data.append([values[index] for index in indices])
        return data

    def __display(self, headers, rows):
        """Print ``rows`` under ``headers`` as a text table."""
        table = Texttable()
        table.add_rows([list(headers)] + rows)
        print(table.draw())

    def _query(self, f):
        """Run the query against ``f``, an open binary file positioned at its start.

        Returns ``(ids, data)``: ``ids`` is the set of matching row indices
        (``None`` when no ``where`` clause was given, meaning every row), and
        ``data`` holds one list of values per selected column, in selection
        order, with rows in ascending index order.
        """
        self.__meta = json.loads(f.readline())
        self.__ref_offset = f.tell()  # column offsets are relative to the end of the header line
        self.__f = f
        try:
            ids = self.__matching_ids()
            ordered = None if ids is None else sorted(ids)
            data = self.__select_columns(ordered)
        finally:
            # Do not keep a reference to a handle the caller is about to close.
            self.__f = None
        return ids, data

    def execute(self):
        """Run the query and print the matched ids, the raw column data and a table.

        The file is opened in binary mode because seeking to computed offsets
        is only well defined for binary files.
        """
        with open(self.filepath, 'rb') as f:
            ids, data = self._query(f)
        print(ids)
        print(data)
        self.__display(self.__column_names, list(zip(*data)))
            
            



             
    
    
    
    

In [2]:

# Query: department, salary and first name of the BigData employees whose
# first name compares >= "Julia" (plain string comparison on the raw values).
(
    ViCol1Table('resources/employees.vicol1')
    .where('department', lambda department: department == 'BigData')
    .where('first_name', lambda first_name: first_name >= "Julia")
    .select(['department', 'salary', 'first_name'])
    .execute()
)

{4, 6}
[['BigData', 'BigData'], ['9431', '17512'], ['Michael', 'Julia']]
+------------+--------+------------+
| department | salary | first_name |
| BigData    | 9431   | Michael    |
+------------+--------+------------+
| BigData    | 17512  | Julia      |
+------------+--------+------------+


In [3]:
# This version handles only one where clause (chaining another where overwrites the previous one).

from texttable import Texttable
import json 
class ViCol1Table:
    """Chainable query builder over a vicol1 file, limited to one ``where`` clause.

    As consumed by this reader, a vicol1 file is:

    * one header line containing a JSON object whose ``columns`` entry maps
      every column name to ``{"start": ..., "end": ...}``;
    * the column data, where a column occupies ``[start, end)`` counted from
      the end of the header line and holds comma-separated values, one per
      row.

    NOTE(review): offsets are treated as byte offsets and values are decoded
    as UTF-8; for ASCII data this matches character offsets. Confirm against
    whatever writes these files.

    Only the most recent ``where`` and the most recent ``select`` take effect.
    """

    def __init__(self, filepath):
        """Create a query over the vicol1 file at ``filepath``."""
        self.filepath = filepath
        # (column name, predicate) of the single active filter, or None.
        self.__condition = None
        # Selected columns in the order given. A list (not a set) so that the
        # displayed column order is the one the caller asked for.
        self.__column_names = []
        self.__meta = None
        self.__f = None
        self.__ref_offset = 0

    def where(self, column_name, condition):
        """Set the filter: keep rows whose ``column_name`` value satisfies ``condition``.

        ``condition`` is called with the raw string value of the cell. A
        later call replaces an earlier one. Returns ``self`` for chaining.
        """
        self.__condition = (column_name, condition)
        return self

    def select(self, column_names):
        """Set the columns to output, replacing any earlier selection.

        Duplicates are dropped, keeping the first occurrence. Returns
        ``self`` for chaining.
        """
        self.__column_names = list(dict.fromkeys(column_names))
        return self

    def __read_column(self, column_name):
        """Read one column from the open file and return its values as strings."""
        column = self.__meta['columns'][column_name]
        start, end = column['start'], column['end']
        self.__f.seek(self.__ref_offset + start)
        raw_values = self.__f.read(end - start).decode('utf-8')
        # An empty column yields no rows rather than a single empty value.
        return raw_values.split(',') if raw_values else []

    def __find_ids(self, column_name, condition):
        """Return, in ascending order, the row indices whose value satisfies ``condition``."""
        values = self.__read_column(column_name)
        return [i for i, v in enumerate(values) if condition(v)]

    def __select_columns(self, indices):
        """Return one list of values per selected column.

        ``indices`` is an iterable of row indices to keep, or ``None`` to
        keep every row. Each column's name and offsets are printed as a
        trace of what is being read.
        """
        data = []
        for column in self.__column_names:
            bounds = self.__meta['columns'][column]
            print(column, bounds['start'], bounds['end'])
            values = self.__read_column(column)
            if indices is None:
                data.append(values)
            else:
                data.append([values[index] for index in indices])
        return data

    def __display(self, headers, rows):
        """Print ``rows`` under ``headers`` as a text table."""
        table = Texttable()
        table.add_rows([list(headers)] + rows)
        print(table.draw())

    def _query(self, f):
        """Run the query against ``f``, an open binary file positioned at its start.

        Returns ``(ids, data)``: ``ids`` is the list of matching row indices
        (``None`` when ``where`` was never called, meaning every row), and
        ``data`` holds one list of values per selected column, in selection
        order.
        """
        self.__meta = json.loads(f.readline())
        self.__ref_offset = f.tell()  # column offsets are relative to the end of the header line
        self.__f = f
        try:
            if self.__condition is None:
                # No filter registered: select every row instead of failing
                # on unpacking None.
                ids = None
            else:
                column, condition = self.__condition
                ids = self.__find_ids(column, condition)
            data = self.__select_columns(ids)
        finally:
            # Do not keep a reference to a handle the caller is about to close.
            self.__f = None
        return ids, data

    def execute(self):
        """Run the query and print the matched ids, the raw column data and a table.

        The file is opened in binary mode because seeking to computed offsets
        is only well defined for binary files.
        """
        with open(self.filepath, 'rb') as f:
            ids, data = self._query(f)
        print(ids)
        print(data)
        self.__display(self.__column_names, list(zip(*data)))
            
            



             
    
    
    
    

[1, 4, 6, 9]
department 135 214
salary 214 270
first_name 0 65
[['Embedded', 'BigData', 'BigData', 'Embedded'], ['21162', '9431', '17512', '7895'], ['Manuel', 'Michael', 'Julia', 'Todd']]
+------------+--------+------------+
| department | salary | first_name |
| Embedded   | 21162  | Manuel     |
+------------+--------+------------+
| BigData    | 9431   | Michael    |
+------------+--------+------------+
| BigData    | 17512  | Julia      |
+------------+--------+------------+
| Embedded   | 7895   | Todd       |
+------------+--------+------------+
