# Experimentation

## Parser for OData Queries

In [141]:
# Test cases
test_cases = [
    "ColA eq 3",
    "ColB eq 'abc'",
    "ColA eq 3 and ColB eq 'abc'",
    "startswith(ColA, 'hello')",
    "(ColA eq 3 and ColB eq 'abc') or (ColC eq 55 and ColD = 'lel')",
    "startswith(ColA, 'hello') and substringof('hello', ColB)",
]

In [145]:
import re

def parse_odata_filter(query):
    # lt
    query = query.replace(' lt ', ' < ')
    # le
    query = query.replace(' le ', ' <= ')
    # gt
    query = query.replace(' gt ', ' > ')
    # ge
    query = query.replace(' ge ', ' >= ')
    # eq
    query = query.replace(' eq ', ' = ')
    # ne
    query = query.replace(' ne ', ' != ')
    # startswith(column, string)
    matches_sw = re.match('startswith\(.*?\)', query.lower())
    if matches_sw:
        span_sw = matches_sw.span()
        sw_query = query[span_sw[0]:span_sw[1]]
        # Extract text between brackets
        sw_terms = re.sub('.*\(', '', sw_query)
        sw_terms = re.sub('\).*', '', sw_terms)
        sw_terms = [s.strip() for s in sw_terms.split(',')]
        sw_terms[1] = re.sub('[^a-zA-Z0-9]', '', sw_terms[1])
        query = re.sub(sw_query.replace('(', '\(').replace(')', '\)'), f"{sw_terms[0]} LIKE '{sw_terms[1]}%'", query)
        
    # substringof(string, column)
    matches_so = re.search('substringof\(.*?\)', query.lower())
    if matches_so:
        span_so = matches_so.span()
        so_query = query[span_so[0]:span_so[1]]
        # Extract text between brackets
        so_terms = re.sub('.*\(', '', so_query)
        so_terms = re.sub('\).*', '', so_terms)
        so_terms = [s.strip() for s in so_terms.split(',')]
        so_terms[0] = re.sub('[^a-zA-Z0-9]', '', so_terms[0])
        query = re.sub(so_query.replace('(', '\(').replace(')', '\)'), f"{so_terms[1]} LIKE '%{so_terms[0]}%'", query)
    # day()
    # month()
    # year()
    # hour()
    # minute()
    # second()
    return query

In [146]:
parse_odata_filter(test_cases[-1])

"ColA LIKE 'hello%' and ColB LIKE '%hello%'"

In [147]:
for test_case in test_cases:
    print(parse_odata_filter(test_case))

ColA = 3
ColB = 'abc'
ColA = 3 and ColB = 'abc'
ColA LIKE 'hello%'
(ColA = 3 and ColB = 'abc') or (ColC = 55 and ColD = 'lel')
ColA LIKE 'hello%' and ColB LIKE '%hello%'


In [16]:
def parse_odata_filter(query, joins):
  # Replace lookup column with the associated table
  main_col_matches = re.findall('\w+\/', query)
  main_col_replacements = []
  for match in main_col_matches:
    main_col_replacements.append(joins[match[:-1]]['table'])
  for match, replacement in zip(main_col_matches, main_col_replacements):
    query = re.sub(match, replacement + '.', query)
  
  # Replace slashes with dots
  # query = query.replace('/', '.')
  # lt
  query = query.replace(' lt ', ' < ')
  # le
  query = query.replace(' le ', ' <= ')
  # gt
  query = query.replace(' gt ', ' > ')
  # ge
  query = query.replace(' ge ', ' >= ')
  # eq
  query = query.replace(' eq ', ' = ')
  # ne
  query = query.replace(' ne ', ' != ')
  # startswith(column, string)
  query = re.sub('startsWith', 'startswith', query, re.IGNORECASE)
  matches_sw = re.findall('startswith\(.*?\)', query)
  if len(matches_sw) > 0:
    for match in matches_sw:
      # Extract text between brackets
      sw_terms = re.sub('.*\(', '', match)
      sw_terms = re.sub('\).*', '', sw_terms)
      sw_terms = [s.strip() for s in sw_terms.split(',')]
      sw_terms[1] = re.sub('[^a-zA-Z0-9]', '', sw_terms[1])
      query = re.sub(match.replace('(', '\(').replace(')', '\)'), f"{sw_terms[0]} LIKE '{sw_terms[1]}%'", query)
      
      print(query)
      
  # substringof(string, column)
  matches_so = re.findall('substringof\(.*?\)', query, re.IGNORECASE)
  if len(matches_so) > 0:
    for match in matches_so:
      # Extract text between brackets
      so_terms = re.sub('.*\(', '', match)
      so_terms = re.sub('\).*', '', so_terms)
      so_terms = [s.strip() for s in so_terms.split(',')]
      so_terms[0] = re.sub('[^a-zA-Z0-9]', '', so_terms[0])
      query = re.sub(match.replace('(', '\(').replace(')', '\)'), f"{so_terms[1]} LIKE '%{so_terms[0]}%'", query)
  
  # day()

  # month()
  
  # year()
  
  # hour()
  
  # minute()
  
  # second()
  
  return query

In [20]:
query1 = {
    '$select': 'Id,tableTitle,parentDatasetID/datasetTitle,parentDatasetID/dataDomain,parentDatasetID/owner',
    '$filter': "startswith(parentDatasetID/dataDomain,'O') and startswith(parentDatasetID/owner,'B')",
    '$expand': 'parentDatasetID'
}

query2 = {
    '$select': 'Id,tableTitle,parentDatasetID/datasetTitle,parentDatasetID/dataDomain,parentDatasetID/owner',
    '$filter': "parentDatasetID/dataDomain eq 'Ops'",
    '$expand': 'parentDatasetID'
}

joins = {
    'parentDatasetID': {
        'table': 'dc_datasets'
    }
}

In [31]:
import re

def parse_odata_query(query):
    output = {
        'main_cols': [],
        'join_cols': [],
        'filter_query': '',
        'expand_cols': []
    }
    if not query:
        return
    for query, value in query.items():
        if query == '$filter':
            output['filter_query'] = value
        else:
            columns = [v.strip() for v in value.split(',')]
            if query == '$select':    
                for column in columns:
                    if '/' in column:
                        output['join_cols'].append(column)
                    else:
                        output['main_cols'].append(column)
            elif query == '$expand':
                output['expand_cols'].extend(columns)
    return output

In [32]:
parse_odata_query(query1)

{'main_cols': ['Id', 'tableTitle'],
 'join_cols': ['parentDatasetID/datasetTitle',
  'parentDatasetID/dataDomain',
  'parentDatasetID/owner'],
 'filter_query': "startswith(parentDatasetID/dataDomain,'O') and startswith(parentDatasetID/owner,'B')",
 'expand_cols': ['parentDatasetID']}

## Generate Fake Data

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression

In [7]:
np.random.choice(np.arange(10,51))

50

In [20]:
tx_x, tx_y = make_regression(
    n_samples=200, n_features=np.random.choice(np.arange(10,51))
)

In [28]:
tx = pd.concat(list(map(pd.DataFrame, [tx_x, tx_y])), axis=1)
tx.columns = 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,0.1
0,-0.405242,1.563611,-0.560169,-0.548326,-0.265887,-1.080761,-0.399823,0.397871,0.145388,-0.945286,...,1.488113,-0.078717,0.873683,0.595999,-0.866957,0.061608,-0.696177,0.150038,-1.345082,-126.282693
1,0.577230,-1.453273,0.837880,0.000365,0.428471,0.666035,-0.051718,-1.899854,0.168371,-2.029906,...,0.884018,0.603968,0.086788,0.621469,-0.241594,0.882170,-1.309602,0.504533,1.268360,79.657430
2,-0.270537,0.229875,-0.260704,-0.579307,-0.829434,-0.380175,-0.707313,0.436839,0.643897,-0.260127,...,-0.877877,-0.401614,-0.559773,-0.075459,-0.217509,0.033785,2.447476,1.069963,0.123853,14.921889
3,-0.137173,1.455347,-0.949456,-0.062532,-0.806887,-0.644719,0.786938,0.951608,-0.426624,0.823934,...,-1.279880,-0.520409,-0.548194,1.182162,1.419393,-0.937536,0.811578,0.165678,-0.159242,-32.058162
4,-1.350423,0.064717,0.265143,-0.991653,1.261522,0.853679,0.435000,-0.145628,-0.209120,0.331257,...,0.389554,-1.252367,-0.613106,-2.144936,0.469378,-1.461348,-3.199658,-0.730193,0.286586,74.389400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-1.406874,-0.114641,0.464138,0.077306,0.265561,-0.103346,-0.439622,-0.990663,-2.630228,-0.016192,...,-0.985112,-0.714991,-0.165615,0.648288,0.061849,-0.694321,-0.626601,1.673050,-0.141113,-182.203931
196,1.015519,0.051775,-0.935499,-0.116951,1.277190,-0.234225,0.589215,-1.697893,-1.566697,1.037926,...,-0.438656,-1.314228,-0.008543,-0.114182,-1.078698,-1.271480,0.061094,0.473788,-0.433591,-159.881789
197,-2.075236,-0.020496,-0.859549,0.439129,0.632554,-1.761820,-1.026540,-0.075858,1.088921,-1.218484,...,0.272646,-0.411999,0.312554,-0.237267,-0.634291,0.402382,-0.041349,-0.952367,1.615691,-190.627092
198,0.481490,-1.471381,-0.101904,-0.866245,0.626325,0.409735,0.638157,-1.123474,-0.611361,-2.097236,...,0.209504,-0.878155,1.113632,-1.194316,-0.339853,-0.527343,-0.322131,0.451015,0.654958,49.421897
