In [4]:
import json 
import sqlparse
from sqlparse.sql import Identifier, IdentifierList, remove_quotes, Token, TokenList, Where
from sqlparse.tokens import Keyword, Name, Punctuation, String, Whitespace
from sqlparse.utils import imt
import pandas as pd
from flatten_dict import flatten
from pprint import pprint

In [17]:
## Files imported

## counter queries (sql+redis)
json_file_path = '/Users/mathieupeychet/Downloads/sql-export.json'

## foreign_key csv generated
foreign_key_df = pd.read_csv('/Users/mathieupeychet/Documents/foreign_keys.csv')

## To join an event to a ultimate_namespace_id, we have 3 potential standard tables to join
table_to_join = ['projects', 'namespaces', 'groups']

In [16]:

def sql_queries_dict(json_file):
    ''' 
    function that transforms the sql-export.json file into a Python dict with only SQL batch counters
    '''
    with open(json_file) as f:
        data = json.load(f)

    from flatten_dict.reducer import make_reducer
    full_payload_dict = flatten(data, reducer=make_reducer(delimiter='.'))

    sql_queries_dict  = {}

    for (key, value) in full_payload_dict.items():
       # Check if key is even then add pair to new dictionary
       if isinstance(value, str) and str.startswith(value, 'SELECT') is True:
           sql_queries_dict[key] = value
    
    return sql_queries_dict

sql_queries_dict = sql_queries_dict(json_file_path)

In [20]:

def create_join_mapping_df(sql_queries_dict):
    '''
        The functoin returns a dataframe with the following columns
        - counter: name of the counter, which is the item key in the dictionary passed
        - sql_query: query run to calculate the counter, item value in the dictionary passed as argument
        - table_name: name of the table in the FROM statement of the sql_query
        - foreingn_table_name: 
        - foreign_column_name:
    '''
    final_join_mapping_df = pd.DataFrame()
    for key, value in sql_queries_dict.items():
        sql_value = sqlparse.parse(value)[0]

        ## get the table which is queried in the FROM statement
        queried_table = sql_value.get_name()

        ## 
        potential_joins = foreign_key_df[(foreign_key_df['table_name'] == queried_table) & (foreign_key_df['foreign_table_name'].isin(table_to_join))]
        potential_joins = potential_joins.drop_duplicates()

        if potential_joins[potential_joins.foreign_table_name == 'projects'].empty is False:
            table_to_append = potential_joins[potential_joins.foreign_table_name == 'projects']
            table_to_append["counter"] = key
            table_to_append["sql_query"] = value
            final_join_mapping_df = final_join_mapping_df.append(table_to_append, ignore_index=True)
        elif potential_joins[potential_joins.foreign_table_name == 'groups'].empty is False:
            table_to_append = potential_joins[potential_joins.foreign_table_name == 'groups']
            table_to_append["counter"] = key
            table_to_append["sql_query"] = value
            final_join_mapping_df = final_join_mapping_df.append(table_to_append, ignore_index=True)
        elif potential_joins[potential_joins.foreign_table_name == 'namespaces'].empty is False:
            table_to_append = potential_joins[potential_joins.foreign_table_name == 'namespaces']
            table_to_append["counter"] = key
            table_to_append["sql_query"] = value
            final_join_mapping_df = final_join_mapping_df.append(table_to_append, ignore_index=True)        
    potential_joins = foreign_key_df[(foreign_key_df['table_name'] == 'issues') & (foreign_key_df['foreign_table_name'].isin(table_to_join))]
    return final_join_mapping_df

final_join_mapping_df = create_join_mapping_df(sql_queries_dict)

In [19]:
def _update_table_names(
    tables: List[str], tokens: List[sqlparse.sql.Token], index: int, last_keyword: str
) -> List[str]:
    """
    Return new table names matching database.table or database.schema.table notation
    :type tables list[str]
    :type tokens list[sqlparse.sql.Token]
    :type index int
    :type last_keyword str
    :rtype: list[str]
    """

    token = tokens[index]
    last_token = tokens[index - 1].value.upper() if index > 0 else None
    next_token = tokens[index + 1].value.upper() if index + 1 < len(tokens) else None

    if (
        last_keyword
        in [
            "FROM",
            "JOIN",
            "INNER JOIN",
            "FULL JOIN",
            "FULL OUTER JOIN",
            "LEFT JOIN",
            "RIGHT JOIN",
            "LEFT OUTER JOIN",
            "RIGHT OUTER JOIN",
            "INTO",
            "UPDATE",
            "TABLE",
        ]
        and last_token not in ["AS"]
        and token.value not in ["AS", "SELECT"]
    ):
        if last_token == "." and next_token != ".":
            # we have database.table notation example
            table_name = "{}.{}".format(tokens[index - 2], tokens[index])
            if len(tables) > 0:
                tables[-1] = table_name
            else:
                tables.append(table_name)

        schema_notation_match = (Name, ".", Name, ".", Name)
        schema_notation_tokens = (
            (
                tokens[index - 4].ttype,
                tokens[index - 3].value,
                tokens[index - 2].ttype,
                tokens[index - 1].value,
                tokens[index].ttype,
            )
            if len(tokens) > 4
            else None
        )
        if schema_notation_tokens == schema_notation_match:
            # we have database.schema.table notation example
            table_name = "{}.{}.{}".format(
                tokens[index - 4], tokens[index - 2], tokens[index]
            )
            if len(tables) > 0:
                tables[-1] = table_name
            else:
                tables.append(table_name)
        elif tokens[index - 1].value.upper() not in [",", last_keyword]:
            # it's not a list of tables, e.g. SELECT * FROM foo, bar
            # hence, it can be the case of alias without AS, e.g. SELECT * FROM foo bar
            pass
        else:
            table_name = str(token.value.strip("`"))
            tables.append(table_name)

    return tables


NameError: name 'List' is not defined

In [54]:
sql_test = 'SELECT test FROM "services" LEFT JOIN "users" ON users.id = services.id WHERE users.id = 5'

sql_test = sql_test.replace('"', "")
#print(sql_test)
parsed = sqlparse.parse(sql_test)
tokens = TokenList(parsed[0].tokens).flatten()
# print([(token.value, token.ttype) for token in tokens])

test = [token for token in tokens if token.ttype is not Whitespace]

table_syntax_keywords = [
    # SELECT queries
    "FROM",
    "WHERE",
    "JOIN",
    "INNER JOIN",
    "FULL JOIN",
    "FULL OUTER JOIN",
    "LEFT OUTER JOIN",
    "RIGHT OUTER JOIN",
    "LEFT JOIN",
    "RIGHT JOIN",
    "ON",
    # INSERT queries
    "INTO",
    "VALUES",
    # UPDATE queries
    "UPDATE",
    "SET",
    # Hive queries
    "TABLE",  # INSERT TABLE
]

tables = []
last_keyword = None
last_token=None

print(test)
for index, token in enumerate(test):
    print(token.ttype is Name)
    #print([token, token.ttype, last_token, last_keyword, token.is_keyword, index])
    if token.is_keyword and token.value.upper() == 'WHERE':
        # keep the name of the last keyword, the next one can be a table name
        where_index = index
        print(where_index)
    elif (
        token.is_keyword
        and str(token).upper() == "SELECT"
    ):
        # reset the last_keyword for "INSERT INTO SELECT" and "INSERT TABLE SELECT" queries
        last_keyword = None
        select_index = index
        print(2)
    elif token.ttype is Name or token.is_keyword:
        tables.append(str(token))
        print(3)
        
print(tables)
tables[0] = 'replace'
tables

[<DML 'SELECT' at 0x11CFDFE28>, <Name 'test' at 0x11CE84468>, <Keyword 'FROM' at 0x11CE844C8>, <Name 'servic...' at 0x11CE84588>, <Keyword 'LEFT J...' at 0x11CE84348>, <Name 'users' at 0x11CE84708>, <Keyword 'ON' at 0x11CE841C8>, <Name 'users' at 0x11CE84948>, <Punctuation '.' at 0x11CE849A8>, <Name 'id' at 0x11CE84A08>, <Comparison '=' at 0x11CE84F48>, <Name 'servic...' at 0x11CE84D08>, <Punctuation '.' at 0x11CE84B88>, <Name 'id' at 0x11CE84A68>, <Keyword 'WHERE' at 0x11CE84C48>, <Name 'users' at 0x11CE84E88>, <Punctuation '.' at 0x11CE84DC8>, <Name 'id' at 0x11CE84E28>, <Comparison '=' at 0x11CF217C8>, <Integer '5' at 0x11CF21888>]
False
2
True
3
False
3
True
3
False
3
True
3
False
3
True
3
False
True
3
False
True
3
False
True
3
False
14
True
3
False
True
3
False
False
['test', 'FROM', 'services', 'LEFT JOIN', 'users', 'ON', 'users', 'id', 'services', 'id', 'users', 'id']


['replace',
 'FROM',
 'services',
 'LEFT JOIN',
 'users',
 'ON',
 'users',
 'id',
 'services',
 'id',
 'users',
 'id']

In [6]:
for index, row in final_join.iterrows():
   print(row['table_name'])

In [7]:
for index, row in final_join.iterrows():
    if row.table_name != 'users':
        join_to_insert = 'LEFT JOIN ' + row.foreign_table_name + ' ON ' + row.foreign_table_name + '.' + row.foreign_column_name + ' = ' + row.table_name + '.' + row.column_name
        if row.foreign_table_name == 'projects':
            join_to_insert += ' LEFT JOIN namespaces ON projects.namespace_id = namespaces.id '
        print(join_to_insert)
    parsed_sql_query = sqlparse.parse(row.sql_query)[0]
    parsed_join_to_insert = sqlparse.parse(join_to_insert)
    parsed_sql_query.insert_before(-1, parsed_join_to_insert[0])
    print(parsed_sql_query)
    print(row.counter)
        

    
group_by_statement = 'GROUP BY 1'
print(group_by_statement)


GROUP BY 1


In [25]:
## verify it is a where clause

parsed = sqlparse.parse(value)
where = parsed[0][-1]
print(where)
where_clause = False
for i in where.tokens:
    if str(i).find('WHERE') >= 0:
        where_clause = True
    
for i in parsed[0].tokens:
    try:
        for j in i.tokens:

            if str(j) == '"services"':
                print(j)
    except:
        pass
    
idx, _ = parsed[0].token_next_by(m=(Identifier, 'WHERE'))
print(idx)

WHERE "services"."type" = 'GithubService' AND "services"."pipeline_events" = TRUE AND "services"."active" = TRUE AND "projects"."created_at" BETWEEN '2020-11-17 19:22:32.723497' AND '2020-12-15 19:22:32.723560'
"services"
None


In [107]:
import sql_metadata

sql_metadata.get_query_tables(value)


['projects', 'services']

In [28]:
import json 
import sqlparse
from sqlparse.sql import Identifier, IdentifierList, remove_quotes, Token, TokenList, Where
from sqlparse.tokens import Keyword, Name, Punctuation, String, Whitespace
from sqlparse.utils import imt
import pandas as pd

sql_first = sql_queries_dict["counts.issues"]

parsed = sqlparse.parse(sql_first)[0]
test_list = []

print(parsed.get_name())

parsed._pprint_tree()
where_statment = parsed[-1]
select_statement = parsed

for x in select_statement:
    test_list.append(str(x))
    
test_list

left_join = ' LEFT JOIN projects ON test.project_id = projects.id'
test_list.append(left_join)
test_list

None
|- 0 DML 'SELECT'
|- 1 Whitespace ' '
|- 2 Function 'COUNT(...'
|  |- 0 Identifier 'COUNT'
|  |  `- 0 Name 'COUNT'
|  `- 1 Parenthesis '("issu...'
|     |- 0 Punctuation '('
|     |- 1 Identifier '"issue...'
|     |  |- 0 Symbol '"issue...'
|     |  |- 1 Punctuation '.'
|     |  `- 2 Symbol '"{:sta...'
|     `- 2 Punctuation ')'
|- 3 Whitespace ' '
|- 4 Keyword 'FROM'
|- 5 Whitespace ' '
`- 6 Identifier '"issue...'
   `- 0 Symbol '"issue...'


['SELECT',
 ' ',
 'COUNT("issues"."{:start=>1, :finish=>465}")',
 ' ',
 'FROM',
 ' ',
 '"issues"',
 ' LEFT JOIN projects ON test.project_id = projects.id']

In [120]:

table_to_look = 'issues'
table_to_join = ['projects', 'namespaces', 'groups']
foreign_key_df.head(20)

potential_joins = foreign_key_df[(foreign_key_df['table_name'] == table_to_look) & (foreign_key_df['foreign_table_name'].isin(table_to_join))]

for index, row in final_join.iterrows():
    if row.table_name != 'users':
        print('LEFT JOIN ' + row.foreign_table_name + ' ON ' + row.foreign_table_name + '.' + row.foreign_column_name + ' = ' + row.table_name + '.' + row.column_name)
        if row.foreign_table_name == 'projects':
            print('LEFT JOIN namespaces ON projects.namespace_id = namespaces.id')
    print(row.table_name)
    print(row.counter)

    
group_by_statement = 'GROUP BY 1'
print(group_by_statement)


users
active_user_count
LEFT JOIN projects ON projects.id = boards.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
boards
counts.boards
LEFT JOIN projects ON projects.id = ci_builds.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
ci_builds
counts.ci_builds
LEFT JOIN projects ON projects.id = ci_pipelines.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
ci_pipelines
counts.ci_internal_pipelines
LEFT JOIN projects ON projects.id = ci_pipelines.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
ci_pipelines
counts.ci_external_pipelines
LEFT JOIN projects ON projects.id = ci_pipelines.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
ci_pipelines
counts.ci_pipeline_config_auto_devops
LEFT JOIN projects ON projects.id = ci_pipelines.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
ci_pipelines
counts.ci_pipeline_config_repository
LEFT JOIN projects O

users
counts.user_preferences_group_overview_security_dashboard
LEFT JOIN projects ON projects.id = deployments.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
deployments
counts.ingress_modsecurity_logging
LEFT JOIN projects ON projects.id = deployments.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
deployments
counts.ingress_modsecurity_blocking
LEFT JOIN projects ON projects.id = deployments.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
deployments
counts.ingress_modsecurity_disabled
LEFT JOIN projects ON projects.id = deployments.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
deployments
counts.ingress_modsecurity_not_installed
LEFT JOIN projects ON projects.id = container_expiration_policies.project_id
LEFT JOIN namespaces ON projects.namespace_id = namespaces.id
container_expiration_policies
counts.projects_with_expiration_policy_disabled
LEFT JOIN projects ON projects.id = con

In [31]:
isinstance(where_statment, Where)

False

In [32]:
idx, _ = parsed.token_next_by(m=(Keyword, "BETWEEN"))
print(idx)
if idx is not None:
    _, token = parsed.token_next(idx=idx)
    if token:
        if isinstance(token, IdentifierList):
            # In case of "LIMIT <offset>, <limit>", find comma and extract
            # first succeeding non-whitespace token
            idx, _ = token.token_next_by(m=(sqlparse.tokens.Punctuation, ","))
            _, token = token.token_next(idx=idx)
        if token and token.ttype == sqlparse.tokens.Literal.Number.Integer:
            print(int(token.value))

where = next(token for token in parsed.tokens if isinstance(token, Where))


None


StopIteration: 

In [102]:
elements = sqlparse.parse(value)
tok_list = elements[0].tokens

isinstance(tok_list[-1], Where)

tok_list

## add group by at the end
## add joins (what if there is a namespace, group or project table already)
## add namespace_id


[<DML 'SELECT' at 0x11CF32948>,
 <Whitespace ' ' at 0x15AC873A8>,
 <Function 'COUNT(...' at 0x15B55DCF0>,
 <Whitespace ' ' at 0x15AC87888>,
 <Keyword 'FROM' at 0x15AC878E8>,
 <Whitespace ' ' at 0x15AC87948>,
 <Identifier '"proje...' at 0x15AA2B228>,
 <Whitespace ' ' at 0x15AC87A08>,
 <Keyword 'INNER ...' at 0x15AC87A68>,
 <Whitespace ' ' at 0x15AC87AC8>,
 <Identifier '"servi...' at 0x15AA2B2A0>,
 <Whitespace ' ' at 0x15AC87B88>,
 <Keyword 'ON' at 0x15AC87BE8>,
 <Whitespace ' ' at 0x15AC87C48>,
 <Comparison '"servi...' at 0x15AA2B390>,
 <Whitespace ' ' at 0x15AC98048>,
 <Keyword 'AND' at 0x15AC980A8>,
 <Whitespace ' ' at 0x15AC98108>,
 <Comparison '"servi...' at 0x15AA2B408>,
 <Whitespace ' ' at 0x15AC98408>,
 <Where 'WHERE ...' at 0x15B55DD68>]

In [105]:
for index, token in enumerate(tok_list):
    #Token.Keyword.DML
    if (token.is_keyword and str(token) == 'SELECT') is True:
        select_index = index
    if isinstance(token, Where) is True:
        where_index = index

new_tok_list = tok_list[:]
new_tok_list.insert(where_index + 1 , ' GROUP BY 1')
new_tok_list.insert(where_index, ' LEFT JOIN projects ON services.id = projects.project_id')
new_tok_list.insert(select_index + 1, ' namespace_id,')

tok_list_str = [str(token) for token in new_tok_list]
query_stringed = ''.join(tok_list_str)
query_parsed = sqlparse.parse(query_stringed)
tok_list_transformed = query_parsed[0].tokens

print(query_stringed)
print(tok_list_transformed)
print(tok_list)

SELECT namespace_id, COUNT(DISTINCT "projects"."creator_id") FROM "projects" INNER JOIN "services" ON "services"."project_id" = "projects"."id" AND "services"."type" = 'GithubService'  LEFT JOIN projects ON services.id = projects.project_idWHERE "services"."type" = 'GithubService' AND "services"."pipeline_events" = TRUE AND "services"."active" = TRUE AND "projects"."created_at" BETWEEN '2020-11-17 19:22:32.723497' AND '2020-12-15 19:22:32.723560' GROUP BY 1
[<DML 'SELECT' at 0x15AF07048>, <Whitespace ' ' at 0x15A8F2A08>, <IdentifierList 'namesp...' at 0x15AF875E8>, <Whitespace ' ' at 0x15A90A0A8>, <Keyword 'FROM' at 0x15A90A108>, <Whitespace ' ' at 0x15A90A168>, <Identifier '"proje...' at 0x15AF87228>, <Whitespace ' ' at 0x15A90A228>, <Keyword 'INNER ...' at 0x15A90A288>, <Whitespace ' ' at 0x15A90A2E8>, <Identifier '"servi...' at 0x15AF87318>, <Whitespace ' ' at 0x15A90A3A8>, <Keyword 'ON' at 0x15A90A408>, <Whitespace ' ' at 0x15A90A468>, <Comparison '"servi...' at 0x15AF87408>, <Whit