In [1]:
class Color:
    """ANSI escape sequences used to style text written to a terminal.

    The attributes are plain class-level strings; other code concatenates
    them around the text to style and closes the styled span with ``END``.
    """
    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # SGR code 0: resets the colour/attributes set by the codes above.
    END = '\033[0m'
    
def without_color():
    """Replace every escape code on ``Color`` with the empty string.

    The change is made on the class itself, so it affects all later uses of
    ``Color`` in the process; the original codes are not saved.
    """
    for attribute in ('PURPLE', 'CYAN', 'DARKCYAN', 'BLUE', 'GREEN',
                      'YELLOW', 'RED', 'BOLD', 'UNDERLINE', 'END'):
        setattr(Color, attribute, '')


In [2]:
class Column:
    """One column of a table: name, type, synonyms and key flags.

    ``primary`` is a boolean flag. ``foreign`` is ``False`` until
    ``set_as_foreign`` stores whatever reference object it is given.
    """

    def __init__(self, name='', type=None, equivalences=None):
        self._name = name
        # Any falsy argument (None, '', []) is replaced by a fresh list so
        # that instances never share a default container.
        self._type = type or []
        self._equivalences = equivalences or []
        self.primary = False
        self.foreign = False

    # --- read-only views -------------------------------------------------

    @property
    def name(self):
        return self._name

    @property
    def type(self):
        return self._type

    @property
    def equivalences(self):
        return self._equivalences

    # --- type ------------------------------------------------------------

    def add_type(self, type):
        """Append ``type`` to the stored type container."""
        self.type.append(type)

    def get_type(self):
        """Return the stored type (same object as the ``type`` property)."""
        return self.type

    # --- synonyms --------------------------------------------------------

    def add_equivalence(self, equivalence):
        """Register one more synonym for this column."""
        self.equivalences.append(equivalence)

    def is_equivalent(self, word):
        """Return True when ``word`` is one of the registered synonyms."""
        return word in self.equivalences

    # --- key flags -------------------------------------------------------

    def is_primary(self):
        return self.primary

    def set_as_primary(self):
        self.primary = True

    def is_foreign(self):
        """Return the stored foreign-key reference, or False when unset."""
        return self.foreign

    def set_as_foreign(self, references):
        self.foreign = references


In [3]:
class Table:
    """A named table holding a list of column objects and a list of synonyms."""

    def __init__(self, name='', columns=None, equivalences=None):
        self._name = name
        # Falsy arguments are replaced by fresh lists (no shared defaults).
        self.columns = columns or []
        self.equivalences = equivalences or []

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    # --- columns ---------------------------------------------------------

    def get_number_of_columns(self):
        return len(self.columns)

    def get_columns(self):
        return self.columns

    def get_column_by_name(self, column_name):
        """Return the first column called ``column_name``, or None."""
        return next((candidate for candidate in self.columns
                     if candidate.name == column_name), None)

    def add_column(self, column_name, column_type, column_equivalences):
        """Build a ``Column`` from the three values and append it."""
        new_column = Column(column_name, column_type, column_equivalences)
        self.columns.append(new_column)

    # --- synonyms --------------------------------------------------------

    def get_equivalences(self):
        return self.equivalences

    def add_equivalence(self, equivalence):
        self.equivalences.append(equivalence)

    def is_equivalent(self, word):
        """Return True when ``word`` is a registered synonym of the table."""
        return word in self.equivalences

    # --- primary keys ----------------------------------------------------

    def get_primary_keys(self):
        """Return the column objects flagged as primary, in table order."""
        return [candidate for candidate in self.columns if candidate.is_primary()]

    def get_primary_key_names(self):
        """Return the names of the columns flagged as primary."""
        return [candidate.name for candidate in self.columns if candidate.is_primary()]

    def add_primary_key(self, primary_key_column):
        """Flag every column named ``primary_key_column`` as primary."""
        matching = (candidate for candidate in self.columns
                    if candidate.name == primary_key_column)
        for candidate in matching:
            candidate.set_as_primary()

    # --- foreign keys ----------------------------------------------------

    def get_foreign_keys(self):
        """Return the column objects that carry a foreign-key reference."""
        return [candidate for candidate in self.columns if candidate.is_foreign()]

    def get_foreign_key_names(self):
        """Return the names of the columns that carry a foreign-key reference."""
        return [candidate.name for candidate in self.columns if candidate.is_foreign()]

    def add_foreign_key(self, column_name, foreign_table, foreign_column):
        """Attach a ``{'foreign_table', 'foreign_column'}`` reference to every
        column named ``column_name`` (each gets its own dict)."""
        matching = (candidate for candidate in self.columns
                    if candidate.name == column_name)
        for candidate in matching:
            candidate.set_as_foreign(
                {'foreign_table': foreign_table, 'foreign_column': foreign_column})


In [4]:
import os
import re
class Database:
    """In-memory description of a schema, built by parsing a SQL dump.

    The parser is text based: it looks for back-quoted identifiers in
    ``CREATE TABLE`` / ``ALTER TABLE`` statements and fills ``self.tables``
    with ``Table`` objects. An optional thesaurus object (anything exposing
    ``get_synonyms_of_a_word``) supplies synonyms for table and column names.
    """

    def __init__(self):
        self.tables = []
        self.thesaurus_object = None

    def set_thesaurus(self, thesaurus):
        self.thesaurus_object = thesaurus

    # --- lookups ---------------------------------------------------------

    def get_number_of_tables(self):
        return len(self.tables)

    def get_tables(self):
        return self.tables

    def get_column_with_this_name(self, name):
        """Return the first column called ``name`` in any table, or None."""
        for table in self.tables:
            for column in table.get_columns():
                if column.name == name:
                    return column
        return None

    def get_table_by_name(self, table_name):
        """Return the first table called ``table_name``, or None."""
        for table in self.tables:
            if table.name == table_name:
                return table
        return None

    def get_tables_into_dictionary(self):
        """Return ``{table name: [column names]}`` for every table."""
        return {table.name: [column.name for column in table.get_columns()]
                for table in self.tables}

    def get_primary_keys_by_table(self):
        """Return ``{table name: [primary-key column objects]}``."""
        return {table.name: table.get_primary_keys() for table in self.tables}

    def get_foreign_keys_by_table(self):
        """Return ``{table name: [foreign-key column objects]}``."""
        return {table.name: table.get_foreign_keys() for table in self.tables}

    def get_primary_keys_of_table(self, table_name):
        """Primary-key columns of ``table_name``; None if the table is unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_primary_keys()

    def get_primary_key_names_of_table(self, table_name):
        """Primary-key column names of ``table_name``; None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_primary_key_names()

    def get_foreign_keys_of_table(self, table_name):
        """Foreign-key columns of ``table_name``; None if the table is unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_foreign_keys()

    def get_foreign_key_names_of_table(self, table_name):
        """Foreign-key column names of ``table_name``; None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_foreign_key_names()

    def add_table(self, table):
        self.tables.append(table)

    # --- loading ---------------------------------------------------------

    @staticmethod
    def _generate_path(path):
        """Join ``path`` onto the directory of this source file.

        ``__file__`` is not defined when the code runs inside a notebook or
        an interactive session, so the current working directory is used
        there instead of raising ``NameError``.
        """
        try:
            cwd = os.path.dirname(__file__)
        except NameError:
            cwd = os.getcwd()
        return os.path.join(cwd, path)

    @staticmethod
    def _split_statements(content, keyword):
        """Split ``content`` on the whole word ``keyword`` and return, for each
        piece that contains a ';', the text up to that first ';'.

        Whole-word matching keeps identifiers that merely contain the keyword
        (e.g. ``CREATED_AT``) from cutting a statement in half.
        """
        pieces = re.split(r'\b' + re.escape(keyword) + r'\b', content)
        return [piece.split(';')[0] for piece in pieces if ';' in piece]

    def load(self, path):
        """Parse the SQL file at ``path`` and add its tables to this database.

        CREATE TABLE statements are processed first so that the ALTER TABLE
        statements can find the tables they modify.
        """
        with open(path) as f:
            content = f.read()
        # The file is closed before parsing starts.

        for table_string in self._split_statements(content, 'CREATE'):
            if 'TABLE' in table_string:
                self.add_table(self.create_table(table_string))

        for alter_table_string in self._split_statements(content, 'ALTER'):
            if 'TABLE' in alter_table_string:
                self.alter_table(alter_table_string)

    def predict_type(self, string):
        """Map a SQL type fragment to 'int', 'string', 'date' or 'unknow'.

        Matching is by case-insensitive substring, tested in that order.
        """
        lowered = string.lower()
        if 'int' in lowered:
            return 'int'
        elif 'char' in lowered or 'text' in lowered:
            return 'string'
        elif 'date' in lowered:
            return 'date'
        else:
            return 'unknow'

    def create_table(self, table_string):
        """Build a ``Table`` from the text of one CREATE TABLE statement.

        The statement is read line by line:
        * a line containing ``TABLE`` gives the table name;
        * a ``PRIMARY KEY`` line that does not start with a back-quoted
          identifier is a table-level constraint naming the key columns;
        * any other line with a back-quoted identifier is a column.

        The column type is predicted from the text *after* the column
        identifier, so the column's own name cannot be mistaken for a type
        (``print_label`` contains ``int``). A column declared with an inline
        ``PRIMARY KEY`` is added and then flagged as primary.
        """
        identifier = re.compile(r"`(\w+)`")
        table = Table()
        for line in table_string.split("\n"):
            if 'TABLE' in line:
                found = identifier.search(line)
                if found is None:
                    # No back-quoted name on this line: nothing to record.
                    continue
                table.name = found.group(1)
                if self.thesaurus_object is not None:
                    table.equivalences = self.thesaurus_object.get_synonyms_of_a_word(table.name)
            elif 'PRIMARY KEY' in line and not line.lstrip().startswith('`'):
                for primary_key_column in identifier.findall(line):
                    table.add_primary_key(primary_key_column)
            else:
                found = identifier.search(line)
                if found is None:
                    continue
                column_name = found.group(1)
                column_type = self.predict_type(line[found.end():])
                if self.thesaurus_object is not None:
                    equivalences = self.thesaurus_object.get_synonyms_of_a_word(column_name)
                else:
                    equivalences = []
                table.add_column(column_name, column_type, equivalences)
                if 'PRIMARY KEY' in line:
                    table.add_primary_key(column_name)
        return table

    def alter_table(self, alter_string):
        """Apply the PRIMARY KEY / FOREIGN KEY clauses of an ALTER TABLE statement.

        Statements that name no table, or a table that has not been loaded,
        are skipped. Composite primary keys (``PRIMARY KEY (`a`,`b`)``) are
        supported, and a statement may carry both kinds of clause.
        """
        for line in alter_string.replace('\n', ' ').split(';'):
            has_primary = 'PRIMARY KEY' in line
            has_foreign = 'FOREIGN KEY' in line
            if not (has_primary or has_foreign):
                continue

            table_match = re.search(r"TABLE\s+`(\w+)`", line)
            if table_match is None:
                continue
            table = self.get_table_by_name(table_match.group(1))
            if table is None:
                continue

            if has_primary:
                for column_list in re.findall(r"PRIMARY KEY\s*\(([^)]*)\)", line):
                    for primary_key_column in re.findall(r"`(\w+)`", column_list):
                        table.add_primary_key(primary_key_column)

            if has_foreign:
                foreign_keys_list = re.findall(
                    r"FOREIGN KEY\s*\(`(\w+)`\)\s*REFERENCES\s*`(\w+)`\s*\(`(\w+)`\)", line)
                for column, foreign_table, foreign_column in foreign_keys_list:
                    table.add_foreign_key(column, foreign_table, foreign_column)

    # --- display ---------------------------------------------------------

    def print_me(self):
        """Print every table as a boxed list of ``name (type)`` rows.

        Primary-key and foreign-key columns are printed in bold with a
        leading marker. The type is passed through ``str`` so that a column
        whose type is not a string can still be printed.
        """
        for table in self.tables:
            print('+-------------------------------------+')
            print("| %25s           |" % (table.name.upper()))
            print('+-------------------------------------+')
            for column in table.columns:
                label = column.name + ' (' + str(column.get_type()) + ')'
                if column.is_primary():
                    print("| 🔑 %31s           |" % (Color.BOLD + label + Color.END))
                elif column.is_foreign():
                    print("| #️⃣ %31s           |" % (Color.BOLD + label + Color.END))
                else:
                    print("|   %23s           |" % (label))
            print('+-------------------------------------+\n')


In [5]:
# Parse the SQL file at the relative path 'database_store/city.sql' into a
# Database object and print the resulting tables.
database = Database()
database.load('database_store/city.sql')
database.print_me()

+-------------------------------------+
|                      CITY           |
+-------------------------------------+
| 🔑                [1mid (int)[0m           |
|         cityName (string)           |
+-------------------------------------+

+-------------------------------------+
|                       EMP           |
+-------------------------------------+
| 🔑                [1mid (int)[0m           |
|             name (string)           |
| #️⃣            [1mcityId (int)[0m           |
|               score (int)           |
+-------------------------------------+



In [6]:
class Select():
    """SELECT clause of a generated query.

    ``columns`` is an ordered, duplicate-free list of ``[column, column_type]``
    pairs. ``column`` is a column name (``None`` stands for ``*``) and
    ``column_type`` is a container of modifier keywords such as ``'COUNT'``
    or ``'DISTINCT'`` (``None`` is treated as "no modifier").
    """

    # Aggregate functions, in the order in which they are tested.
    _AGGREGATES = ('COUNT', 'AVG', 'SUM', 'MAX', 'MIN')

    def __init__(self):
        self.columns = []

    def add_column(self, column, column_type):
        """Append ``[column, column_type]`` unless an equal pair is present."""
        entry = [column, column_type]
        if entry not in self.columns:
            self.columns.append(entry)

    def get_columns(self):
        return self.columns

    def get_just_column_name(self, column):
        """Strip the ``table.`` prefix from a column name given as a string.

        The string ``'None'`` is returned unchanged, and a name without any
        dot is returned as is instead of raising ``IndexError``.
        """
        if column != str(None):
            return column.rsplit('.', 1)[-1]
        else:
            return column

    def print_column(self, selection):
        """Render one ``[column, column_type]`` pair as SQL text.

        DISTINCT takes precedence over the aggregates; among the aggregates
        the first of COUNT, AVG, SUM, MAX, MIN found in ``column_type`` wins.
        """
        column = selection[0]
        column_type = selection[1]
        if column_type is None:
            # No modifier at all; avoids a TypeError on the ``in`` tests below.
            column_type = []

        if column is None:
            if 'COUNT' in column_type:
                return Color.BOLD + 'COUNT(' + Color.END + '*' + Color.BOLD + ')' + Color.END
            return '*'

        if 'DISTINCT' in column_type:
            if 'COUNT' in column_type:
                return Color.BOLD + 'COUNT(DISTINCT ' + Color.END + str(column) + Color.BOLD + ')' + Color.END
            return Color.BOLD + 'DISTINCT ' + Color.END + str(column)

        for function in self._AGGREGATES:
            if function in column_type:
                return Color.BOLD + function + '(' + Color.END + str(column) + Color.BOLD + ')' + Color.END
        return str(column)

    def __str__(self):
        rendered = [str(self.print_column(selection)) for selection in self.columns]
        return Color.BOLD + 'SELECT ' + Color.END + ', '.join(rendered)

    def print_json(self, output):
        """Write the clause to the file-like ``output`` as a ``"select"`` entry.

        A single column is written inline; several columns are written as a
        ``"columns"`` array; no column yields an empty object.
        """
        count = len(self.columns)
        output.write('\t"select": {\n')
        if count == 1:
            column, column_type = self.columns[0][0], self.columns[0][1]
            output.write('\t\t"column": "' + self.get_just_column_name(str(column)) + '",\n')
            output.write('\t\t"type": "' + str(column_type) + '"\n')
        elif count > 1:
            output.write('\t\t"columns": [\n')
            for index, selection in enumerate(self.columns):
                output.write(
                    '\t\t\t{ "column": "' + self.get_just_column_name(str(selection[0])) + '",\n')
                output.write('\t\t\t  "type": "' + str(selection[1]) + '"\n')
                # Every element but the last is followed by a comma.
                output.write('\t\t\t}\n' if index == count - 1 else '\t\t\t},\n')
            output.write('\t\t]\n')
        output.write('\t},\n')


class From():
    """FROM clause of a generated query: holds the name of one table."""

    table = ''

    def __init__(self, table=None):
        # An omitted table is stored as the empty string.
        self.table = '' if table is None else table

    def set_table(self, table):
        self.table = table

    def get_table(self):
        return self.table

    def __str__(self):
        pieces = ['\n', Color.BOLD, 'FROM ', Color.END, str(self.table)]
        return ''.join(pieces)

    def print_json(self, output):
        """Write the clause to the file-like ``output`` as a ``"from"`` entry;
        the object is left empty when no table is set."""
        output.write('\t"from": {\n')
        if self.table != '':
            output.write('\t\t"table": "' + str(self.table) + '"\n')
        output.write('\t},\n')



class Condition():
    """One comparison of a WHERE clause: ``column operator value``.

    ``column_type`` optionally names a function applied to the column
    (it is rendered as ``TYPE(column)``); ``None`` means the bare column.
    """

    column = ''
    column_type = ''
    operator = ''
    value = ''

    def __init__(self, column, column_type, operator, value):
        self.column = column
        self.column_type = column_type
        self.operator = operator
        self.value = value

    def get_column(self):
        return self.column

    def get_column_type(self):
        return self.column_type

    def get_operator(self):
        return self.operator

    def get_value(self):
        return self.value

    def get_in_list(self):
        """Return ``[column, column_type, operator, value]``."""
        return [self.column, self.column_type, self.operator, self.value]

    def get_just_column_name(self, column):
        """Strip the ``table.`` prefix from a column name given as a string.

        The string ``'None'`` is returned unchanged, and a name without any
        dot is returned as is instead of raising ``IndexError``.
        """
        if column != str(None):
            return column.rsplit('.', 1)[-1]
        else:
            return column

    def get_column_with_type_operation(self, column, column_type):
        """Render ``column`` alone, or wrapped as ``column_type(column)``.

        The ``column`` argument is the one rendered (the method previously
        ignored it and always used ``self.column``).
        """
        if column_type is None:
            return column
        else:
            return Color.BOLD + str(column_type) + '(' + Color.END + str(column) + Color.BOLD + ')' + Color.END

    def get_pretty_operator(self, operator):
        """Render the operator in bold; BETWEEN is expanded to
        ``BETWEEN OOV AND``."""
        if operator == 'BETWEEN':
            return Color.BOLD + 'BETWEEN' + Color.END + ' OOV ' + Color.BOLD + 'AND' + Color.END
        else:
            return Color.BOLD + operator + Color.END

    def __str__(self):
        left = str(self.get_column_with_type_operation(self.column, self.column_type))
        middle = str(self.get_pretty_operator(self.operator))
        return left + ' ' + middle + ' ' + str(self.value)

    def print_json(self, output):
        """Write the condition to the file-like ``output`` as one JSON-style
        object, without a trailing newline or comma."""
        output.write(
            '\t\t\t{ "column": "' + self.get_just_column_name(str(self.column)) + '",\n'
            + '\t\t\t  "type": "' + str(self.column_type) + '",\n'
            + '\t\t\t  "operator": "' + str(self.operator) + '",\n'
            + '\t\t\t  "value": "' + str(self.value) + '"\n'
            + '\t\t\t}')


class Where():
    """WHERE clause of a generated query.

    ``conditions`` is an ordered list of ``[junction, clause]`` pairs. The
    junction of the first pair is ignored when rendering (it is written as
    ``WHERE``); later junctions are rendered in front of their clause.
    """

    # Kept so that ``Where.conditions`` still exists; every instance gets its
    # own list in ``__init__`` and this class-level list is never mutated.
    conditions = []

    def __init__(self, clause=None):
        # Always start from a fresh per-instance list. Appending to the
        # inherited class attribute would share conditions between all
        # ``Where`` objects created with a clause.
        self.conditions = []
        if clause is not None:
            self.conditions.append([None, clause])

    def add_condition(self, junction, clause):
        self.conditions.append([junction, clause])

    def get_conditions(self):
        return self.conditions

    def __str__(self):
        rendered = []
        for index, condition in enumerate(self.conditions):
            keyword = 'WHERE' if index == 0 else str(condition[0])
            rendered.append('\n' + Color.BOLD + keyword + Color.END + ' ' + str(condition[1]))
        return ''.join(rendered)

    def print_json(self, output):
        """Write the clause to the file-like ``output`` as a ``"where"`` entry.

        One condition is written under ``"condition"``; several are written
        under ``"conditions"`` with an ``"operator"`` object in front of every
        condition but the first. Each clause writes itself through its own
        ``print_json``.
        """
        count = len(self.conditions)
        output.write('\t"where": {\n')
        if count == 1:
            output.write('\t\t"condition": [\n')
            self.conditions[0][1].print_json(output)
            output.write('\n')
            output.write('\t\t]\n')
        elif count > 1:
            output.write('\t\t"conditions": [\n')
            for index, condition in enumerate(self.conditions):
                if index != 0:
                    output.write('\t\t\t{\n\t\t\t  "operator": "' + str(condition[0]) + '"\n\t\t\t},\n')
                condition[1].print_json(output)
                if index != count - 1:
                    output.write(',')
                output.write('\n')
            output.write('\t\t]\n')
        output.write('\t},\n')




class Query():
    """A generated query assembled from its clause objects.

    Each clause (select, from, join, where, group by, order by) is stored as
    given and may be ``None`` until the matching setter is called.
    """

    select = None
    _from = None
    join = None
    where = None
    group_by = None
    order_by = None

    def __init__(self, select=None, _from=None, join=None, where=None, group_by=None, order_by=None):
        self.select = select
        self._from = _from
        self.join = join
        self.where = where
        self.group_by = group_by
        self.order_by = order_by

    def set_select(self, select):
        self.select = select

    def get_select(self):
        return self.select

    def set_from(self, _from):
        self._from = _from

    def get_from(self):
        return self._from

    def set_join(self, join):
        self.join = join

    def get_join(self):
        return self.join

    def set_where(self, where):
        self.where = where

    def get_where(self):
        return self.where

    def set_group_by(self, group_by):
        self.group_by = group_by

    def get_group_by(self):
        return self.group_by

    def set_order_by(self, order_by):
        self.order_by = order_by

    def get_order_by(self):
        return self.order_by

    def __str__(self):
        clauses = (self.select, self._from, self.join, self.where, self.group_by, self.order_by)
        return '\n' + ''.join(str(clause) for clause in clauses) + ';\n'

    def print_json(self, filename="output.json"):
        """Append the query to ``filename`` as a brace-delimited block.

        Each clause writes its own section through its ``print_json`` method;
        clauses that are still ``None`` are skipped. The file is opened in
        append mode and is closed even if a clause raises while writing.
        """
        clauses = (self.select, self._from, self.join, self.where, self.group_by, self.order_by)
        with open(filename, 'a') as output:
            output.write('{\n')
            for clause in clauses:
                if clause is not None:
                    clause.print_json(output)
            output.write('}\n')


In [7]:
class ParsingException(Exception):
    """Parsing failure that carries a human-readable ``reason``.

    ``str()`` of the exception is the reason wrapped in the bold and red
    codes of ``Color`` and terminated by ``Color.END``.
    """

    def __init__(self, reason=''):
        self.reason = reason

    def __str__(self):
        return ''.join((Color.BOLD, Color.RED, self.reason, Color.END))


In [8]:
import re
import string
import sys
import unicodedata
import functools
from threading import Thread


class SelectParser(Thread):
    """Thread that builds one ``Select`` object per table of the FROM clause.

    The input sentence (``phrase``, a list of words) is cut after every word
    that names a selected column; the words of each piece decide which
    modifiers (AVG, COUNT, MAX, MIN, SUM, DISTINCT) apply to that column.
    Results are collected in ``select_objects`` and returned by ``join``.
    """

    def __init__(self, columns_of_select, tables_of_from, phrase, count_keywords, sum_keywords, average_keywords,
                 max_keywords, min_keywords, distinct_keywords, database_dico, database_object):
        Thread.__init__(self)
        self.select_objects = []
        self.columns_of_select = columns_of_select
        self.tables_of_from = tables_of_from
        self.phrase = phrase
        self.count_keywords = count_keywords
        self.sum_keywords = sum_keywords
        self.average_keywords = average_keywords
        self.max_keywords = max_keywords
        self.min_keywords = min_keywords
        self.distinct_keywords = distinct_keywords
        self.database_dico = database_dico
        self.database_object = database_object

    def get_tables_of_column(self, column):
        """Return the names of the tables of ``database_dico`` that list ``column``."""
        return [table for table in self.database_dico if column in self.database_dico[table]]

    def get_column_name_with_alias_table(self, column, table_of_from):
        """Return ``table.column``, preferring ``table_of_from`` when it owns
        the column and otherwise the first table that does.

        Raises ``IndexError`` when no table contains ``column``.
        """
        tables_of_column = self.get_tables_of_column(column)
        if table_of_from in tables_of_column:
            return str(table_of_from) + '.' + str(column)
        else:
            return str(tables_of_column[0]) + '.' + str(column)

    def uniquify(self, list):
        """Return the elements of ``list`` without duplicates, order preserved."""
        already = []
        for element in list:
            if element not in already:
                already.append(element)
        return already

    def _is_column_word(self, word, column_name):
        """True when ``word`` is ``column_name`` or one of its synonyms.

        A column name unknown to the database object has no synonyms, so
        only the exact comparison applies to it.
        """
        if word == column_name:
            return True
        column = self.database_object.get_column_with_this_name(column_name)
        return column is not None and word in column.equivalences

    def run(self):
        # Keyword lists paired with the modifier they trigger, in the order
        # in which the modifiers are listed for a column.
        keyword_groups = (
            (self.average_keywords, 'AVG'),
            (self.count_keywords, 'COUNT'),
            (self.max_keywords, 'MAX'),
            (self.min_keywords, 'MIN'),
            (self.sum_keywords, 'SUM'),
            (self.distinct_keywords, 'DISTINCT'),
        )

        for table_of_from in self.tables_of_from:  # one Select per FROM table
            self.select_object = Select()
            self.columns_of_select = self.uniquify(self.columns_of_select)

            if len(self.columns_of_select) == 0:
                # No explicit column: "*" or COUNT(*). The count keywords are
                # matched as substrings of the whole lower-cased sentence so
                # that multi-word keywords (e.g. "how many") match too.
                lowered_sentence = ' '.join(word.lower() for word in self.phrase)
                if any(keyword in lowered_sentence for keyword in self.count_keywords):
                    select_type = ['COUNT']
                else:
                    select_type = []
                self.select_object.add_column(None, select_type)
            else:
                # Cut the sentence after every word that names a selected
                # column. Words after the last cut belong to no column.
                select_phrases = []
                previous_index = 0
                for i in range(0, len(self.phrase)):
                    for column_name in self.columns_of_select:
                        if self._is_column_word(self.phrase[i], column_name):
                            select_phrases.append(self.phrase[previous_index:i + 1])
                            previous_index = i + 1

                # The i-th piece is attributed to the i-th selected column.
                for i, select_phrase in enumerate(select_phrases):
                    phrase = [word.lower() for word in select_phrase]
                    # Here keywords are matched against whole words.
                    select_type = [label for keywords, label in keyword_groups
                                   if any(keyword in phrase for keyword in keywords)]
                    column = self.get_column_name_with_alias_table(self.columns_of_select[i], table_of_from)
                    self.select_object.add_column(column, select_type)

            self.select_objects.append(self.select_object)

    def join(self, timeout=None):
        """Wait for the thread like ``Thread.join`` and return ``select_objects``.

        ``timeout`` is forwarded to ``Thread.join``; if it expires before the
        thread finishes, the list returned may be incomplete.
        """
        Thread.join(self, timeout)
        return self.select_objects


class FromParser(Thread):
    """Thread that builds one ``Query`` (FROM + JOIN part) per candidate table.

    For every table of ``tables_of_from`` it looks for the foreign-key path
    leading to each selected or filtered column that lives in another table.
    ``join`` returns the list of queries, or a ``ParsingException`` object
    when some column cannot be reached.
    """

    def __init__(self, tables_of_from, columns_of_select, columns_of_where, database_object):
        Thread.__init__(self)
        self.queries = []
        self.tables_of_from = tables_of_from
        self.columns_of_select = columns_of_select
        self.columns_of_where = columns_of_where

        self.database_object = database_object
        self.database_dico = self.database_object.get_tables_into_dictionary()

    def get_tables_of_column(self, column):
        """Return the names of the tables of ``database_dico`` that list ``column``."""
        return [table for table in self.database_dico if column in self.database_dico[table]]

    def intersect(self, a, b):
        """Return the elements common to ``a`` and ``b`` (order not guaranteed)."""
        return list(set(a) & set(b))

    def difference(self, a, b):
        """Return the elements of ``a`` that are not in ``b``, order preserved."""
        return [element for element in a if element not in b]

    def is_direct_join_is_possible(self, table_src, table_trg):
        """Return the foreign-key link between the two tables, or None.

        The link is ``[(table_src, column), (table_trg, column)]``. A foreign
        key declared on ``table_src`` is looked for first, then one declared
        on ``table_trg``. A table unknown to the database object is treated
        as having no foreign key.
        """
        fk_column_of_src_table = self.database_object.get_foreign_keys_of_table(table_src) or []
        fk_column_of_trg_table = self.database_object.get_foreign_keys_of_table(table_trg) or []

        for column in fk_column_of_src_table:
            if column.is_foreign()['foreign_table'] == table_trg:
                return [(table_src, column.name), (table_trg, column.is_foreign()['foreign_column'])]

        for column in fk_column_of_trg_table:
            if column.is_foreign()['foreign_table'] == table_src:
                return [(table_src, column.is_foreign()['foreign_column']), (table_trg, column.name)]

        # No declared foreign key in either direction. (A fallback matching
        # primary-key names across the two tables is not implemented.)
        return None

    def get_all_direct_linked_tables_of_a_table(self, table_src):
        """Return the direct links from ``table_src`` to every other table."""
        links = []
        for table_trg in self.database_dico:
            if table_trg != table_src:
                link = self.is_direct_join_is_possible(table_src, table_trg)
                if link is not None:
                    links.append(link)
        return links

    def is_join(self, historic, table_src, table_trg):
        """Search a chain of links from ``table_src`` to ``table_trg``.

        ``historic`` lists the tables already visited and is extended in
        place. The result is ``[0, link]`` for a direct link, a list starting
        with ``0`` followed by the links from the target back to the source
        for an indirect one, or ``[]`` when nothing is found. ``get_link``
        turns this raw result into a clean path.
        """
        links = self.get_all_direct_linked_tables_of_a_table(table_src)
        links = [join for join in links if join[0][0] not in historic]

        for join in links:
            if join[1][0] == table_trg:
                return [0, join]

        path = []
        historic.append(table_src)

        for join in links:
            sub_path = self.is_join(historic, join[1][0], table_trg)
            if sub_path != []:
                # Keep the path found through this neighbour and record the
                # link that leads to it.
                path = sub_path
                path.append(join)
        return path

    def get_link(self, table_src, table_trg):
        """Return the links to follow from ``table_src`` to ``table_trg``,
        source first, or ``[]`` when the target cannot be reached."""
        path = self.is_join([], table_src, table_trg)
        if len(path) > 0:
            path.pop(0)  # drop the leading 0 marker
            path.reverse()
        return path

    def unique(self, _list):
        """Return ``_list`` (a list of sequences) without duplicates, as lists;
        order is not preserved."""
        return [list(x) for x in set(tuple(x) for x in _list)]

    def unique_ordered(self, _list):
        """Return ``_list`` without duplicates, order preserved."""
        frequency = []
        for element in _list:
            if element not in frequency:
                frequency.append(element)
        return frequency

    def _link_columns(self, table_of_from, columns, join_object, links):
        """Register on ``join_object`` the table of every column of ``columns``
        that is not in ``table_of_from`` and extend ``links`` with the path to it.

        Returns None on success, or a ``ParsingException`` for the first
        column whose table cannot be reached.
        """
        for column in columns:
            if column not in self.database_dico[table_of_from]:
                foreign_table = self.get_tables_of_column(column)[0]
                join_object.add_table(foreign_table)
                link = self.get_link(table_of_from, foreign_table)

                if not link:
                    return ParsingException(
                        "There is at least column `" + column + "` that is unreachable from table `" + table_of_from.upper() + "`!")
                links.extend(link)
        return None

    def run(self):
        self.queries = []

        for table_of_from in self.tables_of_from:
            links = []
            query = Query()
            query.set_from(From(table_of_from))
            join_object = Join()

            # Selected columns are linked first, then filtered columns.
            for columns in (self.columns_of_select, self.columns_of_where):
                error = self._link_columns(table_of_from, columns, join_object, links)
                if error is not None:
                    # The exception object replaces the result list and the
                    # remaining tables are not processed.
                    self.queries = error
                    return

            join_object.set_links(self.unique_ordered(links))
            query.set_join(join_object)
            self.queries.append(query)

    def join(self, timeout=None):
        """Wait for the thread like ``Thread.join`` and return ``queries``.

        ``timeout`` is forwarded to ``Thread.join``; if it expires before the
        thread finishes, the value returned may be incomplete.
        """
        Thread.join(self, timeout)
        return self.queries


class WhereParser(Thread):
    def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords,
                 average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords,
                 negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords,
                 database_dico, database_object):
        """Store the inputs of the WHERE analysis; the work itself is done in ``run``.

        :param phrases: iterable of phrases; ``run`` indexes each phrase word by word.
        :param tables_of_from: tables for which one WHERE object each is built.
        :param columns_of_values_of_where: values that ``run`` assigns to the conditions.
        :param count_keywords: (and every other ``*_keywords`` argument) keywords
            searched for in the phrases by ``run``.
        :param distinct_keywords: stored, but not read by any method of this class.
        :param database_dico: mapping of table name to a container of column names.
        :param database_object: object providing ``get_table_by_name(name).get_columns()``.
        """
        Thread.__init__(self)
        # Result list, filled by run() and returned by join().
        self.where_objects = []
        self.phrases = phrases
        self.tables_of_from = tables_of_from
        self.columns_of_values_of_where = columns_of_values_of_where
        self.count_keywords = count_keywords
        self.sum_keywords = sum_keywords
        self.average_keywords = average_keywords
        self.max_keywords = max_keywords
        self.min_keywords = min_keywords
        self.greater_keywords = greater_keywords
        self.less_keywords = less_keywords
        self.between_keywords = between_keywords
        self.negation_keywords = negation_keywords
        self.junction_keywords = junction_keywords
        self.disjunction_keywords = disjunction_keywords
        self.like_keywords = like_keywords
        self.distinct_keywords = distinct_keywords
        self.database_dico = database_dico
        self.database_object = database_object

    def get_tables_of_column(self, column):
        tmp_table = []
        for table in self.database_dico:
            if column in self.database_dico[table]:
                tmp_table.append(table)
        return tmp_table

    def get_column_name_with_alias_table(self, column, table_of_from):
        one_table_of_column = self.get_tables_of_column(column)[0]
        tables_of_column = self.get_tables_of_column(column)
        if table_of_from in tables_of_column:
            return str(table_of_from) + '.' + str(column)
        else:
            return str(one_table_of_column) + '.' + str(column)

    def intersect(self, a, b):
        """Return the distinct elements present in both ``a`` and ``b`` as a list (order unspecified)."""
        shared = set(a).intersection(set(b))
        return list(shared)

    def predict_operation_type(self, previous_column_offset, current_column_offset):
        interval_offset = list(range(previous_column_offset, current_column_offset))
        if (len(self.intersect(interval_offset, self.count_keyword_offset)) >= 1):
            return 'COUNT'
        elif (len(self.intersect(interval_offset, self.sum_keyword_offset)) >= 1):
            return 'SUM'
        elif (len(self.intersect(interval_offset, self.average_keyword_offset)) >= 1):
            return 'AVG'
        elif (len(self.intersect(interval_offset, self.max_keyword_offset)) >= 1):
            return 'MAX'
        elif (len(self.intersect(interval_offset, self.min_keyword_offset)) >= 1):
            return 'MIN'
        else:
            return None

    def predict_operator(self, current_column_offset, next_column_offset):
        interval_offset = list(range(current_column_offset, next_column_offset))

        if (len(self.intersect(interval_offset, self.negation_keyword_offset)) >= 1) and (
                    len(self.intersect(interval_offset, self.greater_keyword_offset)) >= 1):
            return '<'
        elif (len(self.intersect(interval_offset, self.negation_keyword_offset)) >= 1) and (
                    len(self.intersect(interval_offset, self.less_keyword_offset)) >= 1):
            return '>'
        if (len(self.intersect(interval_offset, self.less_keyword_offset)) >= 1):
            return '<'
        elif (len(self.intersect(interval_offset, self.greater_keyword_offset)) >= 1):
            return '>'
        elif (len(self.intersect(interval_offset, self.between_keyword_offset)) >= 1):
            return 'BETWEEN'
        elif (len(self.intersect(interval_offset, self.negation_keyword_offset)) >= 1):
            return '!='
        elif (len(self.intersect(interval_offset, self.like_keyword_offset)) >= 1):
            return 'LIKE'
        else:
            return '='

    def predict_junction(self, previous_column_offset, current_column_offset):
        interval_offset = list(range(previous_column_offset, current_column_offset))
        junction = 'AND'
        if (len(self.intersect(interval_offset, self.disjunction_keyword_offset)) >= 1):
            return 'OR'
        elif (len(self.intersect(interval_offset, self.junction_keyword_offset)) >= 1):
            return 'AND'

        first_encountered_junction_offset = -1
        first_encountered_disjunction_offset = -1

        for offset in self.junction_keyword_offset:
            if offset >= current_column_offset:
                first_encountered_junction_offset = offset
                break

        for offset in self.disjunction_keyword_offset:
            if offset >= current_column_offset:
                first_encountered_disjunction_offset = offset
                break

        if first_encountered_junction_offset >= first_encountered_disjunction_offset:
            return 'AND'
        else:
            return 'OR'

    def uniquify(self, list):
        """Return the elements of ``list`` with later duplicates dropped, order preserved.

        Elements are compared with ``==`` through list membership, so they do
        not need to be hashable.
        """
        kept = []
        for candidate in list:
            if candidate in kept:
                continue
            kept.append(candidate)
        return kept

    def run(self):
        number_of_where_columns = 0
        columns_of_where = []
        offset_of = {}
        column_offset = []
        self.count_keyword_offset = []
        self.sum_keyword_offset = []
        self.average_keyword_offset = []
        self.max_keyword_offset = []
        self.min_keyword_offset = []
        self.greater_keyword_offset = []
        self.less_keyword_offset = []
        self.between_keyword_offset = []
        self.junction_keyword_offset = []
        self.disjunction_keyword_offset = []
        self.negation_keyword_offset = []
        self.like_keyword_offset = []

        for phrase in self.phrases:
            phrase_offset_string = ''
            for i in range(0, len(phrase)):
                for table_name in self.database_dico:
                    columns = self.database_object.get_table_by_name(table_name).get_columns()
                    for column in columns:
                        if (phrase[i] == column.name) or (phrase[i] in column.equivalences):
                            number_of_where_columns += 1
                            columns_of_where.append(column.name)
                            offset_of[phrase[i]] = i
                            column_offset.append(i)
                            break
                    else:
                        continue
                    break

                phrase_keyword = str(phrase[i]).lower()  # for robust keyword matching
                phrase_offset_string += phrase_keyword + " "

                for keyword in self.count_keywords:
                    if keyword in phrase_offset_string :    # before the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.count_keyword_offset.append(i)

                for keyword in self.sum_keywords:
                    if keyword in phrase_offset_string :    # before the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.sum_keyword_offset.append(i)

                for keyword in self.average_keywords:
                    if keyword in phrase_offset_string :    # before the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.average_keyword_offset.append(i)

                for keyword in self.max_keywords:
                    if keyword in phrase_offset_string :    # before the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.max_keyword_offset.append(i)

                for keyword in self.min_keywords:
                    if keyword in phrase_offset_string :    # before the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.min_keyword_offset.append(i)

                for keyword in self.greater_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.greater_keyword_offset.append(i)

                for keyword in self.less_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.less_keyword_offset.append(i)

                for keyword in self.between_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.between_keyword_offset.append(i)

                for keyword in self.junction_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.junction_keyword_offset.append(i)

                for keyword in self.disjunction_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.disjunction_keyword_offset.append(i)

                for keyword in self.negation_keywords:
                    if keyword in phrase_offset_string :
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.negation_keyword_offset.append(i)

                for keyword in self.like_keywords:
                    if keyword in phrase_offset_string :    # after the column
                        if (phrase_offset_string.find(keyword) + len(keyword) + 1 == len(phrase_offset_string) ) :
                            self.like_keyword_offset.append(i)


        for table_of_from in self.tables_of_from:
            where_object = Where()
            for i in range(0, len(column_offset)):
                current = column_offset[i]

                if i == 0:
                    previous = 0
                else:
                    previous = column_offset[i - 1]

                if i == (len(column_offset) - 1):
                    _next = 999
                else:
                    _next = column_offset[i + 1]

                junction = self.predict_junction(previous, current)
                column = self.get_column_name_with_alias_table(columns_of_where[i], table_of_from)
                operation_type = self.predict_operation_type(previous, current)

                if len(self.columns_of_values_of_where) > i:
                    value = self.columns_of_values_of_where[
                        len(self.columns_of_values_of_where) - len(columns_of_where) + i]
                else:
                    value = 'OOV'  # Out Of Vocabulary: default value

                operator = self.predict_operator(current, _next)
                where_object.add_condition(junction, Condition(column, operation_type, operator, value))
            self.where_objects.append(where_object)

    def join(self):
        """Wait for the thread to finish and return ``self.where_objects``.

        ``run`` appends one ``Where`` object per table of ``tables_of_from``.
        Unlike ``Thread.join`` no timeout can be passed.
        """
        Thread.join(self)
        return self.where_objects




class Parser:
    # Class-level defaults. ``__init__`` rebinds every one of these names on the
    # instance, so the mutable lists below only act as placeholders.
    # ``distinct_keywords`` has no class-level default: it is set in ``__init__`` only.
    database_object = None
    database_dico = None

    count_keywords = []
    sum_keywords = []
    average_keywords = []
    max_keywords = []
    min_keywords = []
    junction_keywords = []
    disjunction_keywords = []
    greater_keywords = []
    less_keywords = []
    between_keywords = []
    order_by_keywords = []
    asc_keywords = []
    desc_keywords = []
    group_by_keywords = []
    negation_keywords = []
    equal_keywords = []
    like_keywords = []

    def __init__(self, database, config):
        """Bind the parser to a database description and a keyword configuration.

        :param database: object providing ``get_tables_into_dictionary()``; it is
            kept as ``database_object`` and its dictionary form as ``database_dico``.
        :param config: object providing one ``get_*_keywords()`` getter per keyword
            family read below; each result is stored on the parser.
        """
        self.database_object = database
        self.database_dico = self.database_object.get_tables_into_dictionary()

        self.count_keywords = config.get_count_keywords()
        self.sum_keywords = config.get_sum_keywords()
        self.average_keywords = config.get_avg_keywords()
        self.max_keywords = config.get_max_keywords()
        self.min_keywords = config.get_min_keywords()
        self.junction_keywords = config.get_junction_keywords()
        self.disjunction_keywords = config.get_disjunction_keywords()
        self.greater_keywords = config.get_greater_keywords()
        self.less_keywords = config.get_less_keywords()
        self.between_keywords = config.get_between_keywords()
        self.order_by_keywords = config.get_order_by_keywords()
        self.asc_keywords = config.get_asc_keywords()
        self.desc_keywords = config.get_desc_keywords()
        self.group_by_keywords = config.get_group_by_keywords()
        self.negation_keywords = config.get_negation_keywords()
        self.equal_keywords = config.get_equal_keywords()
        self.like_keywords = config.get_like_keywords()
        self.distinct_keywords = config.get_distinct_keywords()

    @staticmethod
    def _myCmp(s1,s2):
        if len(s1.split()) == len(s2.split()) :
            if len(s1) >= len(s2) :
                return 1
            else:
                return -1
        else:
            if len(s1.split()) >= len(s2.split()):
                return 1
            else:
                return -1


    @classmethod
    def transformation_sort(cls,transition_list):
        # Sort on basis of two keys split length and then token lengths in the respective priority.
        return sorted(transition_list, key=functools.cmp_to_key(cls._myCmp),reverse=True)


    def remove_accents(self, string):
        """Return ``str(string)`` in NFKD form with every combining character removed."""
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_characters = filter(lambda char: not unicodedata.combining(char), decomposed)
        return "".join(base_characters)

    def parse_sentence(self, sentence, stopwordsFilter=None):
        """Parse a natural-language ``sentence`` into a list of query objects.

        Steps, in order:

        1. optionally filter the sentence through ``stopwordsFilter.filter``;
        2. extract the WHERE values that follow an assigner keyword
           (equal/like/greater/less/negation keywords, ``:`` or ``=``);
        3. split the words into SELECT, FROM and WHERE phrases from the positions
           of table and column names (or their equivalences);
        4. split the WHERE phrase further into WHERE / ORDER BY / GROUP BY phrases;
        5. run the five clause parsers as threads and assemble their results.

        :param sentence: the sentence to parse.
        :param stopwordsFilter: optional object with a ``filter(sentence)`` method.
        :return: the queries built by ``FromParser`` (one per FROM table), each
            completed with its SELECT, WHERE, GROUP BY and ORDER BY parts.
        :raises ParsingException: when no keyword or no table is found, when a
            column is unreachable from a table, or when a parser thread cannot be
            created, started or joined.
        """
        # NOTE: process-wide side effect -- tracebacks stay hidden for every later exception.
        sys.tracebacklimit = 0  # Remove traceback from Exception

        number_of_table = 0
        number_of_select_column = 0
        number_of_where_column = 0
        last_table_position = 0
        columns_of_select = []
        columns_of_where = []

        if stopwordsFilter is not None:
            sentence = stopwordsFilter.filter(sentence)

        # Trailing punctuation is dropped, except quotes which may close a value.
        input_for_finding_value = sentence.rstrip(string.punctuation.replace('"', '').replace("'", ""))
        columns_of_values_of_where = []

        filter_list = [",", "!"]

        for filter_element in filter_list:
            input_for_finding_value = input_for_finding_value.replace(filter_element, " ")

        input_word_list = input_for_finding_value.split()

        number_of_where_column_temp = 0
        number_of_table_temp = 0
        last_table_position_temp = 0
        start_phrase = ''
        med_phrase = ''

        # TODO: merge this part of the algorithm (detection of values of where)
        #  in the rest of the parsing algorithm

        # First scan: locate where the "values" part of the sentence begins, i.e.
        # everything after the leading words and the table names.
        for i in range(0, len(input_word_list)):
            for table_name in self.database_dico:
                if (input_word_list[i] == table_name) or (
                            input_word_list[i] in self.database_object.get_table_by_name(table_name).equivalences):
                    if number_of_table_temp == 0:
                        start_phrase = input_word_list[:i]
                    number_of_table_temp += 1
                    last_table_position_temp = i

                columns = self.database_object.get_table_by_name(table_name).get_columns()
                for column in columns:
                    if (input_word_list[i] == column.name) or (input_word_list[i] in column.equivalences):
                        if number_of_where_column_temp == 0:
                            med_phrase = input_word_list[len(start_phrase):last_table_position_temp + 1]
                        number_of_where_column_temp += 1
                        break
                    else:
                        if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (
                                    i == (len(input_word_list) - 1)):
                            med_phrase = input_word_list[len(start_phrase):]
                else:
                    # No column of this table matched: try the next table.
                    continue
                # A column matched: stop scanning tables for this word.
                break

        end_phrase = input_word_list[len(start_phrase) + len(med_phrase):]

        irext = ' '.join(end_phrase)

        # TODO: set this part of the algorithm (detection of values of where) in the WhereParser thread

        if irext:
            irext = self.remove_accents(irext.lower())

            filter_list = [",", "!"]

            for filter_element in filter_list:
                irext = irext.replace(filter_element, " ")

            assignment_list = self.equal_keywords + self.like_keywords + self.greater_keywords + self.less_keywords + self.negation_keywords
            # As these words can also be part of assigners

            # custom operators added as they can be possibilities
            assignment_list.append(':')
            assignment_list.append('=')

            # Algorithmic logic for best substitution for extraction of values with the help of assigners.
            assignment_list = self.transformation_sort(assignment_list)

            # Placeholder tokens substituted for the assigner keywords.
            maverickjoy_general_assigner = "*res*@3#>>*"
            maverickjoy_like_assigner = "*like*@3#>>*"

            for assigner in assignment_list:
                if assigner in self.like_keywords:
                    assigner = str(" " + assigner + " ")
                    irext = irext.replace(assigner, str(" " + maverickjoy_like_assigner + " "))
                else:
                    assigner = str(" " + assigner + " ")
                    # Reason for adding " " these is according to the LOGIC implemented assigner operators help us extract the value,
                    # hence they should be independent entities not part of some other big entity else logic will fail.
                    # for eg -> "show data for city where cityName where I like to risk my life  is Pune" will end up extacting ,
                    # 'k' and '1' both. I know its a lame sentence but something like this could be a problem.

                    irext = irext.replace(assigner, str(" " + maverickjoy_general_assigner + " "))

            # replace all spaces from values to <_> for proper value assignment in SQL
            # eg. (where name is 'abc def') -> (where name is abc<_>def)
            for i in re.findall("(['\"].*?['\"])", irext):
                irext = irext.replace(i, i.replace(' ', '<_>').replace("'", '').replace('"', ''))

            irext_list = irext.split()

            # The word following a placeholder is the value, unless it is itself a placeholder.
            for idx, x in enumerate(irext_list):
                index = idx + 1
                if x == maverickjoy_like_assigner:
                    if index < len(irext_list) and irext_list[index] != maverickjoy_like_assigner and irext_list[index] !=\
                            maverickjoy_general_assigner:
                        # replace back <_> to spaces from the values assigned
                        columns_of_values_of_where.append(str("'%" + str(irext_list[index]).replace('<_>', ' ') + "%'"))

                if x == maverickjoy_general_assigner:
                    if index < len(irext_list) and irext_list[index] != maverickjoy_like_assigner and irext_list[index] != \
                            maverickjoy_general_assigner:
                        # replace back <_> to spaces from the values assigned
                        columns_of_values_of_where.append(str("'" + str(irext_list[index]).replace('<_>', ' ') + "'"))

        # -----------------------------------------------------------------------------------------------------------

        tables_of_from = []
        select_phrase = ''
        from_phrase = ''
        where_phrase = ''

        words = re.findall(r"[\w]+", self.remove_accents(sentence))

        # Second scan: collect tables and columns, and cut the SELECT / FROM phrases.
        for i in range(0, len(words)):
            for table_name in self.database_dico:
                if (words[i] == table_name) or (
                            words[i] in self.database_object.get_table_by_name(table_name).equivalences):
                    if number_of_table == 0:
                        select_phrase = words[:i]
                    tables_of_from.append(table_name)
                    number_of_table += 1
                    last_table_position = i

                columns = self.database_object.get_table_by_name(table_name).get_columns()
                for column in columns:
                    if (words[i] == column.name) or (words[i] in column.equivalences):
                        # Columns seen before any table belong to SELECT, later ones to WHERE.
                        if number_of_table == 0:
                            columns_of_select.append(column.name)
                            number_of_select_column += 1
                        else:
                            if number_of_where_column == 0:
                                from_phrase = words[len(select_phrase):last_table_position + 1]
                            columns_of_where.append(column.name)
                            number_of_where_column += 1
                        break
                    else:
                        if (number_of_table != 0) and (number_of_where_column == 0) and (i == (len(words) - 1)):
                            from_phrase = words[len(select_phrase):]

        where_phrase = words[len(select_phrase) + len(from_phrase):]

        if (number_of_select_column + number_of_table + number_of_where_column) == 0:
            raise ParsingException("No keyword found in sentence!")

        if len(tables_of_from) > 0:
            # Cut the FROM phrase after each table name.
            from_phrases = []
            previous_index = 0
            for i in range(0, len(from_phrase)):
                for table in tables_of_from:
                    if (from_phrase[i] == table) or (
                                from_phrase[i] in self.database_object.get_table_by_name(table).equivalences):
                        from_phrases.append(from_phrase[previous_index:i + 1])
                        previous_index = i + 1

            last_junction_word_index = -1

            for i in range(0, len(from_phrases)):
                number_of_junction_words = 0
                number_of_disjunction_words = 0

                for word in from_phrases[i]:
                    if word in self.junction_keywords:
                        number_of_junction_words += 1
                    if word in self.disjunction_keywords:
                        number_of_disjunction_words += 1

                if (number_of_junction_words + number_of_disjunction_words) > 0:
                    last_junction_word_index = i

            # Sub-phrases up to the last one holding a junction/disjunction word stay in FROM
            # (only the first one when none does); the rest is moved in front of WHERE.
            if last_junction_word_index == -1:
                from_phrase = sum(from_phrases[:1], [])
                where_phrase = sum(from_phrases[1:], []) + where_phrase
            else:
                from_phrase = sum(from_phrases[:last_junction_word_index + 1], [])
                where_phrase = sum(from_phrases[last_junction_word_index + 1:], []) + where_phrase

        real_tables_of_from = []

        for word in from_phrase:
            for table in tables_of_from:
                if (word == table) or (word in self.database_object.get_table_by_name(table).equivalences):
                    real_tables_of_from.append(table)

        tables_of_from = real_tables_of_from

        if len(tables_of_from) == 0:
            raise ParsingException("No table name found in sentence!")

        group_by_phrase = []
        order_by_phrase = []
        new_where_phrase = []
        previous_index = 0
        previous_phrase_type = 0  # 0: where, 1: order by, 2: group by
        yet_where = 0

        for i in range(0, len(where_phrase)):
            if where_phrase[i] in self.order_by_keywords:
                if yet_where > 0:
                    if previous_phrase_type == 1:
                        order_by_phrase.append(where_phrase[previous_index:i])
                    elif previous_phrase_type == 2:
                        group_by_phrase.append(where_phrase[previous_index:i])
                else:
                    new_where_phrase.append(where_phrase[previous_index:i])
                previous_index = i
                previous_phrase_type = 1
                yet_where += 1
            if where_phrase[i] in self.group_by_keywords:
                if yet_where > 0:
                    if previous_phrase_type == 1:
                        order_by_phrase.append(where_phrase[previous_index:i])
                    elif previous_phrase_type == 2:
                        group_by_phrase.append(where_phrase[previous_index:i])
                else:
                    new_where_phrase.append(where_phrase[previous_index:i])
                previous_index = i
                previous_phrase_type = 2
                yet_where += 1

        if previous_phrase_type == 1:
            order_by_phrase.append(where_phrase[previous_index:])
        elif previous_phrase_type == 2:
            group_by_phrase.append(where_phrase[previous_index:])
        else:
            new_where_phrase.append(where_phrase)

        try:
            select_parser = SelectParser(columns_of_select, tables_of_from, select_phrase, self.count_keywords,
                                         self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords,
                                         self.distinct_keywords, self.database_dico, self.database_object)
            from_parser = FromParser(tables_of_from, columns_of_select, columns_of_where, self.database_object)
            where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where,
                                       self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords,
                                       self.min_keywords, self.greater_keywords, self.less_keywords,
                                       self.between_keywords, self.negation_keywords, self.junction_keywords,
                                       self.disjunction_keywords, self.like_keywords, self.distinct_keywords,
                                       self.database_dico, self.database_object)
            group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico, self.database_object)
            order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords,
                                            self.database_dico, self.database_object)

            select_parser.start()
            from_parser.start()
            where_parser.start()
            group_by_parser.start()
            order_by_parser.start()

            queries = from_parser.join()
        except Exception:
            # KeyboardInterrupt / SystemExit are deliberately allowed to propagate.
            raise ParsingException("Parsing error occured in thread!")

        # FromParser reports an unreachable column by handing back the exception instead of a list.
        if isinstance(queries, ParsingException):
            raise queries

        try:
            select_objects = select_parser.join()
            where_objects = where_parser.join()
            group_by_objects = group_by_parser.join()
            order_by_objects = order_by_parser.join()
        except Exception:
            raise ParsingException("Parsing error occured in thread!")

        # Each parser produced one object per FROM table, in the same order as the queries.
        for i in range(0, len(queries)):
            query = queries[i]
            query.set_select(select_objects[i])
            query.set_where(where_objects[i])
            query.set_group_by(group_by_objects[i])
            query.set_order_by(order_by_objects[i])

        return queries


In [9]:
class Thesaurus:
    """Word -> list-of-synonyms mapping that can be filled from a thesaurus text file."""

    def __init__(self):
        # word (str) -> list of synonyms
        self.dictionary = {}

    def add_entry(self, word, synonyms):
        """Set (or replace) the synonyms stored for ``word``."""
        self.dictionary[word] = synonyms

    def add_synonym_to_a_word(self, word, synonym):
        """Append one synonym to ``word``; the entry is created when it does not exist yet."""
        self.dictionary.setdefault(word, []).append(synonym)

    def add_synonyms_to_a_word(self, word, synonyms):
        """Extend the synonyms of ``word`` with ``synonyms``, creating the entry if needed.

        A new entry stores a copy, so later additions never modify the caller's list.
        """
        if word in self.dictionary:
            self.dictionary[word] += synonyms
        else:
            self.dictionary[word] = list(synonyms)

    def get_synonyms_of_a_word(self, word):
        """Return the synonyms stored for ``word``, or ``None`` when the word is unknown."""
        return self.dictionary.get(word)

    def remove_accents(self, string):
        """Return ``str(string)`` in NFKD form without combining characters."""
        nkfd_form = unicodedata.normalize('NFKD', str(string))
        return "".join([c for c in nkfd_form if not unicodedata.combining(c)])

    @staticmethod
    def _generate_path(path):
        """Resolve ``path`` against the directory of this file.

        ``__file__`` does not exist when the code runs as a notebook cell; the
        current working directory is used as the base in that case.
        """
        try:
            cwd = os.path.dirname(__file__)
        except NameError:
            cwd = os.getcwd()
        filename = os.path.join(cwd, path)
        return filename

    def load(self, path):
        """Load the entries of the thesaurus file at ``path``.

        A line without ``(`` is read as a word line (``word|...``) and the line
        that follows it as ``(category)|synonym|synonym...``. Line endings are
        removed before splitting, blank lines are ignored, and a word line that
        is the last line of the file (no synonym line after it) is skipped.
        """
        with open(self._generate_path(path)) as f:
            content = f.readlines()

        # we jump content[0] because it is the encoding-type line : useless to parse.
        # The last line is never a word line with a follower, hence len(content) - 1.
        for line_id in range(1, len(content) - 1):
            entry = content[line_id].rstrip("\r\n")
            if not entry.strip() or '(' in entry:
                continue
            word = self.remove_accents(entry.split("|")[0])
            synonyms = self.remove_accents(content[line_id + 1].rstrip("\r\n")).split("|")
            synonyms.pop(0)  # first field is the category, not a synonym
            self.add_synonyms_to_a_word(word, synonyms)

    def print_me(self):
        """Print every word followed by its synonyms."""
        for keys, values in list(self.dictionary.items()):
            print(keys)
            print(values)


In [10]:
import os
import re
import unicodedata


class StopwordFilter:
    """Removes stopwords from sentences; the stopword list can be loaded from a file."""

    def __init__(self):
        # Stopwords, kept as a list (returned as-is by get_stopword_list).
        self.list = []

    def add_stopword(self, word):
        """Append ``word`` to the stopword list (no normalisation is applied here)."""
        self.list.append(word)

    def get_stopword_list(self):
        """Return the stopword list itself (not a copy)."""
        return self.list

    def filter(self, sentence):
        """Return ``sentence`` lower-cased, without accents, punctuation or stopwords.

        The sentence is split into ``\\w+`` tokens; the kept tokens are joined
        with single spaces.
        """
        stopwords = set(self.list)  # one set per call instead of a list scan per word
        words = re.findall(r"[\w]+", self.remove_accents(sentence))
        kept = []
        for word in words:
            word = self.remove_accents(word).lower()
            if word not in stopwords:
                kept.append(word)
        return " ".join(kept)

    def remove_accents(self, string):
        """Return ``str(string)`` in NFKD form without combining characters."""
        nkfd_form = unicodedata.normalize('NFKD', str(string))
        return "".join([c for c in nkfd_form if not unicodedata.combining(c)])

    @staticmethod
    def _generate_path(path):
        """Resolve ``path`` against the directory of this file.

        ``__file__`` does not exist when the code runs as a notebook cell; the
        current working directory is used as the base in that case.
        """
        try:
            cwd = os.path.dirname(__file__)
        except NameError:
            cwd = os.getcwd()
        filename = os.path.join(cwd, path)
        return filename

    def load(self, path):
        """Add one stopword per non-blank line of the file at ``path``.

        Each line is stripped of surrounding whitespace, accents are removed and
        the result is lower-cased; blank lines (including the empty string after
        a final newline) are not stored.
        """
        with open(self._generate_path(path)) as f:
            for line in f.read().splitlines():
                stopword = self.remove_accents(line).strip().lower()
                if stopword:
                    self.add_stopword(stopword)


In [13]:
import os
import unicodedata


class LangConfig:
    """Per-language keyword lists, one list per query concept, read from a text file."""

    # Attribute filled from line N of the language file, in file order.
    # The same order is used by print_me().
    _KEYWORD_FIELDS = (
        'avg_keywords',
        'sum_keywords',
        'max_keywords',
        'min_keywords',
        'count_keywords',
        'junction_keywords',
        'disjunction_keywords',
        'greater_keywords',
        'less_keywords',
        'between_keywords',
        'order_by_keywords',
        'asc_keywords',
        'desc_keywords',
        'group_by_keywords',
        'negation_keywords',
        'equal_keywords',
        'like_keywords',
        'distinct_keywords',
    )

    def __init__(self):
        # every keyword list starts empty until load() is called
        self.avg_keywords = []
        self.sum_keywords = []
        self.max_keywords = []
        self.min_keywords = []
        self.count_keywords = []
        self.junction_keywords = []
        self.disjunction_keywords = []
        self.greater_keywords = []
        self.less_keywords = []
        self.between_keywords = []
        self.order_by_keywords = []
        self.asc_keywords = []
        self.desc_keywords = []
        self.group_by_keywords = []
        self.negation_keywords = []
        self.equal_keywords = []
        self.like_keywords = []
        self.distinct_keywords = []

    def get_avg_keywords(self):
        """Return the list stored in ``avg_keywords``."""
        return self.avg_keywords

    def get_sum_keywords(self):
        """Return the list stored in ``sum_keywords``."""
        return self.sum_keywords

    def get_max_keywords(self):
        """Return the list stored in ``max_keywords``."""
        return self.max_keywords

    def get_min_keywords(self):
        """Return the list stored in ``min_keywords``."""
        return self.min_keywords

    def get_count_keywords(self):
        """Return the list stored in ``count_keywords``."""
        return self.count_keywords

    def get_junction_keywords(self):
        """Return the list stored in ``junction_keywords``."""
        return self.junction_keywords

    def get_disjunction_keywords(self):
        """Return the list stored in ``disjunction_keywords``."""
        return self.disjunction_keywords

    def get_greater_keywords(self):
        """Return the list stored in ``greater_keywords``."""
        return self.greater_keywords

    def get_less_keywords(self):
        """Return the list stored in ``less_keywords``."""
        return self.less_keywords

    def get_between_keywords(self):
        """Return the list stored in ``between_keywords``."""
        return self.between_keywords

    def get_order_by_keywords(self):
        """Return the list stored in ``order_by_keywords``."""
        return self.order_by_keywords

    def get_asc_keywords(self):
        """Return the list stored in ``asc_keywords``."""
        return self.asc_keywords

    def get_desc_keywords(self):
        """Return the list stored in ``desc_keywords``."""
        return self.desc_keywords

    def get_group_by_keywords(self):
        """Return the list stored in ``group_by_keywords``."""
        return self.group_by_keywords

    def get_negation_keywords(self):
        """Return the list stored in ``negation_keywords``."""
        return self.negation_keywords

    def get_equal_keywords(self):
        """Return the list stored in ``equal_keywords``."""
        return self.equal_keywords

    def get_like_keywords(self):
        """Return the list stored in ``like_keywords``."""
        return self.like_keywords

    def get_distinct_keywords(self):
        """Return the list stored in ``distinct_keywords``."""
        return self.distinct_keywords

    def remove_accents(self, string):
        """Return ``str(string)`` in NFKD form with all combining characters removed."""
        nkfd_form = unicodedata.normalize('NFKD', str(string))
        return "".join([c for c in nkfd_form if not unicodedata.combining(c)])

    @staticmethod
    def _generate_path(path):
        """Join ``path`` onto the directory of this module.

        Not used by load(), which opens the path it is given directly.
        ``__file__`` does not exist in a notebook or interactive session, so
        the current working directory is used there instead of raising
        NameError.
        """
        try:
            base_dir = os.path.dirname(__file__)
        except NameError:
            base_dir = os.getcwd()
        return os.path.join(base_dir, path)

    def _parse_keyword_line(self, line):
        """Turn one file line into a list of normalized keywords.

        ':' is treated like ',', the line is split on ',', and the first
        field is discarded. The remaining fields are trimmed, accent-stripped
        and lowercased.
        """
        fields = line.replace(':', ',').split(",")
        return [self.remove_accents(field.strip()).lower() for field in fields[1:]]

    def load(self, path):
        """Read the keyword lists from the file at ``path``.

        Line N (0-based) of the file fills the N-th attribute named in
        ``_KEYWORD_FIELDS``. This includes line 13, which populates
        ``group_by_keywords``; that line used to be skipped, leaving the
        list permanently empty.

        :raises IndexError: if the file has fewer lines than there are
            keyword lists (attributes for the lines that were present have
            already been assigned by then).
        """
        with open(path) as f:
            content = f.readlines()
        for line_index, field in enumerate(self._KEYWORD_FIELDS):
            setattr(self, field, self._parse_keyword_line(content[line_index]))

    def print_me(self):
        """Print each keyword list on its own line, in file order."""
        for field in self._KEYWORD_FIELDS:
            print(getattr(self, field))


In [14]:
#!/usr/bin/python3
import argparse
import os
class Ln2sql:
    """Entry point that wires a database description, a language file and
    optional thesaurus / stopword helpers into a sentence parser."""

    def __init__(
            self,
            database_path,
            language_path,
            json_output_path=None,
            thesaurus_path=None,
            stopwords_path=None,
            color=False
    ):
        """Load every resource and build the parser.

        :param database_path: passed to ``Database.load``.
        :param language_path: passed to ``LangConfig.load``.
        :param json_output_path: when truthy, get_query() also writes each
            query through ``print_json`` to this path.
        :param thesaurus_path: when truthy, a Thesaurus is loaded from it and
            attached to the database.
        :param stopwords_path: when truthy, a StopwordFilter is loaded from it
            and handed to the parser on every get_query() call.
        :param color: colors are disabled when this compares equal to False.
        """
        # equality (not truthiness) is intentional here: None leaves colors on
        if color == False:
            without_color()

        schema = Database()
        self.stopwordsFilter = None

        if thesaurus_path:
            synonyms = Thesaurus()
            synonyms.load(thesaurus_path)
            schema.set_thesaurus(synonyms)

        if stopwords_path:
            self.stopwordsFilter = StopwordFilter()
            self.stopwordsFilter.load(stopwords_path)

        schema.load(database_path)
        schema.print_me()

        language = LangConfig()
        language.load(language_path)

        self.parser = Parser(schema, language)
        self.json_output_path = json_output_path

    def get_query(self, input_sentence):
        """Parse ``input_sentence`` and return all resulting queries concatenated.

        Each query is also printed. When a JSON output path is configured,
        the existing file is removed first and every query is then written
        to it via ``print_json``.
        """
        parsed = self.parser.parse_sentence(input_sentence, self.stopwordsFilter)

        json_target = self.json_output_path
        if json_target:
            self.remove_json(json_target)
            for item in parsed:
                item.print_json(json_target)

        rendered = []
        for item in parsed:
            rendered.append(str(item))
            print(item)

        return ''.join(rendered)

    def remove_json(self, filename="output.json"):
        """Delete ``filename`` if it exists; do nothing otherwise."""
        if not os.path.exists(filename):
            return
        os.remove(filename)


In [None]:
# Build the translator from the city schema and the English keyword file,
# then translate a single question.
ln2sql = Ln2sql(
    database_path='database_store/city.sql',
    language_path='lang_store/english.csv',
)
ln2sql.get_query('what is the id of the city ')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



+-------------------------------------+
|                      CITY           |
+-------------------------------------+
| 🔑                        id (int)           |
|         cityName (string)           |
+-------------------------------------+

+-------------------------------------+
|                       EMP           |
+-------------------------------------+
| 🔑                        id (int)           |
|             name (string)           |
| #️⃣                    cityId (int)           |
|               score (int)           |
ERROR! Session/line number was not unique in database. History logging moved to new session 102
+-------------------------------------+

[]
Traceback (most recent call last):
NameError: name 'GroupByParser' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
ParsingException: Parsing error occured in thread!

During handling of the above exception, another exception occurred:

Trace