In [1]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

Using TensorFlow backend.


In [5]:
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
# English stop-word set built from the NLTK stopwords corpus.
# NOTE(review): no other code visible in this notebook reads stop_words — confirm it is still needed.
stop_words = set(stopwords.words('english')) 

In [12]:
# Text file of natural-language questions, one per line; consumed by data_preprocesssing().
data_path = 'data/test.txt'

In [63]:
import os
import re


In [131]:
from nltk.corpus import wordnet
def isSynom(word1,word2):
    """Tell whether ``word2`` is listed as a lemma of any WordNet synset of ``word1``.

    The lookup is one-directional: only the synsets of ``word1`` are
    inspected, and ``word2`` is compared verbatim against their lemma names.

    Returns:
        bool: True as soon as one synset of ``word1`` contains ``word2``,
        False when none does (including when ``word1`` has no synsets).
    """
    return any(word2 in synset.lemma_names() for synset in wordnet.synsets(word1))


In [76]:
class Column:
    """A table column: its name, type information, synonyms and key flags.

    Args:
        name: Column name.
        type: Type information for the column. Any falsy value (None, '',
            []) is replaced by a fresh empty list.
        equivalences: Alternative words for the column. Any falsy value is
            replaced by a fresh empty list.
    """

    def __init__(self, name='', type=None, equivalences=None):
        self._name = name
        # Falsy arguments get a new list per instance, so no mutable default
        # is ever shared between columns.
        self._type = type if type else []
        self._equivalences = equivalences if equivalences else []

        self.primary = False
        # Either False or the reference mapping stored by set_as_foreign().
        self.foreign = False

    @property
    def name(self):
        """Column name (read-only)."""
        return self._name

    @property
    def get_type(self):
        """Type information; a property, so read it as ``column.get_type``."""
        return self._type

    def add_type(self, type):
        """Append ``type`` to the column's type list.

        Expects the stored type information to be a list (the default when
        the column was built without a ``type`` argument).
        """
        # The backing attribute is ``_type``; the previous ``self.type``
        # lookup did not exist and raised AttributeError on every call.
        self._type.append(type)

    @property
    def equivalences(self):
        """Alternative words registered for this column."""
        return self._equivalences

    def add_equivalence(self, equivalence):
        """Register one more alternative word for this column."""
        self.equivalences.append(equivalence)

    def is_equivalent(self, word):
        """Return True when ``word`` is one of the registered equivalences."""
        return word in self.equivalences

    def is_primary(self):
        """Return the primary-key flag."""
        return self.primary

    def set_as_primary(self):
        """Flag this column as (part of) the primary key."""
        self.primary = True

    def is_foreign(self):
        """Return False, or the value given to set_as_foreign()."""
        return self.foreign

    def set_as_foreign(self, references):
        """Mark the column as a foreign key by storing ``references``."""
        self.foreign = references


In [65]:
class Table:
    """A named collection of columns plus the table's alternative names.

    Args:
        name: Table name.
        columns: Initial column objects; a falsy value yields a new empty list.
        equivalences: Alternative words for the table; a falsy value yields a
            new empty list.
    """

    def __init__(self, name='', columns=None, equivalences=None):
        self._name = name
        self.columns = columns or []
        self.equivalences = equivalences or []

    @property
    def name(self):
        """Table name."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    # ---- columns -----------------------------------------------------

    def get_number_of_columns(self):
        """Return how many columns the table holds."""
        return len(self.columns)

    def get_columns(self):
        """Return the underlying list of column objects."""
        return self.columns

    def get_column_by_name(self, column_name):
        """Return the first column called ``column_name``, or None."""
        matches = (col for col in self.columns if col.name == column_name)
        return next(matches, None)

    def add_column(self, column_name, column_type, column_equivalences):
        """Create a Column from the three arguments and append it."""
        new_column = Column(column_name, column_type, column_equivalences)
        self.columns.append(new_column)

    # ---- equivalences ------------------------------------------------

    def get_equivalences(self):
        """Return the list of alternative words for this table."""
        return self.equivalences

    def add_equivalence(self, equivalence):
        """Register one more alternative word for this table."""
        self.equivalences.append(equivalence)

    def is_equivalent(self, word):
        """Return True when ``word`` is a registered equivalence."""
        return word in self.equivalences

    # ---- primary keys ------------------------------------------------

    def get_primary_keys(self):
        """Return the column objects flagged as primary."""
        return [col for col in self.columns if col.is_primary()]

    def get_primary_key_names(self):
        """Return the names of the columns flagged as primary."""
        return [col.name for col in self.columns if col.is_primary()]

    def add_primary_key(self, primary_key_column):
        """Flag every column named ``primary_key_column`` as primary."""
        for col in self.columns:
            if col.name != primary_key_column:
                continue
            col.set_as_primary()

    # ---- foreign keys ------------------------------------------------

    def get_foreign_keys(self):
        """Return the column objects flagged as foreign."""
        return [col for col in self.columns if col.is_foreign()]

    def get_foreign_key_names(self):
        """Return the names of the columns flagged as foreign."""
        return [col.name for col in self.columns if col.is_foreign()]

    def add_foreign_key(self, column_name, foreign_table, foreign_column):
        """Flag every column named ``column_name`` as referencing another table."""
        for col in self.columns:
            if col.name != column_name:
                continue
            col.set_as_foreign({'foreign_table': foreign_table, 'foreign_column': foreign_column})

In [80]:
class Database:
    """In-memory model of a SQL schema parsed from a dump file.

    Parsing is keyword and line based. Only identifiers wrapped in
    back-quotes are recognised, and tables are kept in the order they are
    loaded.
    """

    # A back-quoted identifier; the group captures the bare name.
    _IDENTIFIER = r"`(\w+)`"
    # A PRIMARY KEY clause; the group captures the parenthesised column list.
    _PRIMARY_KEY = r"PRIMARY KEY \(([^)]*)\)"
    # A FOREIGN KEY clause; groups are (column, foreign table, foreign column).
    _FOREIGN_KEY = r"FOREIGN KEY \(`(\w+)`\) REFERENCES `(\w+)` \(`(\w+)`\)"

    def __init__(self):
        self.tables = []
        self.thesaurus_object = None

    def set_thesaurus(self, thesaurus):
        """Attach the object used to look up synonyms while parsing."""
        self.thesaurus_object = thesaurus

    def get_number_of_tables(self):
        """Return how many tables have been added."""
        return len(self.tables)

    def get_tables(self):
        """Return the underlying list of tables."""
        return self.tables

    def get_column_with_this_name(self, name):
        """Return the first column called ``name`` in any table, or None."""
        for table in self.tables:
            for column in table.get_columns():
                if column.name == name:
                    return column
        return None

    def get_table_by_name(self, table_name):
        """Return the first table called ``table_name``, or None."""
        for table in self.tables:
            if table.name == table_name:
                return table
        return None

    def get_tables_into_dictionary(self):
        """Return ``{table name: [column names]}`` for every table."""
        return {
            table.name: [column.name for column in table.get_columns()]
            for table in self.tables
        }

    def get_primary_keys_by_table(self):
        """Return ``{table name: [primary-key columns]}``."""
        return {table.name: table.get_primary_keys() for table in self.tables}

    def get_foreign_keys_by_table(self):
        """Return ``{table name: [foreign-key columns]}``."""
        return {table.name: table.get_foreign_keys() for table in self.tables}

    def get_primary_keys_of_table(self, table_name):
        """Return the primary-key columns of ``table_name``, or None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_primary_keys()

    def get_primary_key_names_of_table(self, table_name):
        """Return the primary-key column names of ``table_name``, or None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_primary_key_names()

    def get_foreign_keys_of_table(self, table_name):
        """Return the foreign-key columns of ``table_name``, or None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_foreign_keys()

    def get_foreign_key_names_of_table(self, table_name):
        """Return the foreign-key column names of ``table_name``, or None if unknown."""
        table = self.get_table_by_name(table_name)
        return None if table is None else table.get_foreign_key_names()

    def add_table(self, table):
        """Append ``table`` to the schema."""
        self.tables.append(table)

    @staticmethod
    def _generate_path(path):
        """Join ``path`` onto this module's directory.

        Falls back to the current working directory when ``__file__`` is not
        defined, as happens in an interactive or notebook session.
        """
        try:
            base_dir = os.path.dirname(__file__)
        except NameError:
            base_dir = os.getcwd()
        return os.path.join(base_dir, path)

    @staticmethod
    def _statements_after(content, keyword):
        """Return the text between each ``keyword`` and the next ``;``.

        The chunk preceding the first ``keyword`` is skipped: it is whatever
        came before the first such statement, not a statement of that kind.
        """
        return [
            chunk.split(';')[0]
            for chunk in content.split(keyword)[1:]
            if ';' in chunk
        ]

    def load(self, path):
        """Parse the SQL file at ``path`` and populate ``self.tables``.

        CREATE TABLE statements are processed first so that the ALTER TABLE
        statements that follow can find their target tables.
        """
        with open(path) as f:
            content = f.read()

        for statement in self._statements_after(content, 'CREATE'):
            if 'TABLE' in statement:
                self.add_table(self.create_table(statement))

        for statement in self._statements_after(content, 'ALTER'):
            if 'TABLE' in statement:
                self.alter_table(statement)

    def predict_type(self, string):
        """Classify a column definition as 'int', 'string', 'date' or 'unknow'.

        Back-quoted identifiers are removed before matching so that a column
        name such as `points` is not mistaken for an integer type.
        """
        definition = re.sub(r"`[^`]*`", "", string).lower()
        if 'int' in definition:
            return 'int'
        if 'char' in definition or 'text' in definition:
            return 'string'
        if 'date' in definition:
            return 'date'
        # Spelling kept as-is: callers may compare against this exact value.
        return 'unknow'

    def create_table(self, table_string):
        """Build a Table from the body of one CREATE TABLE statement.

        Args:
            table_string: The statement text, one clause per line.

        Returns:
            The populated Table.
        """
        table = Table()
        for line in table_string.split("\n"):
            # A column definition starts with its back-quoted name; table-level
            # constraints start with a keyword instead.
            is_column_definition = line.lstrip().startswith('`')

            if 'TABLE' in line:
                table.name = re.search(self._IDENTIFIER, line).group(1)
                if self.thesaurus_object is not None:
                    table.equivalences = self.thesaurus_object.get_synonyms_of_a_word(table.name)
            elif 'PRIMARY KEY' in line and not is_column_definition:
                for primary_key_column in re.findall(self._IDENTIFIER, line):
                    table.add_primary_key(primary_key_column)
            elif 'FOREIGN KEY' in line and not is_column_definition:
                # Register the reference rather than mistaking the constraint
                # for a column definition.
                for column, foreign_table, foreign_column in re.findall(self._FOREIGN_KEY, line):
                    table.add_foreign_key(column, foreign_table, foreign_column)
            else:
                column_match = re.search(self._IDENTIFIER, line)
                if column_match is None:
                    continue
                column_name = column_match.group(1)
                if self.thesaurus_object is not None:
                    equivalences = self.thesaurus_object.get_synonyms_of_a_word(column_name)
                else:
                    equivalences = []
                table.add_column(column_name, self.predict_type(line), equivalences)
                # Inline form: `id` int ... PRIMARY KEY. The column must be
                # added before it can be flagged.
                if 'PRIMARY KEY' in line:
                    table.add_primary_key(column_name)
        return table

    def alter_table(self, alter_string):
        """Apply the PRIMARY KEY / FOREIGN KEY clauses of an ALTER TABLE statement.

        Statements that name no back-quoted table, or a table that has not
        been loaded, are ignored.
        """
        for line in alter_string.replace('\n', ' ').split(';'):
            if 'PRIMARY KEY' not in line and 'FOREIGN KEY' not in line:
                continue

            name_match = re.search(r"TABLE `(\w+)`", line)
            table = self.get_table_by_name(name_match.group(1)) if name_match else None
            if table is None:
                continue

            # The column list may hold several names (composite key).
            for column_list in re.findall(self._PRIMARY_KEY, line):
                for primary_key_column in re.findall(self._IDENTIFIER, column_list):
                    table.add_primary_key(primary_key_column)

            for column, foreign_table, foreign_column in re.findall(self._FOREIGN_KEY, line):
                table.add_foreign_key(column, foreign_table, foreign_column)

    def print_me(self):
        """Print every table as a text box listing its columns and key markers."""
        for table in self.tables:
            print('+-------------------------------------+')
            print("| %25s           |" % (table.name.upper()))
            print('+-------------------------------------+')
            for column in table.columns:
                if column.is_primary():
                    print("| 🔑 %31s           |" % (column.name ))
                elif column.is_foreign():
                    print("| #️⃣ %31s           |" % (column.name ))
                else:
                    print("|   %23s           |" % (column.name ))
            print('+-------------------------------------+\n')

In [82]:
# Schema dump to parse. Defined here so the cell runs on a fresh kernel;
# this is the same file that data_preprocesssing() loads.
path = 'database_store/city.sql'

database = Database()
database.load(path)
database.print_me()
database.get_tables_into_dictionary()

+-------------------------------------+
|                      CITY           |
+-------------------------------------+
| 🔑                              id           |
|                  cityName           |
+-------------------------------------+

+-------------------------------------+
|                       EMP           |
+-------------------------------------+
| 🔑                              id           |
|                      name           |
| #️⃣                          cityId           |
|                     score           |
+-------------------------------------+



{'city': ['id', 'cityName'], 'emp': ['id', 'name', 'cityId', 'score']}

In [144]:
def _first_noun_after_question_word(tagged):
    """Return the first noun at or after the first W-tagged token, else None."""
    for start, (_, tag) in enumerate(tagged):
        if tag.startswith('W'):
            for word, candidate_tag in tagged[start:]:
                if candidate_tag.startswith('N'):
                    return word
            return None
    return None


def _resolve_column_name(database, word):
    """Map ``word`` to a column name: exact match first, then synonym; '' if none."""
    if database.get_column_with_this_name(word) is not None:
        return word
    for table in database.get_tables():
        for column in table.get_columns():
            if isSynom(column.name, word):
                return column.name
    return ''


def data_preprocesssing(data_path, database_path='database_store/city.sql'):
    """Extract select/from/where slots from each question in ``data_path``.

    Each non-empty line is tokenised and POS-tagged, then:

    * ``select`` is the first noun following the first question word
      (tag starting with 'W'), resolved to a column name directly or
      through a synonym;
    * ``from`` is the last noun or 'JJR' token that is a table name or a
      synonym of one;
    * ``where`` is the last noun or 'JJR' token that is a column name other
      than the selected one.

    Args:
        data_path: Text file with one question per line.
        database_path: SQL schema file to load. Defaults to the file this
            function previously had hard-coded.

    Returns:
        list[dict]: One ``{'sentence', 'select', 'from', 'where'}`` dict per
        non-empty line; slots that could not be filled stay ''.
    """
    database = Database()
    database.load(database_path)
    dict_database = database.get_tables_into_dictionary()

    with open(data_path, "r") as file:
        lines = file.read().split('\n')

    train_set = []
    for line in lines:
        # Skip blank lines up front rather than removing them from the result
        # while iterating over it, which skips consecutive blank entries.
        if line == '':
            continue

        data = {'sentence': line, 'select': '', 'from': '', 'where': ''}
        word_dict = nltk.pos_tag(word_tokenize(line))
        print(word_dict)

        select_candidate = _first_noun_after_question_word(word_dict)
        if select_candidate is not None:
            data['select'] = _resolve_column_name(database, select_candidate)

        for word, tag in word_dict:
            if not (tag.startswith('N') or tag == 'JJR'):
                continue

            if word in dict_database:
                data['from'] = word
            else:
                for table_name in dict_database:
                    if isSynom(table_name, word):
                        data['from'] = table_name

            is_column = database.get_column_with_this_name(word) is not None
            if is_column and data['select'] != word:
                data['where'] = word

        train_set.append(data)

    return train_set

In [145]:
# Run the extraction over the questions in data_path and print the resulting
# list of sentence/select/from/where dicts.
print(data_preprocesssing(data_path))

[('what', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('name', 'NN'), ('of', 'IN'), ('the', 'DT'), ('city', 'NN'), ('which', 'WDT'), ('id', 'NN'), ('is', 'VBZ'), ('1111', 'CD')]
[('what', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('id', 'NN'), ('of', 'IN'), ('the', 'DT'), ('city', 'NN'), ('which', 'WDT'), ('name', 'NN'), ('is', 'VBZ'), ("'jaffna", 'CD'), ("'", "''")]
[('what', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('score', 'NN'), ('of', 'IN'), ('the', 'DT'), ('emp', 'NN'), ('which', 'WDT'), ('id', 'NN'), ('is', 'VBZ'), ('345', 'CD')]
[('which', 'WDT'), ('name', 'NN'), ('of', 'IN'), ('the', 'DT'), ('city', 'NN'), ('has', 'VBZ'), ('score', 'JJR'), ('of', 'IN'), ('44', 'CD')]
[]
[]
[{'sentence': 'what is the name of the city which id is 1111', 'select': 'name', 'from': 'city', 'where': 'id'}, {'sentence': "what is the id of the city which name is 'jaffna'", 'select': 'id', 'from': 'city', 'where': 'name'}, {'sentence': 'what is the score of the emp which id is 345', 'select': 'score', 'from': 'emp', 

In [135]:
# Sanity check: 'just' is among the WordNet lemma names listed for 'good',
# so this prints True.
print(isSynom('good','just'))

True


In [134]:
from nltk.corpus import wordnet

# Print every lemma name of every WordNet synset of "good" — the same
# candidates isSynom('good', ...) searches through.
for syn in wordnet.synsets("good"):
    for name in syn.lemma_names():
        print(name)

good
good
goodness
good
goodness
commodity
trade_good
good
good
full
good
good
estimable
good
honorable
respectable
beneficial
good
good
good
just
upright
adept
expert
good
practiced
proficient
skillful
skilful
good
dear
good
near
dependable
good
safe
secure
good
right
ripe
good
well
effective
good
in_effect
in_force
good
good
serious
good
sound
good
salutary
good
honest
good
undecomposed
unspoiled
unspoilt
good
well
good
thoroughly
soundly
good
