In [1]:
import string
import snakecase
import nltk
import math

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from similarity.ngram import NGram
from itertools import product
from enum import Enum
from collections import namedtuple
from anytree import Node, RenderTree, PostOrderIter

# Test data

In [2]:
data = 'POList. Order #. 01-Id.'

In [156]:
data = 'Abbreviations and acronyms are expanded, e.g. {PO, Lines} 01 {Purchase, Order, Lines}.'

In [4]:
table = { 'name': 'nation',
         'columns': [
             { 'name': 'n_nationkey',
             'type': 'INTEGER'
             }, 
             { 'name': 'n_name',
             'type': 'CHAR(25)'
             },
             { 'name': 'n_regionkey',
             'type': 'INTEGER'
             },
             { 'name': 'n_comment',
             'type': 'VARCHAR(152)'
             }
         ]
        }

# Models

In [26]:
class SchemaElement:
    def __init__(self, name):
        self.categories = list()
        self.data_type = None # an element can belong to multiple categories
        self.tokens = list()
        self.initial_name = name
#         self.table_name = None
        
    def add_category(self, category):
        self.categories.append(category)
        
    def add_token(self, token):
        if type(token) is Token:
            self.tokens.append(token)
        else:
            print("Incorrect token type. The type should be 'Token'")
            
    def get_tokens_data(self, tokens=None):
        if tokens is None:
            return list(map(lambda t: t.data, self.tokens)) 
        else:
            return list(map(lambda t: t.data, tokens)) 
    
    def get_tokens_data_type(self, tokens=None):
        if tokens is None:
            return list(map(lambda t: (t.data, t.token_type), self.tokens))
        else:
            return list(map(lambda t: (t.data, t.token_type), tokens))
    
    def sort_by_token_type(self):
        return sorted(self.tokens, key=lambda token: token.token_type.token_name)
    
    def get_tokens_by_token_type(self, token_type):
        sorted_tokens = self.sort_by_token_type()
        return list(filter(lambda t: t.token_type == token_type, sorted_tokens))
        
class Token:
    def __init__(self):
        self.ignore = False
        self.data = None
        self.token_type = None
        
TokenType = namedtuple('TokenType', ['token_name', 'weight'])
        
class TokenTypes(Enum):
    SYMBOLS = TokenType('symbols', 0)
    NUMBER = TokenType('number', 0.1)
    COMMON_WORDS = TokenType('common words', 0.1)
    CONTENT = TokenType('content', 0.8)
        
    @property
    def weight(self):
        return self.value.weight
    
    @property
    def token_name(self):
        return self.value.token_name

In [3]:
class Table:
    def __init__(self, name):
        self.name = name
        self.columns = list()
        
    def add_column(self, column_name, column_type=None):
        schema_element = normalize(column_name)
        if column_type:
            schema_element.category = column_type
        self.columns.append(schema_element)
        
    def get_all_columns(self):
        return list(map(lambda c: c.get_tokens_data_type(), self.columns))
    
    def get_column_by_type(self, column_type):
        return list(map(lambda c: c.get_tokens_data_type(),
            filter(lambda c: column_type in c.category, self.columns)))
        

In [4]:
class Schema:
    def __init__(self, name):
        self.name = name
        self.tables = list()
        
    def add_table_by_name(self, table_name):
        table = Table(table_name)
        self.tables.append(table)
        
    def add_table(self, table):
        self.tables.append(table)
                
    def get_all_tables(self):
        return list(map(lambda t: t.get_all_columns(), self.tables))

# Linguistic Matching

## Normalization

In [111]:
def normalize(element, schema_element=None):
    if schema_element is None:
        schema_element = SchemaElement(element)
    tokens = nltk.word_tokenize(element)
    
    for token in tokens:
        token_obj = Token()
        if token in string.punctuation:
            token_obj.ignore = True
            token_obj.data = token
            token_obj.token_type = TokenTypes.SYMBOLS
#             token_obj.initial_name = token
            schema_element.add_token(token_obj)
        else:
            try:
                token_float = float(token)
                token_obj.data = token
                token_obj.token_type = TokenTypes.NUMBER
#                 token_obj.initial_name = token
                schema_element.add_token(token_obj)
            except ValueError:
                token_snake = snakecase.convert(token)
                if '_' in token_snake:
                    token_snake = token_snake.replace('_', ' ')
                    schema_element = normalize(token_snake, schema_element)
                elif token.lower() in stopwords.words('english'):
                    token_obj.data = token.lower()
                    token_obj.ignore = True
                    token_obj.token_type = TokenTypes.COMMON_WORDS
#                     token_obj.initial_name = token
                    schema_element.add_token(token_obj)
                else:
                    token_obj.data = token.lower()
                    token_obj.token_type = TokenTypes.CONTENT
#                     token_obj.initial_name = token
                    schema_element.add_token(token_obj)
    
    return schema_element

### Test

In [158]:
s = normalize(data)
# s.get_tokens_data()
# s.get_tokens_data_type()
sbc = s.sort_by_token_type()
# s.get_tokens_data_category()
s.get_tokens_data_type(s.get_tokens_by_token_type(TokenTypes.CONTENT))

[('abbreviations',
  <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('acronyms',
  <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('expanded',
  <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('e.g', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('po', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('lines', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('purchase',
  <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('order', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>),
 ('lines', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>)]

## Name similarity

In [6]:
def name_similarity_tokens(token_set1, token_set2):
    sum1 = get_partial_similarity(token_set1, token_set2)
    sum2 = get_partial_similarity(token_set2, token_set1)
    
    return (sum1 + sum2) / (len(token_set1) + len(token_set2))

In [7]:
def get_partial_similarity(token_set1, token_set2, n=2):
    total_sum = 0
    for t1 in token_set1:
        max_sim = -math.inf
        for t2 in token_set2:
            sim = compute_similarity_wordnet(t1.data, t2.data)
            if math.isnan(sim):
                sim = 1 - compute_similarity_ngram(t1.data, t2.data, n)
                
            if sim > max_sim:
                max_sim = sim
                
        total_sum = total_sum + max_sim
    
    return total_sum            

In [112]:
# the higher, the better
def compute_similarity_wordnet(word1, word2):
    allsyns1 = set(ss for ss in wn.synsets(word1))
    allsyns2 = set(ss for ss in wn.synsets(word2))

    if len(allsyns1) == 0 or len(allsyns2) == 0:
        return math.nan
    
    best = max((wn.wup_similarity(s1, s2) or 0, s1, s2) for s1, s2 in product(allsyns1, allsyns2))
#     print(best)
    
    return best[0]

In [9]:
# the lower, the better
def compute_similarity_ngram(word1, word2, N):
    ngram = NGram(N)
    sim = ngram.distance(word1, word2)
#     print(sim)
    return sim

### Test

In [13]:
d1 = "PODeliverTo"
d2 = "ShipTO"

s1 = normalize(d1)
# s1.get_tokens_data_type()

s2 = normalize(d2)
# s2.get_tokens_data_type()

name_similarity_tokens(s1.tokens, s2.tokens)

0.6266666666666667

## Comparison

In [10]:
# max is 0.5
def name_similarity_elements(element1, element2):
    sum1 = 0 
    sum2 = 0
        
    for tt in TokenTypes:
        if tt == TokenTypes.SYMBOLS:
            continue 
        t1 = element1.get_tokens_by_token_type(tt)
        t2 = element2.get_tokens_by_token_type(tt)
        
        if len(t1) == 0 or len(t2) == 0:
            continue
            
        sim = name_similarity_tokens(t1, t2)
        sum1 = sum1 + tt.weight * sim
        sum2 = sum2 + tt.weight * (len(t1) + len(t2))
        
    return sum1/sum2

## Linguistic similarity

In [69]:
def compute_lsim(element1, element2): 
    ns = name_similarity_elements(element1, element2)
    max_c = -math.inf
    for c1 in element1.categories:
        c1 = normalize(c1)
        for c2 in element2.categories:
            c2 = normalize(c2)
            nsc = name_similarity_elements(c1, c2)
            if nsc > max_c:
                max_c = nsc
    
    return ns * max_c

### Test

In [110]:
d1 = "employee"
d2 = "employee-territory"

s1 = normalize(d1)
s2 = normalize(d2)

s1.add_category('CHAR(25)')
s2.add_category('CHAR(25)')

print(s1.get_tokens_data_type())

lsim = compute_lsim(s1, s2)
print(lsim)

[('employee', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>)]
0.125


# Weighted similarity

In [75]:
def compute_weighted_similairty(ssim, lsim, w_struct=0.5):
    return w_struct * ssim + (1 - w_struct) * lsim

# Structural Matching

In [76]:
def compute_structural_matching(node_s, node_t, sims, th_accept=0.5):
    s_leaves = list(map(lambda n: n.name.initial_name, node_s.leaves))
    t_leaves = list(map(lambda n: n.name.initial_name, node_t.leaves))
    all_leaves = product(s_leaves, t_leaves)
    
    filtered_pairs = [pair for pair in filter(lambda s: sims[s]['wsim'] > th_accept, sims.keys())
                      if pair in all_leaves]
    
    return len(filtered_pairs) / (len(s_leaves) + len(t_leaves))

In [77]:
def change_structural_similarity(leaves_s, leaves_t, sims, factor):
    all_leaves = product(leaves_s, leaves_t)
    for pair in all_leaves:
        sims[pair]['ssim'] = sims[pair]['ssim'] * factor

# Tree Match

In [82]:
def tree_match(source_tree, target_tree, leaf_w_struct=0.5, w_struct=0.6, th_accept=0.14, th_high=0.15, 
               th_low=0.13, c_inc=1.2, c_dec=0.9):
    
    s_leaves = list(map(lambda n: n.name, source_tree.leaves))
    t_leaves = list(map(lambda n: n.name, target_tree.leaves))
    all_leaves = product(s_leaves, t_leaves)
    sims = dict()
    
    for s, t in all_leaves:
        ssim = name_similarity_elements(normalize(s.data_type), normalize(t.data_type))
        lsim = compute_lsim(s, t)
        wsim = compute_weighted_similairty(ssim, lsim, leaf_w_struct)
        sims[(s.initial_name, t.initial_name)] = {'ssim': ssim, 'lsim': lsim, 'wsim': wsim}
        
    s_post_order = [node for node in PostOrderIter(source_tree)]
    t_post_order = [node for node in PostOrderIter(target_tree)] 
    
    for s in s_post_order:
        if type(s.name) is not SchemaElement:
            continue
        
        for t in t_post_order:
            if type(t.name) is not SchemaElement:
                continue
                
            if s.name not in s_leaves or t.name not in t_leaves:
                ssim = compute_structural_matching(s, t, sims, th_accept)
                lsim = compute_lsim(s.name, t.name)
                wsim = compute_weighted_similairty(ssim, lsim, w_struct)
                sims[(s.name.initial_name, t.name.initial_name)] = {'ssim': ssim, 'lsim': lsim, 'wsim': wsim}
            
            if sims[(s.name.initial_name, t.name.initial_name)]['wsim'] > th_high:
                change_structural_similarity(list(map(lambda n: n.name.initial_name, s.leaves)), 
                                            list(map(lambda n: n.name.initial_name, t.leaves)), sims, c_inc)
                
            if sims[(s.name.initial_name, t.name.initial_name)]['wsim'] < th_low:
                change_structural_similarity(list(map(lambda n: n.name.initial_name, s.leaves)), 
                                            list(map(lambda n: n.name.initial_name, t.leaves)), sims, c_dec)
    return sims

# Pipeline example

In [108]:
employees = ['EmployeeID', 'FirstName', 'LastName', 'Title', 'EmailName', 'Extension', 'Workphone']
et = ['EmployeeIdFk', 'TeritoryId']

schema = Node('rdb_schema')

emp = normalize('employee')
emp.add_category('string')
emp.data_type = 'string'
employee = Node(emp, parent=schema)

emp_ter = normalize('employee-teritory')
emp_ter.add_category('string')
emp_ter.data_type = 'string'
teritory = Node(emp_ter, parent=schema)

for e in employees:
    sch = normalize(e)
    sch.add_category('string')
    sch.data_type = 'string'
    n = Node(sch, parent=employee)
    
for e in et:
    sch = normalize(e)
    sch.add_category('str')
    sch.data_type = 'str'
    n = Node(sch, parent=teritory)
    
for pre, fill, node in RenderTree(schema):
    print("%s%s" % (pre, node.name))

rdb_schema
├── <__main__.SchemaElement object at 0x12a7d2198>
│   ├── <__main__.SchemaElement object at 0x10abc3358>
│   ├── <__main__.SchemaElement object at 0x10abbe320>
│   ├── <__main__.SchemaElement object at 0x12a7d2a58>
│   ├── <__main__.SchemaElement object at 0x10abbe160>
│   ├── <__main__.SchemaElement object at 0x10abbf0f0>
│   ├── <__main__.SchemaElement object at 0x10abc32e8>
│   └── <__main__.SchemaElement object at 0x10abc5240>
└── <__main__.SchemaElement object at 0x12a7d9470>
    ├── <__main__.SchemaElement object at 0x10abbe5c0>
    └── <__main__.SchemaElement object at 0x10abc34e0>


In [109]:
from operator import *

sims = tree_match(employee, teritory)
# print(sorted(sims, key=lambda x: sims[x]['wsim'], reverse=True))
sorted_sims = sorted(sims.items(),key=lambda x:getitem(x[1],'wsim'), reverse=True)
print(sorted_sims)
# print(sims)
# a = [node for node in PostOrderIter(employee)]
# list(map(lambda n: n.name, a[2].leaves))[0].initial_name
tuples = list(map(lambda x: x[0], filter(lambda s: s[1]['wsim'] > 0.14, sorted_sims)))

print(tuples)

[(('EmployeeID', 'employee-teritory'), {'ssim': 0.6666666666666666, 'lsim': 0.0718954248366013, 'wsim': 0.4287581699346405}), (('employee', 'employee-teritory'), {'ssim': 0.4444444444444444, 'lsim': 0.1323529411764706, 'wsim': 0.3196078431372549}), (('EmployeeID', 'EmployeeIdFk'), {'ssim': 0.324, 'lsim': 0.04125, 'wsim': 0.145625}), (('EmployeeID', 'TeritoryId'), {'ssim': 0.324, 'lsim': 0.03841145833333333, 'wsim': 0.14420572916666666}), (('Title', 'TeritoryId'), {'ssim': 0.243, 'lsim': 0.03510802469135802, 'wsim': 0.14255401234567902}), (('Extension', 'TeritoryId'), {'ssim': 0.243, 'lsim': 0.033950617283950615, 'wsim': 0.1419753086419753}), (('Extension', 'EmployeeIdFk'), {'ssim': 0.243, 'lsim': 0.024305555555555556, 'wsim': 0.1371527777777778}), (('FirstName', 'EmployeeIdFk'), {'ssim': 0.243, 'lsim': 0.023592105263157893, 'wsim': 0.13679605263157896}), (('LastName', 'EmployeeIdFk'), {'ssim': 0.243, 'lsim': 0.021076023391812866, 'wsim': 0.13553801169590643}), (('FirstName', 'TeritoryI

In [97]:
from wiktionaryparser import WiktionaryParser

parser = WiktionaryParser()
word = parser.fetch('NLP')
word

[{'etymology': '',
  'definitions': [{'partOfSpeech': 'noun',
    'text': ['NLP (uncountable)',
     '(communication) Initialism of neuro-linguistic programming.',
     '(computing) Initialism of natural language processing.',
     '(mathematics) Initialism of nonlinear programming.',
     '(psychology) Initialism of neuro-linguistic psychotherapy.'],
    'relatedWords': [],
    'examples': []},
   {'partOfSpeech': 'proper noun',
    'text': ['NLP', '(politics) Initialism of National Labour Party.'],
    'relatedWords': [],
    'examples': []}],
  'pronunciations': {'text': [], 'audio': []}}]