In [1]:
import string
import snakecase
import nltk
import math

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

from similarity.ngram import NGram

from itertools import product

from enum import Enum

from collections import namedtuple


# Test data

In [2]:
# Short sample input containing a camel-case word, a '#' symbol and the number '01'.
# Note: the next cell reassigns `data`, so this value is not the one used by later cells.
data = 'POList. Order #. 01-Id.'

In [3]:
# Longer sample sentence; it replaces the value assigned to `data` in the previous cell
# and is the input passed to normalize() in the normalization test cell.
data = 'Abbreviations and acronyms are expanded, e.g. {PO, Lines} 01 {Purchase, Order, Lines}.'

In [4]:
# Sample table description: the table name plus its columns, each given as a
# column name and a type string.
table = {
    'name': 'nation',
    'columns': [
        {'name': 'n_nationkey', 'type': 'INTEGER'},
        {'name': 'n_name', 'type': 'CHAR(25)'},
        {'name': 'n_regionkey', 'type': 'INTEGER'},
        {'name': 'n_comment', 'type': 'VARCHAR(152)'},
    ],
}

# Models

In [5]:
class SchemaElement:
    """A schema element name broken down into classified tokens.

    Attributes:
        category: list of categories attached to the element (an element can
            belong to multiple categories).
        tokens: ``Token`` objects in the order they were added.
        table_name: initialised to ``None``; no method of this class assigns it.
    """

    def __init__(self):
        self.category = list()  # an element can belong to multiple categories
        self.tokens = list()
        self.table_name = None

    def add_category(self, category):
        """Append ``category`` to the element's category list."""
        self.category.append(category)

    def add_token(self, token):
        """Append ``token`` when it is a ``Token``; otherwise print a warning and skip it."""
        # isinstance (instead of an exact type comparison) also accepts Token subclasses.
        if isinstance(token, Token):
            self.tokens.append(token)
        else:
            print("Incorrect token type. The type should be 'Token'")

    def get_tokens_data(self, tokens=None):
        """Return the ``data`` of every token in ``tokens`` (default: this element's tokens)."""
        source = self.tokens if tokens is None else tokens
        return [t.data for t in source]

    def get_tokens_data_type(self, tokens=None):
        """Return ``(data, token_type)`` pairs for ``tokens`` (default: this element's tokens)."""
        source = self.tokens if tokens is None else tokens
        return [(t.data, t.token_type) for t in source]

    def sort_by_token_type(self):
        """Return a new list of this element's tokens ordered by their type's ``token_name``.

        The sort is stable, so tokens of the same type keep their insertion order.
        Every token must have a ``token_type`` set, otherwise the key lookup fails.
        """
        return sorted(self.tokens, key=lambda token: token.token_type.token_name)

    def get_tokens_by_token_type(self, token_type):
        """Return this element's tokens whose ``token_type`` equals ``token_type``.

        Tokens are returned in insertion order.
        """
        # No pre-sort by type: all kept tokens share one type, so sorting by type
        # could not change their relative order, and skipping it keeps this method
        # usable when some token has no type assigned yet.
        return [t for t in self.tokens if t.token_type == token_type]
        
class Token:
    """A single token of a schema element name.

    All attributes start with neutral defaults and are filled in by the caller:

    * ``ignore`` -- boolean flag, ``False`` by default.
    * ``data`` -- the token text, ``None`` until assigned.
    * ``token_type`` -- the token's classification, ``None`` until assigned.
    """

    def __init__(self):
        self.ignore = False
        # Both payload fields are unset on a fresh token.
        self.data = self.token_type = None
        
# Value carried by every TokenTypes member: a display name and a numeric weight.
TokenType = namedtuple('TokenType', 'token_name weight')


class TokenTypes(Enum):
    """Classification of a token; each member's value is a ``TokenType``.

    The ``weight`` is the factor applied to this token type when per-type
    similarities are combined in ``name_similarity_elements``.
    """

    SYMBOLS = TokenType(token_name='symbols', weight=0)
    NUMBER = TokenType(token_name='number', weight=0.1)
    COMMON_WORDS = TokenType(token_name='common words', weight=0.1)
    CONTENT = TokenType(token_name='content', weight=0.8)

    @property
    def token_name(self):
        """Display name stored in the member's value."""
        return self.value.token_name

    @property
    def weight(self):
        """Numeric weight stored in the member's value."""
        return self.value.weight

In [6]:
class Table:
    """A named table whose attributes are stored as normalized schema elements.

    Attributes:
        name: the table name passed to the constructor.
        attributes: schema elements added through ``add_attribute``, in order.
    """

    def __init__(self, name):
        self.name = name
        self.attributes = list()

    def add_attribute(self, name, attribute_type):
        """Normalize ``name`` and store it with ``attribute_type`` as its category.

        Args:
            name: attribute (column) name; it is passed to ``normalize``.
            attribute_type: category recorded for the attribute.
        """
        schema_element = normalize(name)
        # Append rather than assign: `category` is a list that compute_lsim
        # iterates over. Replacing it with a plain string would make that loop
        # walk the string character by character.
        schema_element.add_category(attribute_type)
        self.attributes.append(schema_element)
        
    

# Linguistic Matching

## Normalization

In [7]:
def normalize(element, schema_element=None):
    """Tokenize ``element`` and add one classified ``Token`` per word to a schema element.

    Each token produced by ``nltk.word_tokenize`` is classified as follows:

    * made up only of punctuation characters -> ``TokenTypes.SYMBOLS`` (ignored);
    * parseable by ``float`` -> ``TokenTypes.NUMBER``;
    * its snake-case form contains ``_`` -> the parts are normalized recursively;
    * an English stop word -> ``TokenTypes.COMMON_WORDS`` (ignored, lower-cased);
    * anything else -> ``TokenTypes.CONTENT`` (lower-cased).

    Args:
        element: the text to normalize.
        schema_element: element to append tokens to; a new ``SchemaElement``
            is created when omitted.

    Returns:
        The schema element holding the tokens, in reading order.
    """
    if schema_element is None:
        schema_element = SchemaElement()
    # Load the stop-word list once per call instead of once per token.
    english_stopwords = frozenset(stopwords.words('english'))
    _normalize_into(element, schema_element, english_stopwords)
    return schema_element


def _make_token(data, token_type, ignore=False):
    """Build a Token carrying ``data``, ``token_type`` and the ``ignore`` flag."""
    token_obj = Token()
    token_obj.ignore = ignore
    token_obj.data = data
    token_obj.token_type = token_type
    return token_obj


def _is_number(text):
    """Return True when ``float`` can parse ``text``."""
    # NOTE: float() also accepts spellings such as 'nan' and 'inf'.
    try:
        float(text)
    except ValueError:
        return False
    return True


def _normalize_into(text, schema_element, english_stopwords):
    """Classify every token of ``text`` and append it to ``schema_element``."""
    for raw in nltk.word_tokenize(text):
        # Check every character: a substring test against string.punctuation
        # would miss multi-character punctuation tokens such as '...' or '--'.
        if all(ch in string.punctuation for ch in raw):
            schema_element.add_token(_make_token(raw, TokenTypes.SYMBOLS, ignore=True))
            continue

        if _is_number(raw):
            schema_element.add_token(_make_token(raw, TokenTypes.NUMBER))
            continue

        snake = snakecase.convert(raw)
        if '_' in snake:
            # Compound name: split on the underscores and classify each part.
            _normalize_into(snake.replace('_', ' '), schema_element, english_stopwords)
            continue

        lowered = raw.lower()
        if lowered in english_stopwords:
            schema_element.add_token(_make_token(lowered, TokenTypes.COMMON_WORDS, ignore=True))
        else:
            schema_element.add_token(_make_token(lowered, TokenTypes.CONTENT))

### Test

In [8]:
# Normalize the sample sentence stored in `data`.
s = normalize(data)
# Tokens ordered by token-type name; kept for ad-hoc inspection, not used below.
sbc = s.sort_by_token_type()
# Cell output: (data, token_type) pairs of the stop-word tokens only.
s.get_tokens_data_type(s.get_tokens_by_token_type(TokenTypes.COMMON_WORDS))

[('and',
  <TokenTypes.COMMON_WORDS: TokenType(token_name='common words', weight=0.1)>),
 ('are',
  <TokenTypes.COMMON_WORDS: TokenType(token_name='common words', weight=0.1)>)]

## Name similarity

In [9]:
def name_similarity_tokens(token_set1, token_set2):
    """Return the symmetric name similarity of two token collections.

    The best-match similarities are summed in both directions (set 1 against
    set 2 and set 2 against set 1) and divided by the total number of tokens.

    Args:
        token_set1: sized collection of tokens exposing a ``data`` attribute.
        token_set2: sized collection of tokens exposing a ``data`` attribute.

    Returns:
        The averaged similarity, or ``0.0`` when both collections are empty.
    """
    token_count = len(token_set1) + len(token_set2)
    if token_count == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return 0.0

    forward = get_partial_similarity(token_set1, token_set2)
    backward = get_partial_similarity(token_set2, token_set1)
    return (forward + backward) / token_count

In [10]:
def get_partial_similarity(token_set1, token_set2):
    """Sum, over the tokens of ``token_set1``, the best similarity found in ``token_set2``.

    A pair is scored with ``compute_similarity_wordnet``; when that returns NaN
    the score falls back to ``1 - compute_similarity_ngram(..., 2)``, turning the
    n-gram distance (lower is better) into a similarity (higher is better).

    Args:
        token_set1: tokens whose best matches are summed.
        token_set2: candidate tokens to match against.

    Returns:
        The sum of per-token best similarities. A token with no candidate to
        compare against (empty ``token_set2``) contributes ``0.0``.
    """
    def pair_similarity(t1, t2):
        sim = compute_similarity_wordnet(t1.data, t2.data)
        if math.isnan(sim):
            sim = 1 - compute_similarity_ngram(t1.data, t2.data, 2)
        return sim

    # `default` covers an empty token_set2; without it the "no match yet"
    # sentinel (-inf) would leak into the returned sum.
    return sum(
        max((pair_similarity(t1, t2) for t2 in token_set2), default=0.0)
        for t1 in token_set1
    )

In [11]:
# the higher, the better
def compute_similarity_wordnet(word1, word2):
    """Return the best Wu-Palmer similarity between any synsets of the two words.

    Every synset of ``word1`` is compared with every synset of ``word2`` via
    ``wn.wup_similarity``; a pair without a score counts as 0.

    Returns:
        The highest pair score, or ``math.nan`` when either word has no synsets.
    """
    synsets1 = set(wn.synsets(word1))
    if not synsets1:
        return math.nan

    synsets2 = set(wn.synsets(word2))
    if not synsets2:
        return math.nan

    # Only the score is returned, so the maximum is taken over scores directly.
    return max(
        wn.wup_similarity(first, second) or 0
        for first, second in product(synsets1, synsets2)
    )

In [12]:
# the lower, the better
def compute_similarity_ngram(word1, word2, N):
    """Return the ``NGram(N)`` distance between ``word1`` and ``word2``.

    This is a distance, so smaller values mean more similar words.
    """
    return NGram(N).distance(word1, word2)

### Test

In [13]:
# Two differently worded element names to compare.
d1 = "PODeliverTo"
d2 = "ShipTO"

# Normalize both names; s1 and s2 are reused by the comparison test cell further down.
s1 = normalize(d1)

s2 = normalize(d2)

# Cell output: similarity over all tokens of both elements, regardless of token type.
name_similarity_tokens(s1.tokens, s2.tokens)

0.6266666666666667

## Comparison

In [14]:
# max is 0.5
def name_similarity_elements(element1, element2):
    """Return the weighted name similarity of two schema elements.

    For every token type except ``TokenTypes.SYMBOLS`` that has tokens in both
    elements, the per-type similarity from ``name_similarity_tokens`` is weighted
    by the type's weight. The weighted sum is divided by the weighted number of
    compared tokens. Each compared type contributes at least two tokens, so
    the result cannot exceed 0.5 as long as the per-type similarity is at most 1.

    Args:
        element1: object providing ``get_tokens_by_token_type``.
        element2: object providing ``get_tokens_by_token_type``.

    Returns:
        The weighted similarity, or ``0.0`` when no token type has tokens in
        both elements (nothing comparable).
    """
    weighted_similarity = 0
    weighted_token_count = 0

    for tt in TokenTypes:
        if tt == TokenTypes.SYMBOLS:
            continue

        tokens1 = element1.get_tokens_by_token_type(tt)
        tokens2 = element2.get_tokens_by_token_type(tt)
        # A type only counts when both elements have tokens of that type.
        if not tokens1 or not tokens2:
            continue

        weighted_similarity += tt.weight * name_similarity_tokens(tokens1, tokens2)
        weighted_token_count += tt.weight * (len(tokens1) + len(tokens2))

    if weighted_token_count == 0:
        # No comparable token type: avoid dividing by zero.
        return 0.0

    return weighted_similarity / weighted_token_count

### Test

In [15]:
# Weighted per-token-type similarity of s1 and s2, the elements normalized in the
# name-similarity test cell above.
name_similarity_elements(s1, s2)

0.15470085470085468

## Linguistic similarity

In [16]:
def compute_lsim(element1, element2):
    """Return the linguistic similarity of two schema elements.

    The result is the elements' name similarity multiplied by the best name
    similarity found between any category of ``element1`` and any category of
    ``element2`` (categories are normalized before being compared). Both
    factors are printed before the product is returned.

    Args:
        element1: schema element with a ``category`` iterable.
        element2: schema element with a ``category`` iterable.

    Returns:
        ``name similarity * best category similarity``. When either element has
        no category, the category factor is ``0.0`` and so is the result.
    """
    ns = name_similarity_elements(element1, element2)
    print(ns)

    # Normalize every category once up front instead of re-normalizing the
    # second element's categories for each category of the first element.
    categories1 = [normalize(category) for category in element1.category]
    categories2 = [normalize(category) for category in element2.category]

    # `default` covers the case of no category pair; without it the -inf
    # sentinel would be multiplied into the result.
    max_c = max(
        (name_similarity_elements(c1, c2) for c1 in categories1 for c2 in categories2),
        default=0.0,
    )
    print(max_c)

    return ns * max_c

### Test

In [17]:
# Two names that differ only in letter case.
d1 = "ShipTo"
d2 = "ShipTO"

s1 = normalize(d1)
s2 = normalize(d2)

# Give both elements the same single category.
s1.add_category('CHAR(25)')
s2.add_category('CHAR(25)')

print(s1.get_tokens_data_type())

# compute_lsim prints the name similarity and the best category similarity
# before returning their product, which is printed last.
lsim = compute_lsim(s1, s2)
print(lsim)

[('ship', <TokenTypes.CONTENT: TokenType(token_name='content', weight=0.8)>), ('to', <TokenTypes.COMMON_WORDS: TokenType(token_name='common words', weight=0.1)>)]
0.5
0.5
0.25
