In [1]:
from networkx.algorithms import bipartite
import networkx as nx
import pandas as pd
import operator
from sknetwork.ranking import PageRank
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

## Load dataset (pre-processed)

In [2]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file if it comes from a trusted source.
# The cells below call `dataset.keys()` and iterate `dataset[entity]` as that entity's attributes.
dataset = pd.read_pickle(r'datas/dataset_company_mapping_full')

## Step 1 - Attribute Graph

In [3]:
B = nx.Graph() # empty graph; add_graph() later fills it with entity nodes (bipartite=0) and attribute nodes (bipartite=1)

In [4]:
def add_graph(entity, dataset, graph):
    """Add one entity and its attributes to the bipartite graph, in place.

    Args:
        entity: key into ``dataset``; becomes a node tagged ``bipartite=0``.
        dataset: mapping whose value at ``entity`` is an iterable of attributes.
        graph: graph object that is mutated; each attribute becomes a node
            tagged ``bipartite=1`` and is linked to ``entity``.

    An entity whose schema is empty contributes nothing (not even its own node),
    because all insertions happen inside the per-attribute loop.
    """
    for attribute in dataset[entity]:
        # The entity is (re-)tagged on every pass, before the attribute is tagged.
        graph.add_nodes_from((entity,), bipartite=0)
        graph.add_nodes_from((attribute,), bipartite=1)
        graph.add_edges_from(((entity, attribute),))
        
        
def buil_attribute_graph(graph):
    """Project the bipartite entity/attribute graph onto its attribute nodes.

    Args:
        graph: graph whose attribute nodes carry the node attribute
            ``bipartite == 1`` (as set by ``add_graph``).

    Returns:
        The result of ``bipartite.projected_graph`` for the attribute nodes,
        i.e. a graph over attributes only.
    """
    # .get() skips nodes that were never tagged instead of raising KeyError.
    # A list (rather than a set) keeps the graph's own node order, so the
    # projected graph -- and anything derived from its node order -- does not
    # vary between interpreter runs when node names are strings.
    attribute_nodes = [
        node
        for node, data in graph.nodes(data=True)
        if data.get('bipartite') == 1
    ]
    return bipartite.projected_graph(graph, attribute_nodes)

In [5]:
# populate bipartite graph: one add_graph() call per entity key, mutating B in place
for entity in dataset.keys():  
    add_graph(entity, dataset, B)
    
attribute_graph = buil_attribute_graph(B) # create attribute graph (projection of B onto the bipartite=1 nodes)

## Step 2 - Calculate Metrics

In [6]:
def calculate_frequency(dataset):
    """Return, for each attribute, the fraction of entities whose schema contains it.

    Args:
        dataset: mapping ``entity -> iterable of attributes``.

    Returns:
        dict ``attribute -> count / number_of_entities``. Keys appear in order
        of first occurrence across the entities. An attribute repeated inside
        one schema is counted once for that entity. An empty dataset gives ``{}``.
    """
    entities = list(dataset.keys())
    counts = {}

    # Single pass: the previous version built the attribute list with O(n)
    # membership tests and then re-scanned every entity for every attribute.
    for entity in entities:
        # dict.fromkeys de-duplicates while preserving order, so each entity
        # contributes at most 1 to any attribute.
        for attribute in dict.fromkeys(dataset[entity]):
            counts[attribute] = counts.get(attribute, 0) + 1

    return {attribute: count/len(entities) for attribute, count in counts.items()}

In [7]:
# Three per-attribute metrics; each is looked up by the same key when relevance is combined in Step 3.
centrality = nx.degree_centrality(attribute_graph) # degree centrality
closeness = nx.closeness_centrality(attribute_graph) # closeness centrality
frequency = calculate_frequency(dataset) # frequency (share of entities that contain the attribute)

## Step 3 - Attribute relevance

In [8]:
# Weights of the linear combination that defines attribute relevance.
# They sum to 1; tune them here rather than inside the formula.
DEGREE_WEIGHT = 0.25
CLOSENESS_WEIGHT = 0.25
FREQUENCY_WEIGHT = 0.5

relevance = {}

# One relevance score per node of the attribute graph (the keys of `centrality`).
for item in centrality.keys():
    relevance[item] = (
        (centrality[item]*DEGREE_WEIGHT)
        + (closeness[item]*CLOSENESS_WEIGHT)
        + (frequency[item]*FREQUENCY_WEIGHT)
    )

## Step 4 - Build the schema class

In [9]:
# E holds the entity keys; S holds the matching schemas, index-aligned with E.
E = list(dataset.keys())
S = [dataset[entity_key] for entity_key in E]

In [10]:
# Per-schema total relevance and its reciprocal (index-aligned with S).
sum_relevance = []
sum_relevance_inverse = []
for schema in S:
    total = 0
    for attribute in schema:
        total += relevance[attribute]
    sum_relevance.append(total)
    sum_relevance_inverse.append(1/total)

# Normalisation constants for the two weight vectors.
sum_rel = sum(sum_relevance)
sum_rel_inv = sum(sum_relevance_inverse)


# alphas weight each schema by its relevance, betas by its inverse relevance;
# both are normalised by the totals above.
alphas = []
betas = []

for idx in tqdm(range(len(S))):
    alpha, beta = sum_relevance[idx]/sum_rel, (1/sum_relevance[idx])/sum_rel_inv
    alphas.append(alpha)
    betas.append(beta)

100%|██████████| 65400/65400 [00:00<00:00, 1331867.73it/s]


In [11]:
sum(alphas),sum(betas) # sanity check: both vectors are normalised, so each total should be 1 up to floating-point rounding

(0.9999999999999395, 0.9999999999999803)

In [12]:
# equation 2
def quality_schema(Sc, schemas):
    """Score a candidate class schema against every entity schema (equation 2).

    For each schema ``j`` the contribution is ``gain - cost`` (equation 3):
      * gain = ``alphas[j]`` * (shared attributes / size of schema ``j``)
      * cost = ``betas[j]``  * (1 - shared attributes / size of ``Sc``)

    Args:
        Sc: candidate attributes; ``len(Sc)`` is used as its size.
        schemas: sequence of entity schemas, index-aligned with the
            module-level ``alphas`` and ``betas`` lists that this function reads.

    Returns:
        The sum of ``gain - cost`` over all schemas (``0`` if there are none).

    Raises:
        ZeroDivisionError: if ``Sc`` is empty (and ``schemas`` is not) or any
            schema is empty.
    """
    # Loop-invariant: build the candidate set and its size once instead of
    # twice per schema.
    candidate = set(Sc)
    candidate_size = len(Sc)

    soma = 0
    for j, schema in enumerate(schemas):
        shared = len(candidate.intersection(schema))
        gain = alphas[j] * (shared/len(schema))
        cost = betas[j] * (1 - shared/candidate_size)
        soma += gain - cost
    return soma

In [13]:
# (attribute, relevance) pairs in ascending order of relevance -- the most relevant attribute is last.
order_attribute = sorted(relevance.items(), key=lambda pair: pair[1])

In [14]:
# Attribute names only, in the same ascending-relevance order as order_attribute.
R = [pair[0] for pair in order_attribute]

In [15]:
# Algorithm 1: grow the candidate schema from the most relevant attribute
# downwards and remember the size k with the best (rounded) quality.
# Ties go to the larger candidate because the comparison is >=.
q_max = 0
k = -1

for j in range(1, len(R)+1):
    Sc = R[-j:]  # the j most relevant attributes
    q = round(quality_schema(Sc, S), 2)
    print(j, ' = ', q)
    if q >= q_max:
        q_max, k = q, j
    
print('===================================')
print("top-", k, ' = ', q_max)
print('===================================')

1  =  0.06
2  =  0.11
3  =  0.11
4  =  0.11
5  =  0.16
6  =  0.17
7  =  0.17
8  =  0.18
9  =  0.19
10  =  0.17
11  =  0.17
12  =  0.17
13  =  0.17
14  =  0.17
15  =  0.17
16  =  0.17
17  =  0.16
18  =  0.17
19  =  0.17
20  =  0.17
21  =  0.17
22  =  0.16
23  =  0.16
24  =  0.16
25  =  0.16
26  =  0.16
27  =  0.16
28  =  0.16
29  =  0.15
30  =  0.15
31  =  0.15
32  =  0.15
33  =  0.15
34  =  0.15
35  =  0.15
36  =  0.15
37  =  0.14
38  =  0.14
39  =  0.14
40  =  0.14
41  =  0.13
42  =  0.13
43  =  0.13
44  =  0.13
45  =  0.12
46  =  0.12
47  =  0.12
48  =  0.12
49  =  0.11
50  =  0.11
51  =  0.11
52  =  0.11
53  =  0.11
54  =  0.1
55  =  0.1
56  =  0.1
57  =  0.1
58  =  0.1
59  =  0.09
60  =  0.09
top- 9  =  0.19


In [16]:
# k is still its initial -1 when no candidate size was selected; R[-k:] would
# then be R[1:] (almost every attribute), so fall back to an empty schema instead.
class_schema = R[-k:] if k > 0 else []

In [17]:
# save result -- the context manager closes (and flushes) the file even if dump() raises
with open('output/company_schema', 'wb') as schema_file:
    pickle.dump(class_schema, schema_file)