# [Part 2] Implement the meta-blocking method

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import math
import csv
import glob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
import itertools
from itertools import combinations, chain, permutations
from scipy import io
from sklearn.metrics import jaccard_score
#!nltk.download()

## Pre Processing

In [2]:
# First Entity Collection: loading datasets 
data1 = pd.read_csv("Entity_Collection_1.csv")
data1.head()

Unnamed: 0,id,name,phone,gender,university,job_title
0,e1_001,Henry Kelly,0-321-173-4531,Male,University of Virginia,HR Specialist
1,e1_002,Ramon Carter,0-720-705-2870,Male,"University of California, Irvine",Webmaster
2,e1_003,Marvin Owens,6-223-132-0421,Male,Wayne State University,Bellman
3,e1_004,Ruth Squire,7-271-235-7283,Female,Drexel University,HR Coordinator
4,e1_005,Caitlyn Walker,2-560-748-8364,Female,The University of Arizona,Stockbroker


In [3]:
# Second Entity Collection: loading datasets 
data2 = pd.read_csv("Entity_Collection_2.csv")
data2.head()

Unnamed: 0,id_num,full_name,phone_num,sex,school,designation
0,e2_001,Doris Tutton,1-136-745-2581,Female,"University of California, Santa Barbara (UCSB)",Accountant
1,e2_002,Janice Rothwell,0-170-230-6700,Female,University of Colorado Boulder,Business Broker
2,e2_003,Hayden Ingham,5-305-662-6073,Female,Boston College,Production Painter
3,e2_004,Alice Rosenbloom,7-737-863-8035,Female,Brown University,Service Supervisor
4,e2_005,Kurt Reid,6-526-234-3255,Male,University of Texas Dallas,IT Support Staff


In [4]:
# First Entity Collection: 'name' column
data1['name']= data1['name'].str.replace(r"\b(?:[A-Z]\.)+(?!\w)","",regex=True)

# Second Entity Collection: 'full_name' column
data2['full_name']= data2['full_name'].str.replace(r"\b(?:[A-Z]\.)+(?!\w)","",regex=True)

# First Entity Collection: 'phone' column
# Removing hyphen and country codes (Make every phone number atmost 10 digits)
data1['phone'] = data1['phone'].str.replace(r'-','',regex=True)
data1['phone'] = data1['phone'].str.replace(r'(','',regex=True)
data1['phone'] = data1['phone'].str.replace(r')','',regex=True)
data1['phone'] = data1['phone'].str[-10:]
data1.tail(2)

# Second Entity Collection: 'phone_num' column o
# Removing hyphen and country codes (Make every phone number atmost 10 digits)
data2['phone_num'] = data2['phone_num'].str.replace(r'-','',regex=True)
data2['phone_num'] = data2['phone_num'].str.replace(r'(','',regex=True)
data2['phone_num'] = data2['phone_num'].str.replace(r')','',regex=True)
data2['phone_num'] = data2['phone_num'].str[-10:]
data2.tail(2)

# First Entity Collection:: 'university' column
# Removing commas (,) and hyphen (-)
data1['university'] = data1['university'].str.replace(r",","", regex=True)
data1['university'] = data1['university'].str.replace(r" - "," ", regex=True)

# Second Entity Collection: 'school' column
# Removing commas (,) and hyphen (-)
data2['school'] = data2['school'].str.replace(r",","",regex=True)

In [5]:
data1.tail(5)

Unnamed: 0,id,name,phone,gender,university,job_title
145,e1_146,Stone Caleb,1882447288,Male,Colorado University Boulder,HR
146,e1_147,Reid,6862164442,Female,Clark Uni,Lab Technician
147,e1_148,Groves Tom,4763834614,Male,Brown Uni,Bio Analyist
148,e1_149,Horton,1503024701,Male,Maryland University Baltimore County,Service Manager
149,e1_150,Swan Luke,1885652481,Male,Kansas University,HR


In [6]:
data2.tail(5)

Unnamed: 0,id_num,full_name,phone_num,sex,school,designation
145,e2_146,Caleb Stone,1882447288,Male,University of Colorado Boulder,HR Coordinator
146,e2_147,Mandy Reid,6862164442,Female,Clark University,Laboratory Technician
147,e2_148,Tom Groves,4763834614,Male,Brown University,Biologist
148,e2_149,Julian Horton,1503024701,Male,University of Maryland Baltimore County,Service Supervisor
149,e2_150,Luke Swan,1885652481,Male,University of Kansas,HR Coordinator


In [7]:
# Merging all the texts in one place (one column)
data1['merged_text'] = data1['name']+" "+data1['phone']+" "+data1['gender']+" "+data1['university']+" "+data1['job_title']
data2['merged_text'] = data2['full_name']+" "+data2['phone_num']+" "+data2['sex']+" "+data2['school']+" "+data2['designation']

In [8]:
data1.tail(5)

Unnamed: 0,id,name,phone,gender,university,job_title,merged_text
145,e1_146,Stone Caleb,1882447288,Male,Colorado University Boulder,HR,Stone Caleb 1882447288 Male Colorado Universit...
146,e1_147,Reid,6862164442,Female,Clark Uni,Lab Technician,Reid 6862164442 Female Clark Uni Lab Technician
147,e1_148,Groves Tom,4763834614,Male,Brown Uni,Bio Analyist,Groves Tom 4763834614 Male Brown Uni Bio Analyist
148,e1_149,Horton,1503024701,Male,Maryland University Baltimore County,Service Manager,Horton 1503024701 Male Maryland University Ba...
149,e1_150,Swan Luke,1885652481,Male,Kansas University,HR,Swan Luke 1885652481 Male Kansas University HR


In [9]:
data2.tail(5)

Unnamed: 0,id_num,full_name,phone_num,sex,school,designation,merged_text
145,e2_146,Caleb Stone,1882447288,Male,University of Colorado Boulder,HR Coordinator,Caleb Stone 1882447288 Male University of Colo...
146,e2_147,Mandy Reid,6862164442,Female,Clark University,Laboratory Technician,Mandy Reid 6862164442 Female Clark University ...
147,e2_148,Tom Groves,4763834614,Male,Brown University,Biologist,Tom Groves 4763834614 Male Brown University Bi...
148,e2_149,Julian Horton,1503024701,Male,University of Maryland Baltimore County,Service Supervisor,Julian Horton 1503024701 Male University of Ma...
149,e2_150,Luke Swan,1885652481,Male,University of Kansas,HR Coordinator,Luke Swan 1885652481 Male University of Kansas...


In [10]:
# ## Creating Blocks
# empty list of blocks
blocking = []

# Checking for stop words removal as they can not be a token or block (Example : 'of' can not be a block)
sw_nltk = stopwords.words('english')

# Checking articles
articles = ['a','A','an','An','AN','the', 'The','THE']

# iterating both entity collections and produce unique empty blocks. 
for i in range(len(data1)):
    temp_list = regexp_tokenize(data1['merged_text'][i], "[\w']+")
    for j in temp_list:
        if j not in blocking:
            if j not in sw_nltk:
                if j not in articles:
                    blocking.append(j)         
            
for i in range(len(data2)):
    temp_list = regexp_tokenize(data2['merged_text'][i], "[\w']+")
    for j in temp_list:
        if j not in blocking:
            if j not in sw_nltk:
                if j not in articles:
                    blocking.append(j)

print(blocking[1:50])

['Kelly', '3211734531', 'Male', 'University', 'Virginia', 'HR', 'Specialist', 'Ramon', 'Carter', '7207052870', 'California', 'Irvine', 'Webmaster', 'Marvin', 'Owens', '2231320421', 'Wayne', 'State', 'Bellman', 'Ruth', 'Squire', '2712357283', 'Female', 'Drexel', 'Coordinator', 'Caitlyn', 'Walker', '5607488364', 'Arizona', 'Stockbroker', 'Benjamin', 'Torres', '6384764838', 'Stony', 'Brook', 'New', 'York', 'Ambulatory', 'Nurse', 'Erick', 'Thomson', '3828772851', 'Katz', 'School', 'Yeshiva', 'Restaurant', 'Manager', 'Winnie', 'Lucas']


In [11]:
# empty dictionary
blocks = {}
# iterating both entity collections to assign entities to each blocks. 
for block in blocking:
    temp_list = []
    for i in range(len(data1)):
        if block in data1['merged_text'][i]:
            temp_list.append(data1['id'][i])
            
    for j in range(len(data2)):
        if block in data2['merged_text'][j]:
            temp_list.append(data2['id_num'][j])
     
    blocks[block] = temp_list

# checking the blocks
print('Total number of initial blocks: ', len(blocks))
blocks

Total number of initial blocks:  880


{'Henry': ['e1_001', 'e1_136', 'e2_136'],
 'Kelly': ['e1_001'],
 '3211734531': ['e1_001'],
 'Male': ['e1_001',
  'e1_002',
  'e1_003',
  'e1_006',
  'e1_007',
  'e1_009',
  'e1_011',
  'e1_015',
  'e1_017',
  'e1_021',
  'e1_022',
  'e1_023',
  'e1_024',
  'e1_026',
  'e1_028',
  'e1_030',
  'e1_033',
  'e1_040',
  'e1_043',
  'e1_044',
  'e1_047',
  'e1_051',
  'e1_052',
  'e1_054',
  'e1_055',
  'e1_059',
  'e1_060',
  'e1_061',
  'e1_064',
  'e1_072',
  'e1_073',
  'e1_075',
  'e1_076',
  'e1_077',
  'e1_079',
  'e1_083',
  'e1_085',
  'e1_086',
  'e1_088',
  'e1_090',
  'e1_091',
  'e1_092',
  'e1_093',
  'e1_094',
  'e1_096',
  'e1_100',
  'e1_101',
  'e1_103',
  'e1_110',
  'e1_112',
  'e1_113',
  'e1_114',
  'e1_115',
  'e1_116',
  'e1_118',
  'e1_119',
  'e1_120',
  'e1_122',
  'e1_123',
  'e1_127',
  'e1_130',
  'e1_133',
  'e1_134',
  'e1_135',
  'e1_136',
  'e1_138',
  'e1_142',
  'e1_143',
  'e1_144',
  'e1_145',
  'e1_146',
  'e1_148',
  'e1_149',
  'e1_150',
  'e2_005',
 

# Meta-Blocking

In [12]:
def convert_DictToList(sendDict):
    # dictionary values as list 
    sendDict_values = list(sendDict.values())
    # dictionary keys as list 
    sendDict_keys = list(sendDict.keys())
    sendDict_keys = [list(x) for x in sendDict_keys]
    # merging dictionary values, keys in same list 
    returnList = [x+[y] for x, y in zip(sendDict_keys, sendDict_values)]   
    return returnList

In [13]:
# removing any duplicates edges pair (if any)
def removeDuplicateSublist(lst):
     return list(map(list, (set(map(lambda x: tuple(sorted(x)), lst)))))

In [14]:
# calculating average of all the weights
def averageWeight(lst):
    return sum(lst) / len(lst)

In [15]:
# converting the block values in a list for calculating weight
block_values = list(blocks.values())

# droping the blocks  which has only one element
block_values =[x for x in block_values if len(x)>=2]
print('Total number of blocks (after dropping block w/ single value): ', len(block_values))
block_values [0:2]

Total number of blocks (after dropping block w/ single value):  388


[['e1_001', 'e1_136', 'e2_136'],
 ['e1_001',
  'e1_002',
  'e1_003',
  'e1_006',
  'e1_007',
  'e1_009',
  'e1_011',
  'e1_015',
  'e1_017',
  'e1_021',
  'e1_022',
  'e1_023',
  'e1_024',
  'e1_026',
  'e1_028',
  'e1_030',
  'e1_033',
  'e1_040',
  'e1_043',
  'e1_044',
  'e1_047',
  'e1_051',
  'e1_052',
  'e1_054',
  'e1_055',
  'e1_059',
  'e1_060',
  'e1_061',
  'e1_064',
  'e1_072',
  'e1_073',
  'e1_075',
  'e1_076',
  'e1_077',
  'e1_079',
  'e1_083',
  'e1_085',
  'e1_086',
  'e1_088',
  'e1_090',
  'e1_091',
  'e1_092',
  'e1_093',
  'e1_094',
  'e1_096',
  'e1_100',
  'e1_101',
  'e1_103',
  'e1_110',
  'e1_112',
  'e1_113',
  'e1_114',
  'e1_115',
  'e1_116',
  'e1_118',
  'e1_119',
  'e1_120',
  'e1_122',
  'e1_123',
  'e1_127',
  'e1_130',
  'e1_133',
  'e1_134',
  'e1_135',
  'e1_136',
  'e1_138',
  'e1_142',
  'e1_143',
  'e1_144',
  'e1_145',
  'e1_146',
  'e1_148',
  'e1_149',
  'e1_150',
  'e2_005',
  'e2_007',
  'e2_009',
  'e2_010',
  'e2_011',
  'e2_013',
  'e2_0

## Finding nodes

In [16]:
# finding unique entities as nodes of graph
nodes=[]

for block in blocks:
    for entity in blocks[block]:   # adding all entities from blocks into 'nodes'
        if not entity in nodes:
            nodes.append(entity)

# checking nodes 
print("Total num of nodes: ", len(nodes))
nodes[0:5]

Total num of nodes:  300


['e1_001', 'e1_136', 'e2_136', 'e1_002', 'e1_003']

## Finding edges

In [17]:
# finding unique entity pairs as edges of graph 
# !this may require some time to execute for many iterations
edges=[]
for block in blocks: 
    for permutation in permutations(blocks[block],2):
        edge = {(permutation[0]),(permutation[1])}
        
        if len(edge) == 2 and edge not in edges:
            edges.append(edge)

# checking edges 
print("Total num of edges: ", len(edges))
edges[0:5]

# remove duplicates elements (abc, abc) from edges
edges = list(filter(lambda x: list(edges[0])[0] != list(edges[0])[1], edges))
print("Total num of edges: ", len(edges))

# number of unique entity pairs of edges
print("Total num of edges w/ unique pairs: ", len(list(map(set, set(map(tuple, edges))))))

Total num of edges:  43080
Total num of edges:  43080
Total num of edges w/ unique pairs:  43080


# Edge weighting: Common Blocks Scheme (CBS)

In [18]:
# finding weight of each edge (each entity pair) as the total number of blocks each entity pair appears
uniques = set(chain(*block_values))
CBS_Weights_dict = {x: sum(1 for n in block_values if all(i in n for i in x))  for x in combinations(uniques, 2)}
CBS_Weights_dict = {k: v for k, v in CBS_Weights_dict.items() if v != 0}
CBS_Weights_dict

{('e2_002', 'e2_019'): 4,
 ('e2_002', 'e2_068'): 3,
 ('e2_002', 'e1_114'): 2,
 ('e2_002', 'e2_133'): 3,
 ('e2_002', 'e2_142'): 4,
 ('e2_002', 'e1_032'): 4,
 ('e2_002', 'e1_104'): 5,
 ('e2_002', 'e2_001'): 4,
 ('e2_002', 'e2_099'): 4,
 ('e2_002', 'e2_063'): 4,
 ('e2_002', 'e1_119'): 2,
 ('e2_002', 'e1_040'): 3,
 ('e2_002', 'e1_008'): 4,
 ('e2_002', 'e2_110'): 3,
 ('e2_002', 'e2_115'): 4,
 ('e2_002', 'e2_064'): 3,
 ('e2_002', 'e2_150'): 3,
 ('e2_002', 'e1_132'): 4,
 ('e2_002', 'e2_111'): 4,
 ('e2_002', 'e2_021'): 4,
 ('e2_002', 'e1_006'): 2,
 ('e2_002', 'e2_090'): 3,
 ('e2_002', 'e1_089'): 5,
 ('e2_002', 'e1_108'): 4,
 ('e2_002', 'e2_079'): 2,
 ('e2_002', 'e2_035'): 4,
 ('e2_002', 'e1_081'): 4,
 ('e2_002', 'e2_061'): 4,
 ('e2_002', 'e1_056'): 4,
 ('e2_002', 'e2_130'): 3,
 ('e2_002', 'e1_046'): 4,
 ('e2_002', 'e1_058'): 4,
 ('e2_002', 'e2_046'): 4,
 ('e2_002', 'e2_065'): 2,
 ('e2_002', 'e1_038'): 4,
 ('e2_002', 'e2_113'): 4,
 ('e2_002', 'e2_088'): 4,
 ('e2_002', 'e1_023'): 2,
 ('e2_002', 

In [19]:
CBS_Weights_list = convert_DictToList(CBS_Weights_dict)
print('Length of initial CBS edges, weight pair: ', len(CBS_Weights_list))

Length of initial CBS edges, weight pair:  43080


In [20]:
# combining all CBS edges pairs as list 
CBS_Weights_EdgesPair = list(CBS_Weights_dict.keys())

CBS_Weights_EdgesPair_final = []
for item in CBS_Weights_EdgesPair:
    if item[0][0:2] != item[1][0:2]:
        #print(item)
        CBS_Weights_EdgesPair_final.append(item)

print("Total pair of edges: ", len(CBS_Weights_EdgesPair_final))
CBS_Weights_EdgesPair_final


# droping CBS edges pairs from list (if any)
for key in list(CBS_Weights_dict.keys()):
    if key not in CBS_Weights_EdgesPair_final:
        del CBS_Weights_dict[key]

print("Total pair of edges (after droping duplicates pairs): ", len(CBS_Weights_dict))
CBS_Weights_dict

Total pair of edges:  21609
Total pair of edges (after droping duplicates pairs):  21609


{('e2_002', 'e1_114'): 2,
 ('e2_002', 'e1_032'): 4,
 ('e2_002', 'e1_104'): 5,
 ('e2_002', 'e1_119'): 2,
 ('e2_002', 'e1_040'): 3,
 ('e2_002', 'e1_008'): 4,
 ('e2_002', 'e1_132'): 4,
 ('e2_002', 'e1_006'): 2,
 ('e2_002', 'e1_089'): 5,
 ('e2_002', 'e1_108'): 4,
 ('e2_002', 'e1_081'): 4,
 ('e2_002', 'e1_056'): 4,
 ('e2_002', 'e1_046'): 4,
 ('e2_002', 'e1_058'): 4,
 ('e2_002', 'e1_038'): 4,
 ('e2_002', 'e1_023'): 2,
 ('e2_002', 'e1_007'): 3,
 ('e2_002', 'e1_092'): 2,
 ('e2_002', 'e1_005'): 4,
 ('e2_002', 'e1_012'): 5,
 ('e2_002', 'e1_100'): 2,
 ('e2_002', 'e1_103'): 2,
 ('e2_002', 'e1_113'): 2,
 ('e2_002', 'e1_018'): 4,
 ('e2_002', 'e1_021'): 2,
 ('e2_002', 'e1_020'): 5,
 ('e2_002', 'e1_064'): 3,
 ('e2_002', 'e1_055'): 2,
 ('e2_002', 'e1_037'): 4,
 ('e2_002', 'e1_071'): 5,
 ('e2_002', 'e1_074'): 4,
 ('e2_002', 'e1_133'): 3,
 ('e2_002', 'e1_066'): 4,
 ('e2_002', 'e1_097'): 4,
 ('e2_002', 'e1_035'): 4,
 ('e2_002', 'e1_098'): 4,
 ('e2_002', 'e1_062'): 4,
 ('e2_002', 'e1_011'): 3,
 ('e2_002', 

# Edge weighting: Jaccard Scheme (JS)

In [21]:
CBS_Weights_EdgesPair_finalA = []
for e in CBS_Weights_EdgesPair_final:
    CBS_Weights_EdgesPair_finalA.append(e[0])

CBS_Weights_EdgesPair_finalA[:5]

['e2_002', 'e2_002', 'e2_002', 'e2_002', 'e2_002']

In [22]:
CBS_Weights_EdgesPair_finalB = []
for e in CBS_Weights_EdgesPair_final:
    CBS_Weights_EdgesPair_finalB.append(e[1])

CBS_Weights_EdgesPair_finalB[:5]

['e1_114', 'e1_032', 'e1_104', 'e1_119', 'e1_040']

In [23]:
# fiding weight of each edge (each entity pair) as the total number of blocks each entity pair appears
uniquesA = set(chain(*block_values))
CBS_Weights_dictA = {x: sum(1 for n in block_values if all(i in n for i in x))  for x in combinations(uniquesA, 1)}
CBS_Weights_dictA = {k: v for k, v in CBS_Weights_dictA.items() if v != 0}
CBS_Weights_dictA

{('e2_002',): 10,
 ('e2_019',): 8,
 ('e2_068',): 10,
 ('e1_114',): 10,
 ('e2_133',): 10,
 ('e2_142',): 13,
 ('e1_032',): 10,
 ('e1_104',): 11,
 ('e2_001',): 12,
 ('e2_099',): 7,
 ('e2_063',): 8,
 ('e1_119',): 8,
 ('e1_040',): 11,
 ('e1_008',): 10,
 ('e2_110',): 8,
 ('e2_115',): 9,
 ('e2_064',): 10,
 ('e2_150',): 11,
 ('e1_132',): 7,
 ('e2_111',): 9,
 ('e2_021',): 11,
 ('e1_006',): 13,
 ('e2_090',): 9,
 ('e1_089',): 7,
 ('e1_108',): 12,
 ('e2_079',): 13,
 ('e2_035',): 6,
 ('e1_081',): 9,
 ('e2_061',): 8,
 ('e1_056',): 8,
 ('e2_130',): 10,
 ('e1_046',): 14,
 ('e1_058',): 12,
 ('e2_046',): 12,
 ('e2_065',): 8,
 ('e1_038',): 9,
 ('e2_113',): 8,
 ('e2_088',): 11,
 ('e1_023',): 9,
 ('e1_007',): 13,
 ('e2_005',): 13,
 ('e1_092',): 9,
 ('e2_116',): 6,
 ('e1_005',): 9,
 ('e1_012',): 11,
 ('e1_100',): 10,
 ('e2_081',): 9,
 ('e1_103',): 14,
 ('e1_113',): 9,
 ('e1_018',): 12,
 ('e2_043',): 10,
 ('e2_048',): 7,
 ('e1_021',): 8,
 ('e2_080',): 9,
 ('e2_039',): 8,
 ('e2_034',): 14,
 ('e2_007',): 8,
 (

In [24]:
# fiding weight of each edge (each entity pair) as the total number of blocks each entity pair appears
uniquesB = set(chain(*block_values))
CBS_Weights_dictB = {x: sum(1 for n in block_values if all(i in n for i in x))  for x in combinations(uniquesB, 1)}
CBS_Weights_dictB = {k: v for k, v in CBS_Weights_dictB.items() if v != 0}
CBS_Weights_dictB

{('e2_002',): 10,
 ('e2_019',): 8,
 ('e2_068',): 10,
 ('e1_114',): 10,
 ('e2_133',): 10,
 ('e2_142',): 13,
 ('e1_032',): 10,
 ('e1_104',): 11,
 ('e2_001',): 12,
 ('e2_099',): 7,
 ('e2_063',): 8,
 ('e1_119',): 8,
 ('e1_040',): 11,
 ('e1_008',): 10,
 ('e2_110',): 8,
 ('e2_115',): 9,
 ('e2_064',): 10,
 ('e2_150',): 11,
 ('e1_132',): 7,
 ('e2_111',): 9,
 ('e2_021',): 11,
 ('e1_006',): 13,
 ('e2_090',): 9,
 ('e1_089',): 7,
 ('e1_108',): 12,
 ('e2_079',): 13,
 ('e2_035',): 6,
 ('e1_081',): 9,
 ('e2_061',): 8,
 ('e1_056',): 8,
 ('e2_130',): 10,
 ('e1_046',): 14,
 ('e1_058',): 12,
 ('e2_046',): 12,
 ('e2_065',): 8,
 ('e1_038',): 9,
 ('e2_113',): 8,
 ('e2_088',): 11,
 ('e1_023',): 9,
 ('e1_007',): 13,
 ('e2_005',): 13,
 ('e1_092',): 9,
 ('e2_116',): 6,
 ('e1_005',): 9,
 ('e1_012',): 11,
 ('e1_100',): 10,
 ('e2_081',): 9,
 ('e1_103',): 14,
 ('e1_113',): 9,
 ('e1_018',): 12,
 ('e2_043',): 10,
 ('e2_048',): 7,
 ('e1_021',): 8,
 ('e2_080',): 9,
 ('e2_039',): 8,
 ('e2_034',): 14,
 ('e2_007',): 8,
 (

In [25]:
CBS_Weights_dictA_dict = convert_DictToList(CBS_Weights_dictA)
CBS_Weights_dictA_dict = pd.DataFrame(CBS_Weights_dictA_dict)
CBS_Weights_dictA_dict.rename(columns={0: 'edge1', 1: 'edge1Weight'}, inplace=True)
CBS_Weights_dictA_dict.head()

Unnamed: 0,edge1,edge1Weight
0,e2_002,10
1,e2_019,8
2,e2_068,10
3,e1_114,10
4,e2_133,10


In [26]:
CBS_Weights_dictB_dict = convert_DictToList(CBS_Weights_dictB)
CBS_Weights_dictB_dict = pd.DataFrame(CBS_Weights_dictB_dict)
CBS_Weights_dictB_dict.rename(columns={0: 'edge2', 1: 'edge2Weight'}, inplace=True)
CBS_Weights_dictB_dict.head()

Unnamed: 0,edge2,edge2Weight
0,e2_002,10
1,e2_019,8
2,e2_068,10
3,e1_114,10
4,e2_133,10


In [27]:
# combining all CBS weight values as list 
edgeWeight_list = list(CBS_Weights_dict.values())

# combining all CBS edges as list 
edge_CBS_Weights_EdgesPair = list(CBS_Weights_dict.keys())
edge_CBS_Weights_EdgesPair = [list(x) for x in edge_CBS_Weights_EdgesPair]

In [28]:
# merging CBS weight, CBS edges in same list for graph  
CBS_Weights_list = convert_DictToList(CBS_Weights_dict)
print('Length of initial CBS edges, weight pair: ', len(CBS_Weights_list))
CBS_Weights_list

Length of initial CBS edges, weight pair:  21609


[['e2_002', 'e1_114', 2],
 ['e2_002', 'e1_032', 4],
 ['e2_002', 'e1_104', 5],
 ['e2_002', 'e1_119', 2],
 ['e2_002', 'e1_040', 3],
 ['e2_002', 'e1_008', 4],
 ['e2_002', 'e1_132', 4],
 ['e2_002', 'e1_006', 2],
 ['e2_002', 'e1_089', 5],
 ['e2_002', 'e1_108', 4],
 ['e2_002', 'e1_081', 4],
 ['e2_002', 'e1_056', 4],
 ['e2_002', 'e1_046', 4],
 ['e2_002', 'e1_058', 4],
 ['e2_002', 'e1_038', 4],
 ['e2_002', 'e1_023', 2],
 ['e2_002', 'e1_007', 3],
 ['e2_002', 'e1_092', 2],
 ['e2_002', 'e1_005', 4],
 ['e2_002', 'e1_012', 5],
 ['e2_002', 'e1_100', 2],
 ['e2_002', 'e1_103', 2],
 ['e2_002', 'e1_113', 2],
 ['e2_002', 'e1_018', 4],
 ['e2_002', 'e1_021', 2],
 ['e2_002', 'e1_020', 5],
 ['e2_002', 'e1_064', 3],
 ['e2_002', 'e1_055', 2],
 ['e2_002', 'e1_037', 4],
 ['e2_002', 'e1_071', 5],
 ['e2_002', 'e1_074', 4],
 ['e2_002', 'e1_133', 3],
 ['e2_002', 'e1_066', 4],
 ['e2_002', 'e1_097', 4],
 ['e2_002', 'e1_035', 4],
 ['e2_002', 'e1_098', 4],
 ['e2_002', 'e1_062', 4],
 ['e2_002', 'e1_011', 3],
 ['e2_002', 

In [29]:
JS_df_initial = pd.DataFrame(CBS_Weights_list)
JS_df_initial.rename(columns={0: 'edge1', 1: 'edge2', 2: 'cbsWeight'}, inplace=True)
JS_df_initial

Unnamed: 0,edge1,edge2,cbsWeight
0,e2_002,e1_114,2
1,e2_002,e1_032,4
2,e2_002,e1_104,5
3,e2_002,e1_119,2
4,e2_002,e1_040,3
...,...,...,...
21604,e1_050,e2_057,4
21605,e1_043,e2_055,6
21606,e1_043,e2_057,2
21607,e2_055,e1_030,4


In [30]:
JS_df = pd.merge(JS_df_initial, CBS_Weights_dictA_dict, on='edge1', how='outer')
JS_df = JS_df[JS_df['edge2'].notna()]

JS_df = pd.merge(JS_df, CBS_Weights_dictB_dict, on='edge2', how='outer')
JS_df = JS_df[JS_df['edge1'].notna()]

JS_df['jsWeight'] = JS_df['cbsWeight']/(JS_df['edge1Weight'] +  JS_df['edge2Weight'] - JS_df['cbsWeight'])
JS_df = JS_df.drop(['cbsWeight', 'edge1Weight', 'edge2Weight'], axis = 1)
JS_df['jsWeight'] = round(JS_df['jsWeight'], 4)
JS_df

Unnamed: 0,edge1,edge2,jsWeight
0,e2_002,e1_114,0.1111
1,e2_019,e1_114,0.2857
2,e2_068,e1_114,0.0526
3,e2_002,e1_032,0.2500
4,e2_019,e1_032,0.1250
...,...,...,...
21604,e1_142,e2_041,0.0556
21605,e1_125,e2_041,0.1176
21606,e1_067,e2_041,0.1250
21607,e1_126,e2_041,0.1429


In [31]:
JS_Weights_list = JS_df.values.tolist()
JS_Weights_list

[['e2_002', 'e1_114', 0.1111],
 ['e2_019', 'e1_114', 0.2857],
 ['e2_068', 'e1_114', 0.0526],
 ['e2_002', 'e1_032', 0.25],
 ['e2_019', 'e1_032', 0.125],
 ['e2_068', 'e1_032', 0.1765],
 ['e2_133', 'e1_032', 0.1111],
 ['e2_142', 'e1_032', 0.2105],
 ['e2_002', 'e1_104', 0.3125],
 ['e2_019', 'e1_104', 0.1176],
 ['e2_068', 'e1_104', 0.2353],
 ['e2_133', 'e1_104', 0.1667],
 ['e2_142', 'e1_104', 0.2],
 ['e2_002', 'e1_119', 0.125],
 ['e2_019', 'e1_119', 0.3333],
 ['e2_068', 'e1_119', 0.0588],
 ['e2_133', 'e1_119', 0.2857],
 ['e2_142', 'e1_119', 0.2353],
 ['e2_001', 'e1_119', 0.1111],
 ['e2_099', 'e1_119', 0.25],
 ['e2_063', 'e1_119', 0.2308],
 ['e2_002', 'e1_040', 0.1667],
 ['e2_019', 'e1_040', 0.2667],
 ['e2_068', 'e1_040', 0.1053],
 ['e2_133', 'e1_040', 0.2353],
 ['e2_142', 'e1_040', 0.2632],
 ['e2_001', 'e1_040', 0.15],
 ['e2_099', 'e1_040', 0.2857],
 ['e2_063', 'e1_040', 0.4615],
 ['e2_002', 'e1_008', 0.25],
 ['e2_019', 'e1_008', 0.125],
 ['e2_068', 'e1_008', 0.25],
 ['e2_133', 'e1_008', 0.

In [32]:
JS_Weights_dict = dict()
for sub in JS_Weights_list:
    JS_Weights_dict[tuple(sub[:2])] = (sub[2:])

In [33]:
JS_Weights_dict = {k:v[0] for k,v in JS_Weights_dict.items()}
JS_Weights_dict

{('e2_002', 'e1_114'): 0.1111,
 ('e2_019', 'e1_114'): 0.2857,
 ('e2_068', 'e1_114'): 0.0526,
 ('e2_002', 'e1_032'): 0.25,
 ('e2_019', 'e1_032'): 0.125,
 ('e2_068', 'e1_032'): 0.1765,
 ('e2_133', 'e1_032'): 0.1111,
 ('e2_142', 'e1_032'): 0.2105,
 ('e2_002', 'e1_104'): 0.3125,
 ('e2_019', 'e1_104'): 0.1176,
 ('e2_068', 'e1_104'): 0.2353,
 ('e2_133', 'e1_104'): 0.1667,
 ('e2_142', 'e1_104'): 0.2,
 ('e2_002', 'e1_119'): 0.125,
 ('e2_019', 'e1_119'): 0.3333,
 ('e2_068', 'e1_119'): 0.0588,
 ('e2_133', 'e1_119'): 0.2857,
 ('e2_142', 'e1_119'): 0.2353,
 ('e2_001', 'e1_119'): 0.1111,
 ('e2_099', 'e1_119'): 0.25,
 ('e2_063', 'e1_119'): 0.2308,
 ('e2_002', 'e1_040'): 0.1667,
 ('e2_019', 'e1_040'): 0.2667,
 ('e2_068', 'e1_040'): 0.1053,
 ('e2_133', 'e1_040'): 0.2353,
 ('e2_142', 'e1_040'): 0.2632,
 ('e2_001', 'e1_040'): 0.15,
 ('e2_099', 'e1_040'): 0.2857,
 ('e2_063', 'e1_040'): 0.4615,
 ('e2_002', 'e1_008'): 0.25,
 ('e2_019', 'e1_008'): 0.125,
 ('e2_068', 'e1_008'): 0.25,
 ('e2_133', 'e1_008'): 0

# Pruning Scheme: Weight Edge Pruning method w/ CBS

In [34]:
def getWEP_df(send_Dict):
    values_list = list(send_Dict.values())
    
    # combining all CBS edges as list 
    keys_list = list(send_Dict.keys())
    keys_list = [list(x) for x in keys_list]
    avg = round(averageWeight(values_list), 2)
    print("Average of all the weights =", round(averageWeight(values_list), 2))

    # droping the edges with weights lower than the average of all the weights in the graph
    send_Dict_wep = {k: v for k, v in send_Dict.items() if v > avg}
    print("Total pair of edges (after WEG pruning): ", len(send_Dict_wep))

    # convertiing CBS edges pairs  in list
    send_Dict_wep_keys = list(send_Dict_wep.keys())
    send_Dict_wep_keys = [list(x) for x in send_Dict_wep_keys]

    # convertiing CBS weight, edges pairs in same list for graph
    WEP_list = removeDuplicateSublist(send_Dict_wep_keys)
    print("Total pair of edges (after WEG pruning & duplicate droping): ", len(WEP_list))

    return WEP_list


In [35]:
WEP_list = getWEP_df(CBS_Weights_dict) # JS_Weights_dict
WEP_list

Average of all the weights = 3.17
Total pair of edges (after WEG pruning):  9691
Total pair of edges (after WEG pruning & duplicate droping):  9691


[['e1_072', 'e2_136'],
 ['e1_056', 'e2_032'],
 ['e1_112', 'e2_069'],
 ['e1_074', 'e2_035'],
 ['e1_023', 'e2_079'],
 ['e1_123', 'e2_036'],
 ['e1_079', 'e2_084'],
 ['e1_010', 'e2_030'],
 ['e1_051', 'e2_020'],
 ['e1_150', 'e2_022'],
 ['e1_071', 'e2_051'],
 ['e1_125', 'e2_018'],
 ['e1_149', 'e2_015'],
 ['e1_097', 'e2_049'],
 ['e1_006', 'e2_055'],
 ['e1_010', 'e2_098'],
 ['e1_076', 'e2_149'],
 ['e1_134', 'e2_031'],
 ['e1_114', 'e2_093'],
 ['e1_062', 'e2_029'],
 ['e1_052', 'e2_032'],
 ['e1_126', 'e2_087'],
 ['e1_133', 'e2_014'],
 ['e1_024', 'e2_073'],
 ['e1_046', 'e2_067'],
 ['e1_015', 'e2_090'],
 ['e1_041', 'e2_073'],
 ['e1_094', 'e2_073'],
 ['e1_100', 'e2_110'],
 ['e1_050', 'e2_032'],
 ['e1_007', 'e2_050'],
 ['e1_134', 'e2_039'],
 ['e1_062', 'e2_043'],
 ['e1_075', 'e2_038'],
 ['e1_020', 'e2_057'],
 ['e1_052', 'e2_044'],
 ['e1_057', 'e2_029'],
 ['e1_003', 'e2_087'],
 ['e1_024', 'e2_037'],
 ['e1_067', 'e2_035'],
 ['e1_039', 'e2_005'],
 ['e1_093', 'e2_023'],
 ['e1_058', 'e2_104'],
 ['e1_085',

In [36]:
# generate blocks from WEP
blocks_WEP = []
for blc in block_values:
    for item in WEP_list:
        ln = len(item)
        for i in range(len(blc) - ln + 1):
            if all(item[j] == blc[i+j] for j in range(ln)):
                blocks_WEP.append(blc)

print ("Total number of blocks after WEP method: ", len(blocks_WEP))
blocks_WEP

Total number of blocks after WEP method:  273


[['e1_001', 'e1_136', 'e2_136'],
 ['e1_001',
  'e1_002',
  'e1_003',
  'e1_006',
  'e1_007',
  'e1_009',
  'e1_011',
  'e1_015',
  'e1_017',
  'e1_021',
  'e1_022',
  'e1_023',
  'e1_024',
  'e1_026',
  'e1_028',
  'e1_030',
  'e1_033',
  'e1_040',
  'e1_043',
  'e1_044',
  'e1_047',
  'e1_051',
  'e1_052',
  'e1_054',
  'e1_055',
  'e1_059',
  'e1_060',
  'e1_061',
  'e1_064',
  'e1_072',
  'e1_073',
  'e1_075',
  'e1_076',
  'e1_077',
  'e1_079',
  'e1_083',
  'e1_085',
  'e1_086',
  'e1_088',
  'e1_090',
  'e1_091',
  'e1_092',
  'e1_093',
  'e1_094',
  'e1_096',
  'e1_100',
  'e1_101',
  'e1_103',
  'e1_110',
  'e1_112',
  'e1_113',
  'e1_114',
  'e1_115',
  'e1_116',
  'e1_118',
  'e1_119',
  'e1_120',
  'e1_122',
  'e1_123',
  'e1_127',
  'e1_130',
  'e1_133',
  'e1_134',
  'e1_135',
  'e1_136',
  'e1_138',
  'e1_142',
  'e1_143',
  'e1_144',
  'e1_145',
  'e1_146',
  'e1_148',
  'e1_149',
  'e1_150',
  'e2_005',
  'e2_007',
  'e2_009',
  'e2_010',
  'e2_011',
  'e2_013',
  'e2_0

# Pruning Scheme: Cardinality Node Pruning method w/ JS

In [37]:
# listing of all edges connected to a node (repeat for each node)
# making a dataframe of all the edges connected to the node in dataframe

def getCNP_df(WEP_list):
    CNP_df = pd.DataFrame.from_records(WEP_list)
    CNP_df.rename(columns={0: 'node', 1: 'nodeEdgePair', 2: 'Weight'}, inplace=True)
    CNP_df
    
    # sorting according to each node with respect to edge weights in descending order
    CNP_df = CNP_df.sort_values(['node',  'Weight'], ascending= False)
    CNP_df.head()

    CNP_df.reset_index(drop=True, inplace=True)
    return CNP_df

In [38]:
CNP_df = getCNP_df(JS_Weights_list) # CBS_Weights_list
CNP_df

Unnamed: 0,node,nodeEdgePair,Weight
0,e2_150,e1_150,0.9091
1,e2_150,e1_001,0.4286
2,e2_150,e1_072,0.3571
3,e2_150,e1_004,0.3571
4,e2_150,e1_051,0.3571
...,...,...,...
21604,e1_001,e2_033,0.2143
21605,e1_001,e2_057,0.1429
21606,e1_001,e2_078,0.1333
21607,e1_001,e2_128,0.1250


In [39]:
# calculating blocking cardinality (BCin)
# ∑Bei,ej = total number of blocks with all the pairs of node-graph (∑Bei,ej)
# ∑ei = total number of entities in node-graph
# BCin = (∑Bei, ej) / ∑ei
blockingCardinality = float(len(CBS_Weights_dict.keys()))/float(len(nodes))
blockingCardinality

72.03

In [40]:
# calculating K-threshold for each node-graph 
# K = BCin-1
K = int(math.floor(blockingCardinality-1))
K

71

In [41]:
# calculating number of pairs and merging with dataframe in 'count' colummn
count_df = CNP_df.groupby('node').count()
count_df.rename(columns={'nodeEdgePair': 'count'}, inplace=True)
count_df = count_df.drop(columns=['Weight'])

CNP_dffinal = pd.merge(CNP_df, count_df, on='node', how='outer')
CNP_dffinal

Unnamed: 0,node,nodeEdgePair,Weight,count
0,e2_150,e1_150,0.9091,143
1,e2_150,e1_001,0.4286,143
2,e2_150,e1_072,0.3571,143
3,e2_150,e1_004,0.3571,143
4,e2_150,e1_051,0.3571,143
...,...,...,...,...
21604,e1_001,e2_033,0.2143,18
21605,e1_001,e2_057,0.1429,18
21606,e1_001,e2_078,0.1333,18
21607,e1_001,e2_128,0.1250,18


In [42]:
# keeping only the top K edges of each node in the graph 
CNF_overK_DF = CNP_dffinal[CNP_dffinal['count'] > K]
print("Total number of edges which is over K (before dropping): ", len(CNF_overK_DF))
CNF_overK_DF = CNF_overK_DF.groupby('node').head(K).reset_index(drop=True)
print("Total number of edges which is over K (after dropping): ", len(CNF_overK_DF))
CNF_overK_DF

CNF_belowK_DF = CNP_dffinal[CNP_dffinal['count'] <= K]
print("Total number of edges which is less K: ", len(CNF_belowK_DF))
CNF_belowK_DF

# combing all top k edges for each node
CNF_final = CNF_belowK_DF.append(CNF_overK_DF)
CNF_final = CNF_final.drop(columns=['count', 'Weight'])
CNF_final

Total number of edges which is over K (before dropping):  16342
Total number of edges which is over K (after dropping):  10721
Total number of edges which is less K:  5267


Unnamed: 0,node,nodeEdgePair
143,e2_149,e1_149
144,e2_149,e1_122
145,e2_149,e1_030
146,e2_149,e1_043
147,e2_149,e1_003
...,...,...
10716,e1_005,e2_134
10717,e1_005,e2_015
10718,e1_005,e2_146
10719,e1_005,e2_044


In [43]:
# converting df to list
CNF_final_list = CNF_final.values.tolist()
print("Total number of edges after CNP method: ", len(CNF_final_list))
CNF_final_list

# removing if their is any duplicates adges
CNF_final_list_ = removeDuplicateSublist(CNF_final_list)  
print("Total number of edges after CNP method: ", len(CNF_final_list_))

Total number of edges after CNP method:  15988
Total number of edges after CNP method:  15988


In [44]:
# converting blocks after CNP method
blocks_CNF = []
for blc in block_values:
    for item in CNF_final_list_:
        ln = len(item)
        for i in range(len(blc) - ln + 1):
            if all(item[j] == blc[i+j] for j in range(ln)):
                blocks_CNF.append(blc)            

print ("Total number of blocks after CNP method: ",len(blocks_CNF))
blocks_CNF 

Total number of blocks after CNP method:  312


[['e1_001', 'e1_136', 'e2_136'],
 ['e1_001',
  'e1_002',
  'e1_003',
  'e1_006',
  'e1_007',
  'e1_009',
  'e1_011',
  'e1_015',
  'e1_017',
  'e1_021',
  'e1_022',
  'e1_023',
  'e1_024',
  'e1_026',
  'e1_028',
  'e1_030',
  'e1_033',
  'e1_040',
  'e1_043',
  'e1_044',
  'e1_047',
  'e1_051',
  'e1_052',
  'e1_054',
  'e1_055',
  'e1_059',
  'e1_060',
  'e1_061',
  'e1_064',
  'e1_072',
  'e1_073',
  'e1_075',
  'e1_076',
  'e1_077',
  'e1_079',
  'e1_083',
  'e1_085',
  'e1_086',
  'e1_088',
  'e1_090',
  'e1_091',
  'e1_092',
  'e1_093',
  'e1_094',
  'e1_096',
  'e1_100',
  'e1_101',
  'e1_103',
  'e1_110',
  'e1_112',
  'e1_113',
  'e1_114',
  'e1_115',
  'e1_116',
  'e1_118',
  'e1_119',
  'e1_120',
  'e1_122',
  'e1_123',
  'e1_127',
  'e1_130',
  'e1_133',
  'e1_134',
  'e1_135',
  'e1_136',
  'e1_138',
  'e1_142',
  'e1_143',
  'e1_144',
  'e1_145',
  'e1_146',
  'e1_148',
  'e1_149',
  'e1_150',
  'e2_005',
  'e2_007',
  'e2_009',
  'e2_010',
  'e2_011',
  'e2_013',
  'e2_0

In [45]:
print("Total num of nodes (initial): ", len(nodes))
print("Total pair of edges (initial): ", len(CBS_Weights_EdgesPair_final))
print("Total pair of edges (after weight edge pruning method): ", len(WEP_list))
print("Total pair of edges (after cardinality node pruning method): ", len(CNF_final_list_))

Total num of nodes (initial):  300
Total pair of edges (initial):  21609
Total pair of edges (after weight edge pruning method):  9691
Total pair of edges (after cardinality node pruning method):  15988


In [46]:
print("Total number of blocks (initial): ", len(blocks))
print("Total number of blocks (after dropping block w/ single value): ", len(block_values))
print("Total number of blocks (after weight edge pruning method): ", len(blocks_WEP))
print("Total number of blocks (after cardinality node pruning method): ",len(blocks_CNF))

Total number of blocks (initial):  880
Total number of blocks (after dropping block w/ single value):  388
Total number of blocks (after weight edge pruning method):  273
Total number of blocks (after cardinality node pruning method):  312
