# Record Linkage
A demo of the excellent [Record Linkage](https://recordlinkage.readthedocs.io/en/latest/) toolkit specific to the challenges expected when merging health, police and emergency shelter data.

In [236]:
import pandas as pd
import numpy as np
import re

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

import recordlinkage as rl
from recordlinkage.datasets import load_febrl2, load_febrl4, binary_vectors
from recordlinkage.index import Block, Full, Random
from recordlinkage.base import BaseIndexAlgorithm

## Databases

- Freely Extensible Biomedical Record Linkage (Febrl) datasets.
- `febrl4` contains two tables, A and B, with 5000 entries.
- Table A has the original records and Table B contains duplicates.
- Used to test merging two datasets.

In [107]:
dfA, dfB = load_febrl4()

In [220]:
dfA.loc['rec-712-org']

given_name               emma
surname                  geue
street_number            1220
address_1        rumker place
address_2          milbrodale
suburb                woodend
postcode                 2429
state                     nsw
date_of_birth        19941224
soc_sec_id            3886398
Name: rec-712-org, dtype: object

In [222]:
dfB.loc['rec-712-dup-0']

given_name               emam
surname                  geue
street_number            1220
address_1        rumker place
address_2          milbrodale
suburb                woodend
postcode                 2429
state                     nsw
date_of_birth        19941224
soc_sec_id            3886398
Name: rec-712-dup-0, dtype: object

## Indexing

- These utilities determine which records should be compared with which other records.
- If we're linking two datasets, A and B, every record from A should be compared with B.
- If we're looking for redundancies in a single dataset, A, each record in A should be compared with all the records with higher index values.

#### Algorithms
- **Full:** Pulls out all possible indices.
  + Desirable but something to think about.  If AHS has 200,000 individuals and CPS has 5,000 individuals, we have a billion comparisons to do.  Possible but not on laptops or windows PCs.
- **Block:** Create pairs that agree on one or more variables (ie. if we only expected variation in first names, we could compare only those records that matched on last name).

#### Birth Year Blocking
- Full indexing for this dataset would be 25e6 entries.  
- Block to compare only those records with birth years within 5 years of each other.

In [137]:
class BirthYearBlocking(BaseIndexAlgorithm):
    """Block index based birth years."""

    def __init__(self,maxYearDiff):
        self.maxYearDiff = maxYearDiff
        super().__init__()
    
    
    def _link_index(self, df_a, df_b):
        """Pair records with birth years within self.maxYearDiff of each other."""
        
        birthYearA = df_a['date_of_birth'].str[:4].astype(float)
        birthYearB = df_b['date_of_birth'].str[:4].astype(float)

        tBar = tqdm(total = len(birthYearA)*len(birthYearB))
        tuples = []
        for idxA, valA in birthYearA.items():
            for idxB, valB in birthYearB.items():
                if np.abs(valA-valB) <= self.maxYearDiff:
                    tuples += [ (idxA,idxB) ]
                tBar.update()

        tBar.close()

        return pd.MultiIndex.from_tuples(tuples,names=[ df_a.index.name+'_A', df_b.index.name+'_B' ])



In [225]:
indexerBlock = BirthYearBlocking(maxYearDiff=5)
pairsBlock = indexerBlock.index(dfA,dfB)

  0%|          | 0/25000000 [00:00<?, ?it/s]

In [227]:
indexerFull = rl.Index()
indexerFull.add(Full())
indexerFull.add(Full())
pairsFull = indexerFull.index(dfA,dfB)



## Comparison

- It's possible to compare each field in the databases using different methods (ie. exact matches, string distance metrics, etc.).
- The available metrics are from the jellyfish string matching library.  They include Levenshtein, Jaro-Winkler, etc.

In [231]:
%%time
comparer = rl.Compare(n_jobs=-1)

comparer.string('given_name', 'given_name', method='damerau_levenshtein', label='given_name')
comparer.string('surname', 'surname', method='damerau_levenshtein', label='surname')
comparer.string('date_of_birth', 'date_of_birth', method='damerau_levenshtein', label='date_of_birth')

metricTbl = comparer.compute(pairsFull, dfA, dfB)

CPU times: user 2.53 s, sys: 1.68 s, total: 4.21 s
Wall time: 1min 1s


In [186]:
metricTbl

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rec-1070-org,rec-825-dup-0,0.375000,0.000000,0.500
rec-1070-org,rec-3156-dup-0,0.125000,0.285714,0.375
rec-1070-org,rec-2852-dup-0,0.375000,0.142857,0.375
rec-1070-org,rec-3040-dup-0,0.125000,0.142857,0.375
rec-1070-org,rec-4799-dup-0,0.250000,0.285714,0.625
...,...,...,...,...
rec-66-org,rec-2020-dup-0,0.400000,0.111111,0.500
rec-66-org,rec-2062-dup-0,0.200000,0.222222,0.500
rec-66-org,rec-2281-dup-0,0.166667,0.111111,0.625
rec-66-org,rec-3152-dup-0,0.000000,0.222222,0.250


## Classification

- Ideally, we would have a known set of matches (ie. manually matched, linked with health card numbers, etc.).  Supervised learning algorithms can be applied in this case.

In [204]:
# Normalize data features.
metrics = metricTbl.to_numpy()
for r in range(metrics.shape[1]):
    metrics[:,r] *= (metrics[:,r] - np.mean(metrics[:,r]))/np.sqrt(np.var(metrics[:,r]))

In [188]:
# Create list of confirmed matches.
pat = r'rec-\d+'
match = np.zeros(len(metricTbl.index))
iMatch = 0

for keys in list(metricTbl.index):
    
    matchA = re.search(pat,keys[0])
    matchB = re.search(pat,keys[1])
    
    match[iMatch] = 1*( matchA.group(0) == matchB.group(0) )
    iMatch += 1

In [205]:
# Utilize logistic regression to match records.
lr = LogisticRegression()

In [242]:
# Determines confusion matrix
def calc_confusion_matrix(hat,labels):
    
    tPos = int( np.sum(hat * labels) )
    fPos = int( np.sum(hat * (1-labels)) )
    pos = int( np.sum(labels) )
    neg = len(labels) - pos
    fNeg = pos - tPos
    tNeg = neg - fPos

    return np.array([ [ tPos, fNeg ], [ fPos, tNeg ] ])  
    

In [246]:
# Evaluate using k-fold cross-validation.
nFolds = 10

skf = StratifiedKFold(n_splits=nFolds, random_state=None, shuffle=True)
cnfMtx = np.zeros((2,2),dtype=int)
tBar = tqdm(total=nFolds)

for trainIdx, testIdx in skf.split(metrics,match):
    
    lr.fit(metrics[trainIdx,:],match[trainIdx])
    hat = lr.predict(metrics[testIdx,:])
    cnfMtx += calc_confusion_matrix(hat,match[testIdx])
    
    tBar.update()
    
tBar.close()

tPos = cnfMtx[0,0]
fNeg = cnfMtx[0,1]
nPos = tPos+fNeg

fPos = cnfMtx[1,0]
tNeg = cnfMtx[1,1]
nNeg = fPos+tNeg

print(f'tPos: {tPos}/{nPos}, fPos: {fPos}/{nPos}')
print(f'tNeg: {tNeg}/{nNeg}, fNeg: {fNeg}/{nNeg}\n')

print(f'True Positive Rate/Sensitivity: {100*tPos/nPos:.2f}% ({tPos}/{nPos})')
print(f'Confidence/Precision: {100*tPos/(tPos+fPos):.2f}% ({tPos}/{tPos+fPos})')


  0%|          | 0/10 [00:00<?, ?it/s]

tPos: 4193/4592, fPos: 26/4592
tNeg: 2510939/2510965, fNeg: 399/2510965

True Positive Rate/Sensitivity: 91.31% (4193/4592)
Confidence/Precision: 99.38% (4193/4219)


In [244]:
lr.coef_

array([[0.97808405, 0.90194101, 4.15506523]])

## Hashes

In [154]:
import hashlib

## My Index Code

In [75]:
dfA = load_febrl2()

In [79]:
keepCols = [ 'given_name', 'surname', 'date_of_birth' ]
dat = dfA[keepCols].drop_duplicates()

In [80]:
pat = r'(rec-\d+)'

def copy_table(tbl):
    return pd.DataFrame({ 
        'first': list(tbl.given_name), 
        'last': list(tbl.surname), 
        'dob': list(tbl.date_of_birth) }, 
        index= list(range(len(tbl.index))))

dat = dat.groupby( dat.index.str.extract(pat,expand=False), axis=0 ).apply(copy_table)