## Ingest
Bring .csv into the system

In [1]:
from ingest import load_file
import os
os.listdir('assets/data')

['dws_wages.csv',
 'usbe_students.csv',
 'ushe_students.csv',
 'ustc_students.csv']

In [2]:
load_file('assets/data/dws_wages.csv', 'dws_wages')
load_file('assets/data/usbe_students.csv', 'usbe_students')
load_file('assets/data/ustc_students.csv', 'ustc_students') 
load_file('assets/data/ushe_students.csv', 'ushe_students')

## Linking Steps

Process of Record Linkage

0. Load
1. Preprocess
2. Index
3. Compare
4. Classify
5. Evaluate
6. Update (Post MPI - Adding new information, new MPI, to MPI Master Record)

### Master Person Architectures
Visualize data architecture store of MPI raw records.  NOTE: data may not be present in all database options.  Use whichever cell aligns with configured loading behavior.

**SQL** View

In [1]:
from db import get_session
import pandas as pd 


with get_session() as session:
    result = session.execute(
        'SELECT * FROM mpi_vectors LIMIT 5'
    ).fetchall()
    count_mpi = session.execute(
        'SELECT COUNT(*) FROM mpi_vectors'
    ).fetchone()
pd.DataFrame(result)

Unnamed: 0,0,1,2,3
0,1,,elvis,11
1,1,presley,elvis,12
2,1,costello,elvis,13
3,2,austin,jane,11
4,2,austin,janet,12


In [2]:
print('Total MPI in system: ', str(count_mpi[0]))

Total MPI in system:  5


**NoSQL** View

In [23]:
from db import get_mongo
import json

db = get_mongo()
count_docs = db.raw.count_documents({})
x = db.raw.find_one({})
x['_id'] = str(x['_id'])

print(json.dumps(x, indent=2))

{
  "_id": "5fe4c95dc3167f203746dc3d",
  "mpi": "14201705-13242408-16004929-5639130",
  "sources": [
    {
      "guid": 1489953705965142162,
      "score": 0.0,
      "fields": [
        {
          "fieldname": "birth_date_pool",
          "value": "6/9/1256"
        },
        {
          "fieldname": "FIRST_ENTERED_US",
          "value": "07-MAR-16 12.00.00.000000000 AM"
        },
        {
          "fieldname": "first_name_pool",
          "value": "genora"
        },
        {
          "fieldname": "gender_pool",
          "value": "f"
        },
        {
          "fieldname": "last_name_pool",
          "value": "zegarra"
        },
        {
          "fieldname": "middle_name_pool",
          "value": "rhino"
        },
        {
          "fieldname": "ssid_pool",
          "value": "0x218b2ea30x43165d46"
        },
        {
          "fieldname": "usbe_student_id_pool",
          "value": 666165
        },
        {
          "fieldname": "id",
          "value": 4639

In [2]:
# Delete all records (skip if not needed)
d = db.raw.delete_many({})
d.deleted_count

10000

In [3]:
print('Total MPI in system: ', count_docs)

Total MPI in system:  10000


## Prepare Data

Prepare identity view (MPI vectors) and data view (distinct mapped columns from source)

In [1]:
source_tablename = 'usbe_students'
# source_tablename = 'ustc_students'
# source_tablename = 'ushe_students'
# source_tablename = 'dws_wages'

In [2]:
from mpi.prepare import create_data_view, create_identity_view

dview = create_data_view(source_tablename)
iview = create_identity_view(mapped_columns=dview.subset.columns)

Mapped Columns:  ['BIRTH_DATE', 'FIRST_NAME', 'GENDER', 'LAST_NAME', 'MIDDLE_NAME', 'SSID', 'STUDENT_ID', 'guid']


In [3]:
dview.head()

Unnamed: 0,BIRTH_DATE,FIRST_ENTERED_US,FIRST_NAME,GENDER,LAST_NAME,MIDDLE_NAME,SSID,STUDENT_ID,id,guid
0,6/9/1256,07-MAR-16 12.00.00.000000000 AM,Genora,f,Zegarra,Rhino,0x218b2ea30x43165d46,666165,46394,1489953705965142162
1,9/5/1287,13-AUG-12 12.00.00.000000000 AM,Laramee,m,Vela Garcia,Legacy-Martin,0x2156f0e80x42ade1d0,976990,35263,1489953705965142162
2,3/11/1507,06-OCT-16 12.00.00.000000000 AM,Zelma,n,Garlitz,Katempa,0x2981558d0x5302ab1a,225275,148149,1489953705965142162
3,3/11/1898,24-FEB-97 12.00.00.000000000 AM,Rynale,n,Reuter,Javien,0x175401eb0x2ea803d6,445502272,126536,1489953705965142162
4,1/5/2380,05-SEP-05 12.00.00.000000000 AM,Weslee,n,Cavalcante,Milam,0x286b652b0x50d6ca56,437170,105139,1489953705965142162


In [4]:
dview.subset.head()

Unnamed: 0,birth_date_pool,FIRST_ENTERED_US,first_name_pool,gender_pool,last_name_pool,middle_name_pool,ssid_pool,usbe_student_id_pool,id,guid
0,6/9/1256,07-MAR-16 12.00.00.000000000 AM,Genora,f,Zegarra,Rhino,0x218b2ea30x43165d46,666165,46394,1489953705965142162
1,9/5/1287,13-AUG-12 12.00.00.000000000 AM,Laramee,m,Vela Garcia,Legacy-Martin,0x2156f0e80x42ade1d0,976990,35263,1489953705965142162
2,3/11/1507,06-OCT-16 12.00.00.000000000 AM,Zelma,n,Garlitz,Katempa,0x2981558d0x5302ab1a,225275,148149,1489953705965142162
3,3/11/1898,24-FEB-97 12.00.00.000000000 AM,Rynale,n,Reuter,Javien,0x175401eb0x2ea803d6,445502272,126536,1489953705965142162
4,1/5/2380,05-SEP-05 12.00.00.000000000 AM,Weslee,n,Cavalcante,Milam,0x286b652b0x50d6ca56,437170,105139,1489953705965142162


In [5]:
iview.head()

### Performance Option / Seeding
Check here for potential for match.  If a match is impossible on available fields, can circumvent the linkage process and generate the MPI's here.

In [6]:
# Check for match availability.  If not, halt process and create MPIs
from mpi.link import is_match_available
from mpi.update import generate_mpi, write_mpi_data, gen_mpi_insert
from mpi.update import update_mpi_vector_table
from mpi.prepare import standardize


if is_match_available(dview.subset, iview):
    print('Match available.  Proceed with linking process.')
else:
    print('Match unavailable.  Generated MPIs for data view.')
    write_mpi_data(
        gen_mpi_insert(
            generate_mpi(standardize(dview.subset))
        )
    )
    update_mpi_vector_table()
    
    # Recreate a view from the MPI table with valid identity data
    iview = create_identity_view(mapped_columns=dview.subset.columns)
    

checking available columns []
Match unavailable.  Generated MPIs for data view.


## Building record linkage and mpi classification

In [7]:
from mpi.prepare import standardize
from utils import match_dtype

### Preprocessing

Standardize data across data and identity views.

In [8]:
# Match Dtypes - Align data types prior to cleaning.
#    This helps the cleaner by segmenting string/object and numeric fields

# Cast columns to matching datatypes for comparisons later on
source_clean, id_clean = match_dtype(standardize(dview.subset), iview)

In [9]:
source_clean.head(1)

Unnamed: 0,birth_date_pool,FIRST_ENTERED_US,first_name_pool,gender_pool,last_name_pool,middle_name_pool,ssid_pool,usbe_student_id_pool,id,guid
0,6/9/1256,07-MAR-16 12.00.00.000000000 AM,genora,f,zegarra,rhino,0x218b2ea30x43165d46,666165,46394,1489953705965142162


In [10]:
id_clean.head(1)

Unnamed: 0,gender_pool,FIRST_ENTERED_US,freq_score,mpi,usbe_student_id_pool,ssid_pool,first_name_pool,middle_name_pool,last_name_pool,birth_date_pool,id
0,f,07-MAR-16 12.00.00.000000000 AM,1.0,14201705-13242408-16004929-5639130,666165,0x218b2ea30x43165d46,genora,rhino,zegarra,6/9/1256,46394


## Indexing

Make record pairs - pair rows needing match to potential identity candidates.

Indexing serves two purposes:

1. Create the list of pairs to check (candidate link).  Example: row 1 from table 1 to row 199 from table 2.

2. Reduce the potential number of pairs to check (candidates).

In [11]:
from mpi.index import build_indexer
from utils import match_dataframe_columns

In [12]:
# Create indexer on dataview
#    Indexer is a set of rules to generate 
#    candidate matches from data -> identities

source_matched, id_matched = match_dataframe_columns(source_clean, id_clean)

indexer = build_indexer(source_matched)

# Check index algorithms (generated from data view columns)
indexer.algorithms

[<SortedNeighbourhood left_on='first_name_pool', right_on='first_name_pool'>,
 <SortedNeighbourhood left_on='middle_name_pool', right_on='middle_name_pool'>,
 <SortedNeighbourhood left_on='last_name_pool', right_on='last_name_pool'>,
 <SortedNeighbourhood left_on='id', right_on='id'>,
 <Block left_on='usbe_student_id_pool', right_on='usbe_student_id_pool'>,
 <Block left_on='ssid_pool', right_on='ssid_pool'>,
 <Block left_on='id', right_on='id'>]

In [13]:
# Run indexer on dataview, identity view
candidates = indexer.index(source_matched, id_matched)

# Full indexing is a cross join of data and all possible identities.

# Demonstrating full indexing size:
print('Full Index Length: ', len(source_clean) * len(id_clean))

# Examine multi indices.  On the left is the data view index.  Right identity.
print('Algorithmic Index Length: ', len(candidates))

# Estimate Savings
print('Savings: ', (1- len(candidates)/(len(source_clean) * len(id_clean))) * 100)

# Preview indices:
for pair in candidates[0:5]:
    print(f'Data-row {pair[0]}', f'ID-row {pair[1]}')

Full Index Length:  100000000
Algorithmic Index Length:  114582
Savings:  99.885418
Data-row 0 ID-row 0
Data-row 0 ID-row 1707
Data-row 0 ID-row 2695
Data-row 0 ID-row 3252
Data-row 0 ID-row 4305


## Comparing

Indexing does not normally store the outcome of its findings.  Indexing algorithms are meant to be fast, can be error prone.  Algorithms can be tuned for string (many), numeric, and time/date fields.

The output of comparison is a clean feature matrix for the classifier to train/predict on.

In [14]:
from mpi.compare import build_comparator

In [15]:
# Create comparator on dataview
#    Comparator is a set of algorithms for each feature to be compared.
#    These are genearlly much more expensive compared to indexing functions
cmp = build_comparator(source_matched)

# Check comparison algorithms and fields
cmp.features

[<Numeric 'usbe_student_id_pool'>,
 <Exact 'ssid_pool'>,
 <Numeric 'id'>,
 <String 'gender_pool'>,
 <String 'first_name_pool'>,
 <String 'middle_name_pool'>,
 <String 'last_name_pool'>,
 <String 'id'>]

In [17]:
# Compute comparisons
#    Gives clean match dataset for classification
comparisons = cmp.compute(candidates, source_clean, id_clean)
comparisons.head()

TypeError: str argument expected

In [21]:
source_matched

Unnamed: 0,gender_pool,FIRST_ENTERED_US,usbe_student_id_pool,ssid_pool,first_name_pool,middle_name_pool,last_name_pool,birth_date_pool,id
0,f,07-MAR-16 12.00.00.000000000 AM,666165,0x218b2ea30x43165d46,genora,rhino,zegarra,6/9/1256,46394
1,m,13-AUG-12 12.00.00.000000000 AM,976990,0x2156f0e80x42ade1d0,laramee,legacy martin,vela garcia,9/5/1287,35263
2,n,06-OCT-16 12.00.00.000000000 AM,225275,0x2981558d0x5302ab1a,zelma,katempa,garlitz,3/11/1507,148149
3,n,24-FEB-97 12.00.00.000000000 AM,445502272,0x175401eb0x2ea803d6,rynale,javien,reuter,3/11/1898,126536
4,n,05-SEP-05 12.00.00.000000000 AM,437170,0x286b652b0x50d6ca56,weslee,milam,cavalcante,1/5/2380,105139
...,...,...,...,...,...,...,...,...,...
9995,m,25-DEC-04 12.00.00.000000000 AM,311049,0x169657b20x2d2caf64,mekaylie,randall j,tzunun,5/3/2727,66917
9996,m,27-JAN-14 12.00.00.000000000 AM,793712,0x2e52052f0x5ca40a5e,behrett,everhart,covarrubias,6/18/1650,16819
9997,f,20-JUL-08 12.00.00.000000000 AM,506989,0x1d500f3e0x3aa01e7c,alylah,mikhail,treft,10/4/1458,14273
9998,f,28-FEB-98 12.00.00.000000000 AM,555305,0x1fd3990c0x3fa73218,emsley,destry,kennington,10/9/2082,36743


## Classification

Score candidates for match.  

#### Two approaches: Supervised vs Unsupervised
 * **Supervised** approach requires a training set.
 * **Unsupervised** does not require a training set and operates on only on the comparison table itself.

In [20]:
from mpi.classify import estimate_true, build_classifier

# Get estimated true linkages for supervised model
links_true = estimate_true(comparisons)

# Create classifier
clf = build_classifier('logistic', comparisons, match_index=links_true)

# Check probabilities (score) of each comparison -- NOT IN USE IN THIS VERSION
predictions = clf.prob(comparison_vectors=comparisons)
predictions

0     0       0.999409
      1707    0.000025
      4305    0.000001
      4980    0.000005
      6265    0.000005
                ...   
9999  5475    0.000106
      6678    0.000199
      7860    0.000005
      9752    0.000005
      9999    0.999409
Length: 94600, dtype: float64

## Evaluate
Express classification quality and explore outliers

In [21]:
from recordlinkage import reduction_ratio
from recordlinkage import confusion_matrix

links_pred = clf.predict(comparison_vectors=comparisons)

rratio = reduction_ratio(links_pred, source_clean)
cmatrix = confusion_matrix(links_true, links_pred, candidates)

In [22]:
# Review confusion matrix
#    TP-FN
#    |  |
#    FP-TN
print(cmatrix)

# Review reduction ratio
print(rratio)

[[ 9999     0]
 [    1 84600]]
0.9997999799979999


The confusion matrix may not be particularly useful here as generation of true links is prone to error. The reduction ratio is more sensitive than binary predictions in this case.

In [23]:
# Review findings
#   Interesting that the logistic predicted an MPI indices for each given an incomplete target list.

# Is the relationship 1,1?
split_list = lambda x: ([ix[0] for ix in x], [ix[1] for ix in x])
i1, i2 = split_list(links_pred)
len(list(set(i1))), len(list(set(i2)))

(10000, 10000)

## Update

Append matched MPIs and match score to data view and merge to original data.

In [24]:
from mpi.update import append_mpi, write_matched_unmatched
from utils import union_frames

# Attach matched MPIs and generate MPIs for unmatched rows
matched, unmatched = append_mpi(source_clean, id_clean, links_pred)

# Write matched and unmatched data to DB
write_matched_unmatched(matched, unmatched)

# Combine matched, unmatched
combined = union_frames(matched, unmatched)

In [27]:
dview.head()

Unnamed: 0,BIRTH_DATE,FIRST_ENTERED_US,FIRST_NAME,GENDER,LAST_NAME,MIDDLE_NAME,SSID,STUDENT_ID,id,guid
0,6/9/1256,07-MAR-16 12.00.00.000000000 AM,Genora,f,Zegarra,Rhino,0x218b2ea30x43165d46,666165,46394,1489953705965142162
1,9/5/1287,13-AUG-12 12.00.00.000000000 AM,Laramee,m,Vela Garcia,Legacy-Martin,0x2156f0e80x42ade1d0,976990,35263,1489953705965142162
2,3/11/1507,06-OCT-16 12.00.00.000000000 AM,Zelma,n,Garlitz,Katempa,0x2981558d0x5302ab1a,225275,148149,1489953705965142162
3,3/11/1898,24-FEB-97 12.00.00.000000000 AM,Rynale,n,Reuter,Javien,0x175401eb0x2ea803d6,445502272,126536,1489953705965142162
4,1/5/2380,05-SEP-05 12.00.00.000000000 AM,Weslee,n,Cavalcante,Milam,0x286b652b0x50d6ca56,437170,105139,1489953705965142162


In [38]:
from mpi.update import expand_match_to_raw, append_mpi

# Join data view (DISTINCT identities in source table), now containing matched and generated MPIs, to raw table.
#    This can be done a few ways.  Here, the data view (whose columns have been renamed and processed)
#    is joined to the original subset (whose columns were just renamed).  The subset is then indexed back 
#    unto the raw table so original column names and source formatting are preserved.


updated, matched, unmatched = expand_match_to_raw(raw, subset, source_data, id_data, links_pred)
updated.head(1)

NameError: name 'raw' is not defined

In [87]:
# Update the MPI Vectors table for future use
from mpi.update import update_mpi_vector_table
update_mpi_vector_table()

### De-Identification

Create de-identified table while match available in memory or as referenced temp table.

In [88]:
from assets.mapping import colmap
from db import dataframe_to_db
from di import simple_di

dataframe_to_db(
    simple_di(updated), 
    tablename=source_tablename + '_di'
)

'ustc_students_di'

## Flag MPI

Rule 1:  MPI contains disagreement in blocking identifiers (local_id, ssn, ssid)

Rule 2:  Blocking identifer shared between multiple MPI

In [1]:
from mpi.postprocess import Rule1, Rule2

In [2]:
f = Rule1()
f.result

Unnamed: 0,mpi,flag
0,15918315-13223288-3889668-8685359,Rule1


In [3]:
f = Rule2()
f.result

Unnamed: 0,flag
