In [10]:
import pandas as pd
import os
, 
from utils.models import *

## Ingest
Bring .csv into the system

In [41]:
from ingest import load_file
os.listdir('assets/data')

['dws_wages.csv',
 'usbe_students.csv',
 'ushe_students.csv',
 'ustc_students.csv']

In [42]:
load_file('assets/data/dws_wages.csv', 'dws_wages')
load_file('assets/data/usbe_students.csv', 'usbe_students')
load_file('assets/data/ustc_students.csv', 'ustc_students')
load_file('assets/data/ushe_students.csv', 'ushe_students')

## Linking Steps

Process of Record Linkage

0. Load
1. Preprocess
2. Index
3. Compare
4. Classify
5. Evaluate
6. Update (Post MPI - Adding new information, new MPI, to MPI Master Record)

### Master Person Architectures
Visualize data architecture store of MPI raw records.  NOTE: data may not be present in all database options.  Use whichever cell aligns with configured loading behavior.

**SQL** View

In [15]:
from utils.db import get_session

with get_session() as session:
    result = session.execute(
        'SELECT * FROM master_person_long LIMIT 5'
    ).fetchall()
    count_mpi = session.execute(
        'SELECT COUNT(DISTINCT(mpi)) FROM master_person_long'
    ).fetchone()
columns = ('mpi', 'field', 'value', 'score', 'guid')
pd.DataFrame(result, columns=columns)

NameError: name 'pd' is not defined

In [None]:
print('Total MPI in system: ', count_mpi[0])

**NoSQL** View

In [1]:
from utils.db import get_mongo
import json

db = get_mongo()
count_docs = db.raw.count_documents({})
x = db.raw.find_one({})
x['_id'] = str(x['_id'])

print(json.dumps(x, indent=2))

TypeError: 'NoneType' object is not subscriptable

In [2]:
# Delete all records (skip if not needed)
d = db.raw.delete_many({})
d.deleted_count

0

In [3]:
print('Total MPI in system: ', count_docs)

Total MPI in system:  0


## Prepare Data

Prepare identity view (MPI vectors) and data view (distinct mapped columns from source)

In [7]:
source_tablename = 'ushe_students'
# source_tablename = 'dws_wages'

In [8]:
from utils.db import query_db, get_db
from mpi.prepare import create_data_view, create_identity_view

raw, subset = create_data_view(source_tablename)
dview = subset.drop_duplicates()
iview = create_identity_view(mapped_columns=dview.columns.to_list())

### TESTING ONLY (Performance Option / Seeding)
Check here for potential for match.  If a match is impossible on available fields, can circumvent the linkage process and generate the MPI's here.

In [9]:
# Check for match availability.  If not, halt process and create MPIs
from mpi.link import is_match_available
from mpi.update import generate_mpi, write_mpi_data, gen_mpi_insert
from mpi.update import update_mpi_vector_table

if is_match_available(dview, iview):
    print('Match available.  Proceed with linking process.')
else:
    print('Match unavailable.  Generated MPIs for data view.')
    temp = generate_mpi(dview)
    write_mpi_data(gen_mpi_insert(temp))
    update_mpi_vector_table()
    
    # Recreate a view from the MPI table with valid identity data
    iview = create_identity_view(mapped_columns=dview.columns.tolist())
    

Match available.  Proceed with linking process.


In [10]:
raw.head(1)

Unnamed: 0,index,S_INST,S_YEAR,S_TERM,S_EXTRACT,S_ID,S_ID_FLAG,S_PREVIOUS_ID,S_LAST,S_FIRST,...,S_BIA,S_TERM_ATT_CR,S_TERM_EARNED_CR,S_COLLEGE,S_MAJOR,S_COLLEGE2,S_MAJOR2,S_INAME,id,guid
0,0,1858,2018,3,E,469629.0,I,187277,Steed,Sakayla,...,,4,126,Huntsman School of Business,Deaf Ed & Elementary Ed,School of the Arts,Network Systems,SNOW,7853,270603007341519345


In [11]:
dview.head(1)

Unnamed: 0,ushe_student_id_pool,last_name_pool,first_name_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,ssn_pool,guid
0,469629.0,Steed,Sakayla,Conder,6/20/2859,m,0x23570dbc0x46ae1b78,375910347,270603007341519345


In [12]:
iview.head(1)

Unnamed: 0,ssn_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,last_name_pool,first_name_pool,freq_score,mpi
0,885713242,Sky Marie,,,,Neddo,Giniveve Dee,1.0,9430854-11279535-9947192-1990676


In [29]:
iview.dropna(axis=1)

Unnamed: 0,ssn_pool,middle_name_pool,last_name_pool,freq_score,mpi
0,885713242,Sky Marie,Neddo,1.0,9430854-11279535-9947192-1990676
1,180469190,Hierophany,Grawrock,1.0,16751823-1129864-15931392-364986
2,471824371,Sigmar,Berge,1.0,6138267-13889458-5068645-15264702
3,714487489,Raudel,Galindo,1.0,10234143-6960907-14723586-12378303
4,599583779,Hamilton,Martir-Morrell,1.0,14152434-1390101-12479164-15959908
...,...,...,...,...,...
9995,470950552,Howard W,Holdaway,1.0,15195628-11158377-827624-15013225
9996,657429524,Nehemias Kapeneta,Beaver,1.0,2789977-13248583-8575217-16354988
9997,782103857,Calvin Reed,Gillis,1.0,2548935-14144592-12043249-8708308
9998,692428862,Nashawn,Lelle,1.0,14454687-8962619-13927059-3420850


In [13]:
len(iview)

10000

## Building record linkage and mpi classification

In [14]:
from mpi.preprocess import clean_raw, match_dtype

### Preprocessing

Standardize data across data and identity views.

In [15]:
# Match Dtypes - Align data types prior to cleaning.
#    This helps the cleaner by segmenting string/object and numeric fields

if hasattr(iview, 'freq_score'):
    iscore = iview[['mpi', 'freq_score']]
else:
    iscore = None

# Cast columns to matching datatypes for comparisons later on
source_data, id_data = match_dtype(dview, iview)  

# Clean data and re-index comparison.
subset = clean_raw(subset)
source_data = clean_raw(source_data)
id_data = clean_raw(id_data)

In [16]:
source_data.head(1)

Unnamed: 0,ushe_student_id_pool,last_name_pool,first_name_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,ssn_pool,guid
0,469629.0,steed,sakayla,conder,6/20/2859,m,0x23570dbc0x46ae1b78,375910347,270603007341519345


In [17]:
id_data.head(1)

Unnamed: 0,ssn_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,last_name_pool,first_name_pool,freq_score,mpi
0,885713242,sky marie,,,,neddo,giniveve dee,1.0,9430854-11279535-9947192-1990676


In [18]:
iscore.head(1)

Unnamed: 0,mpi,freq_score
0,9430854-11279535-9947192-1990676,1.0


## Indexing

Make record pairs - pair rows needing match to potential identity candidates.

Indexing serves two purposes:

1. Create the list of pairs to check (candidate link).  Example: row 1 from table 1 to row 199 from table 2.

2. Reduce the potential number of pairs to check (candidates).

In [19]:
from mpi.index import build_indexer
from utils import match_dataframe_columns

In [20]:
# Create indexer on dataview
#    Indexer is a set of rules to generate 
#    candidate matches from data -> identities

source_matched, id_matched = match_dataframe_columns(source_data, id_data)

indexer = build_indexer(source_matched)

# Check index algorithms (generated from data view columns)
indexer.algorithms

[<SortedNeighbourhood left_on='middle_name_pool', right_on='middle_name_pool'>,
 <SortedNeighbourhood left_on='last_name_pool', right_on='last_name_pool'>,
 <SortedNeighbourhood left_on='first_name_pool', right_on='first_name_pool'>,
 <Block left_on='ssn_pool', right_on='ssn_pool'>,
 <Block left_on='ssid_pool', right_on='ssid_pool'>]

In [21]:
# Run indexer on dataview, identity view
candidates = indexer.index(source_matched, id_matched)

# Full indexing is a cross join of data and all possible identities.

# Demonstrating full indexing size:
print('Full Index Length: ', len(source_data) * len(id_data))

# Examine multi indices.  On the left is the data view index.  Right identity.
print('Algorithmic Index Length: ', len(candidates))

# Estimate Savings
print('Savings: ', (1- len(candidates)/(len(source_data) * len(id_data))) * 100)

# Preview indices:
for pair in candidates[0:5]:
    print(f'Data-row {pair[0]}', f'ID-row {pair[1]}')

Full Index Length:  100000000
Algorithmic Index Length:  57500
Savings:  99.9425
Data-row 0 ID-row 4283
Data-row 0 ID-row 5239
Data-row 0 ID-row 6384
Data-row 0 ID-row 7276
Data-row 1 ID-row 1283


## Comparing

Indexing does not normally store the outcome of its findings.  Indexing algorithms are meant to be fast, can be error prone.  Algorithms can be tuned for string (many), numeric, and time/date fields.

The output of comparison is a clean feature matrix for the classifier to train/predict on.

In [22]:
from mpi.compare import build_comparator

In [23]:
# Create comparator on dataview
#    Comparator is a set of algorithms for each feature to be compared.
#    These are genearlly much more expensive compared to indexing functions
cmp = build_comparator(source_matched)

# Check comparison algorithms and fields
cmp.features

[<Numeric 'ssn_pool'>,
 <Exact 'ssid_pool'>,
 <String 'middle_name_pool'>,
 <String 'gender_pool'>,
 <String 'last_name_pool'>,
 <String 'first_name_pool'>]

In [24]:
# Compute comparisons
#    Gives clean match dataset for classification
comparisons = cmp.compute(candidates, source_data, id_data)
comparisons.head()

Unnamed: 0,Unnamed: 1,ssn_pool,ssid_pool,middle_name_pool,gender_pool,last_name_pool,first_name_pool
0,4283,0.0,0,1.0,0.0,0.0,0.0
0,5239,0.0,0,0.0,0.0,0.0,1.0
0,6384,0.0,0,1.0,0.0,0.0,0.0
0,7276,0.0,0,1.0,0.0,0.0,0.0
1,1283,0.0,0,0.0,0.0,1.0,0.0


## Classification

Score candidates for match.  

#### Two approaches: Supervised vs Unsupervised
 * **Supervised** approach requires a training set.
 * **Unsupervised** does not require a training set and operates on only on the comparison table itself.

In [25]:
from mpi.classify import estimate_true, build_classifier

# Get estimated true linkages for supervised model
links_true = estimate_true(comparisons)

# Create classifier
clf = build_classifier('logistic', comparisons, match_index=links_true)

# Check probabilities (score) of each comparison -- NOT IN USE IN THIS VERSION
predictions = clf.prob(comparison_vectors=comparisons)
predictions

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

## Evaluate
Express classification quality and explore outliers

In [23]:
from recordlinkage import reduction_ratio
from recordlinkage import confusion_matrix

links_pred = clf.predict(comparison_vectors=comparisons)

rratio = reduction_ratio(links_pred, source_data)
cmatrix = confusion_matrix(links_true, links_pred, candidates)

In [24]:
# Review confusion matrix
#    TP-FN
#    |  |
#    FP-TN
print(cmatrix)

# Review reduction ratio
print(rratio)

[[ 9999     0]
 [    1 84636]]
0.9997999799979999


The confusion matrix may not be particularly useful here as generation of true links is prone to error. The reduction ratio is more sensitive than binary predictions in this case.

In [25]:
# Review findings
#   Interesting that the logistic predicted an MPI indices for each given an incomplete target list.

# Is the relationship 1,1?
split_list = lambda x: ([ix[0] for ix in x], [ix[1] for ix in x])
i1, i2 = split_list(links_pred)
len(list(set(i1))), len(list(set(i2)))

(10000, 10000)

## Update

Append matched MPIs and match score to data view and merge to original data.

In [36]:
from mpi.update import expand_match_to_raw

# Join data view (DISTINCT identities in source table), now containing matched and generated MPIs, to raw table.
#    This can be done a few ways.  Here, the data view (whose columns have been renamed and processed)
#    is joined to the original subset (whose columns were just renamed).  The subset is then indexed back 
#    unto the raw table so original column names and source formatting are preserved.


updated, matched, unmatched = expand_match_to_raw(raw, subset, source_data, id_data, links_pred)
updated.head(1)

Unnamed: 0,CLMWAGES,EMPLOYER,ES_WORKSITE_NUM,NAICS,WAGES,YRQTR,SSN,LASTNAME,FIRSTNAME,MIDDLEINITIAL,id,guid,mpi
0,14188,HOLLYWOOD VIDEO,0,873477,7898926,20172,885713242,Neddo,Giniveve Dee,Sky Marie,10465,942117639377991468,


In [37]:
# Update the MPI Vectors table for future use
update_mpi_vector_table()

IndexError: list index out of range

### De-Identification

Create de-identified table while match available in memory or as referenced temp table.

In [51]:
from assets.mapping import colmap
from utils.db import dataframe_to_db
from di import simple_di

dataframe_to_db(
    simple_di(updated), 
    tablename=source_tablename + '_di'
)

'dws_wages_di'