In [1]:
from mpi import create_distinct_view, create_identity_view, link
import pandas as pd
import os
from utils.db import get_session
from utils.models import *

## Ingest
Bring .csv into the system

In [2]:
from ingest import load_file
os.listdir('assets/data')

['dws_wages.csv',
 'usbe_students.csv',
 'ushe_students.csv',
 'ustc_students.csv']

In [3]:
load_file('assets/data/dws_wages.csv', 'dws_wages')
load_file('assets/data/usbe_students.csv', 'usbe_students')
load_file('assets/data/ustc_students.csv', 'ustc_students')
load_file('assets/data/ushe_students.csv', 'ushe_students')

## Linking Steps

In [4]:
# Visualize raw MPI data
#  V0.1  Data is seeded with ushe_students table
#  TODO: implement matched route and proper identification of unmatched

with get_session() as session:
    result = session.execute(
        'SELECT * FROM master_person_long LIMIT 5'
    ).fetchall()
    count_mpi = session.execute(
        'SELECT COUNT(DISTINCT(mpi)) FROM master_person_long'
    ).fetchone()
columns = ('mpi', 'field', 'value', 'score', 'guid')
pd.DataFrame(result, columns=columns)

Unnamed: 0,mpi,field,value,score,guid
0,4614774-16011216-593186-10763907,ushe_student_id_pool,469629.0,1.0,775123813349145184
1,4614774-16011216-593186-10763907,last_name_pool,Steed,1.0,775123813349145184
2,4614774-16011216-593186-10763907,first_name_pool,Sakayla,1.0,775123813349145184
3,4614774-16011216-593186-10763907,middle_name_pool,Conder,1.0,775123813349145184
4,4614774-16011216-593186-10763907,birth_date_pool,6/20/2859,1.0,775123813349145184


In [5]:
count_mpi

(10000,)

In [6]:
# Create a view of the data with mapped columns
raw, subset = create_distinct_view('ushe_students')
dview = subset.drop_duplicates()
# print(dview.head())

# Create a view from the MPI table with valid identity data
iview = create_identity_view(dview.columns.tolist())
# print(iview.head())

t1, t2 = dview, iview #link(dview, iview)  # MULTIPLE LINKING NOT SUPPORTED

In [7]:
t1.head(1)

Unnamed: 0,ushe_student_id_pool,last_name_pool,first_name_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,ssn_pool,guid
0,469629.0,Steed,Sakayla,Conder,6/20/2859,m,0x23570dbc0x46ae1b78,375910347,1027519456284287867


In [8]:
t2.head(1)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,score,score,score,score,score,score,score,score,score
field,birth_date_pool,first_name_pool,gender_pool,guid,last_name_pool,middle_name_pool,ssid_pool,ssn_pool,ushe_student_id_pool,birth_date_pool,first_name_pool,gender_pool,guid,last_name_pool,middle_name_pool,ssid_pool,ssn_pool,ushe_student_id_pool
mpi,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
10003201-9614165-4958245-12107289,3/12/2754,Drake,m,775123813349145184,McClean,Samba,0x21c4ecab0x4389d956,346913149,422191.0,1,1,1,1,1,1,1,1,1


## Building record linkage and mpi classification

In [9]:
from mpi.link import clean_raw, match_dtype

### Preprocessing

Standardize data across data and identity views.

In [10]:
# Match Dtypes - Align data types prior to cleaning.
#    This helps the cleaner by segmenting string/object and numeric fields

ct1, ct2 = match_dtype(t1, t2.value)  # t2 identity table is a multi-index table.

# Clean data and align formats, fields, for indexing and comparison.
subset = clean_raw(subset)
ct1 = clean_raw(ct1)
ct2 = clean_raw(ct2)
ct2 = ct2.reset_index(level='mpi')
scores = t2.score

In [11]:
ct1.head(1)

Unnamed: 0,ushe_student_id_pool,last_name_pool,first_name_pool,middle_name_pool,birth_date_pool,gender_pool,ssid_pool,ssn_pool,guid
0,469629.0,steed,sakayla,conder,6/20/2859,m,0x23570dbc0x46ae1b78,375910347,1027519456284287867


In [12]:
ct2.head(1)

field,mpi,birth_date_pool,first_name_pool,gender_pool,guid,last_name_pool,middle_name_pool,ssid_pool,ssn_pool,ushe_student_id_pool
0,10003201-9614165-4958245-12107289,3/12/2754,drake,m,775123813349145184,mcclean,samba,0x21c4ecab0x4389d956,346913149,422191.0


In [13]:
scores.head(1)

field,birth_date_pool,first_name_pool,gender_pool,guid,last_name_pool,middle_name_pool,ssid_pool,ssn_pool,ushe_student_id_pool
mpi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10003201-9614165-4958245-12107289,1,1,1,1,1,1,1,1,1


## Indexing

Make record pairs - pair rows needing match to potential identity candidates.

In [14]:
from mpi.link import build_indexer, view_indices

In [15]:
# Create indexer on dataview
#    Indexer is a set of rules to generate 
#    candidate matches from data -> identities
indexer = build_indexer(ct1)

# Check index algorithms (generated from data view columns)
indexer.algorithms

[<SortedNeighbourhood left_on='last_name_pool', right_on='last_name_pool'>,
 <SortedNeighbourhood left_on='first_name_pool', right_on='first_name_pool'>,
 <SortedNeighbourhood left_on='middle_name_pool', right_on='middle_name_pool'>,
 <Block left_on='ushe_student_id_pool', right_on='ushe_student_id_pool'>,
 <Block left_on='ssid_pool', right_on='ssid_pool'>,
 <Block left_on='ssn_pool', right_on='ssn_pool'>]

In [16]:
# Run indexer on dataview, identity view
candidates = indexer.index(ct1, ct2)

# Examine multi indices.  On the left is the data view index.  Right identity.
candidates[0:5]

MultiIndex([(0,   18),
            (0, 1994),
            (0, 2386),
            (0, 3332),
            (0, 4276)],
           )

## Comparing

Create field comparators to run on candidates identified during indexing

In [17]:
from mpi.link import build_comparator

In [18]:
# Create comparator on dataview
#    Comparator is a set of algorithms for each feature to be compared.
#    These are genearlly much more expensive compared to indexing functions
cmp = build_comparator(ct1)

# Check comparison algorithms and fields
cmp.features

[<Numeric 'ushe_student_id_pool'>,
 <Exact 'ssid_pool'>,
 <Numeric 'ssn_pool'>,
 <String 'last_name_pool'>,
 <String 'first_name_pool'>,
 <String 'middle_name_pool'>]

In [19]:
# Compute comparisons
#    Gives clean match dataset for classification
comparisons = cmp.compute(candidates, ct1, ct2)
comparisons.head()

Unnamed: 0,Unnamed: 1,ushe_student_id_pool,ssid_pool,ssn_pool,last_name_pool,first_name_pool,middle_name_pool
0,18,0.0,0,0.0,0.0,0.0,1.0
0,1994,0.0,0,0.0,0.0,0.0,1.0
0,2386,0.0,0,0.0,1.0,0.0,0.0
0,3332,0.0,0,0.0,0.0,0.0,0.0
0,4276,0.0,0,0.0,0.0,0.0,0.0


## Classification
Score candidates for match.  

#### Two approaches: Supervised vs Unsupervised
 * **Supervised** approach requires a training set.
 * **Unsupervised** does not require a training set and operates on only on the comparison table itself.

In [20]:
from mpi.link import estimate_true, create_classifier

# Get estimated true linkages for supervised model
links_true = estimate_true(comparisons)

# Create classifier
clf = create_classifier('logistic', comparisons, match_index=links_true)

# Check probabilities (score) of each comparison
print(len(links_true))
print(clf.prob(comparison_vectors=comparisons)[0:5])

9998
0  18      0.000058
   1994    0.000058
   2386    0.000043
   3332    0.000003
   4276    0.000003
dtype: float64


## Evaluate
Express classification quality and explore outliers

In [21]:
from recordlinkage import reduction_ratio
from recordlinkage import confusion_matrix

links_pred = clf.predict(comparison_vectors=comparisons)

rratio = reduction_ratio(links_pred, ct1)
cmatrix = confusion_matrix(links_true, links_pred, candidates)

In [22]:
# Review confusion matrix
#    TP-FN
#    |  |
#    FP-TN
print(cmatrix)

# Review reduction ratio
print(rratio)

[[ 9998     0]
 [    2 84812]]
0.9997999799979999


The confusion matrix may not be particularly useful here as generation of true links is prone to error. The reduction ratio is more sensitive than binary predictions in this case.

In [23]:
# Review findings
#   Interesting that the logistic predicted an MPI indices for each given an incomplete target list.

# Is the relationship 1,1?
split_list = lambda x: ([ix[0] for ix in x], [ix[1] for ix in x])
i1, i2 = split_list(links_pred)
len(list(set(i1))), len(list(set(i2)))

(10000, 10000)

## Update

Append matched MPIs and match score to data view and merge to original data.

In [26]:
from mpi.link import expand_match_to_raw

# Join data view with MPI with raw table
#     Create merge condition
raw = expand_match_to_raw(raw, subset, ct1, ct2, links_pred)
raw.head(1)

Unnamed: 0,index,S_INST,S_YEAR,S_TERM,S_EXTRACT,S_ID,S_ID_FLAG,S_PREVIOUS_ID,S_LAST,S_FIRST,...,S_TERM_ATT_CR,S_TERM_EARNED_CR,S_COLLEGE,S_MAJOR,S_COLLEGE2,S_MAJOR2,S_INAME,id,guid,mpi
0,0,1858,2018,3,E,469629.0,I,187277,Steed,Sakayla,...,4,126,Huntsman School of Business,Deaf Ed & Elementary Ed,School of the Arts,Network Systems,SNOW,7853,1027519456284287867,4614774-16011216-593186-10763907


### De-Identification

Create de-identified table while match available in memory or as referenced temp table.

In [48]:
from assets.mapping import colmap
from utils.db import dataframe_to_db
from di import simple_di

dataframe_to_db(simple_di(raw), 'ushe_students_di')

'USHE_STUDENTS_DI'