# LIMS probabilistic linkage

In [1]:
# Imports

import os, sys
from os import listdir, sep
from os.path import abspath, basename, isdir
from sys import argv
import pandas as pd
import pickle
import random
import numpy as np
import recordlinkage as rl
from recordlinkage import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
import collections
import classifiers

# Settings

# Text encoding constant; not referenced by any other visible cell.
encoding = 'ISO-8859-1'

# Variables

# Directory (relative to the working directory) that holds the linkage inputs.
INPUTDATA = "data/LIMS/probabilisticLinkage/"

# Workbook loaded by read_targetdata().
TARGET_FILENAME = "New lims extract.xlsx"
# NOTE(review): has no file extension and is not used by any visible cell -- confirm the intended file.
QUERY_FILENAME = "LINC Gold Standard - Original Testing Registers"

# Reading functions

def read_targetdata():
    """Load and normalise the target extract.

    Reads ``TARGET_FILENAME`` from ``INPUTDATA``, keeps the identifier, name,
    sex, birth date and location columns, renames them, cleans the text
    columns, recodes sex, parses the birth date and drops duplicate rows.

    Returns:
        pandas.DataFrame with columns patid1, patid2, patid3, firstname,
        surname, sex, birthdate, locname and a fresh 0..n-1 index.
    """
    source_columns = ["PATID", "patnumber", "accessnumber", "FIRSTNAME", "NAME", "SEX", "birthdate", "LOCNAME"]
    target_columns = ["patid1", "patid2", "patid3", "firstname", "surname", "sex", "birthdate", "locname"]

    # `sheet_name` is the keyword read_excel understands; `sheet` is not a
    # read_excel parameter.
    # NOTE(review): confirm the worksheet is really called "Sheet 1".
    df = pd.read_excel(os.path.join(INPUTDATA, TARGET_FILENAME), sheet_name = "Sheet 1")
    df = df[source_columns]
    df = df.rename(dict(zip(source_columns, target_columns)), axis = 1)
    df = df.drop_duplicates()
    # First Name
    df.firstname = preprocessing.clean(df.firstname)
    # Surname
    df.surname = preprocessing.clean(df.surname)
    # Sex: numeric codes 1/2 become "m"/"f"; 0 becomes missing.
    df.loc[df.sex == 1, ["sex"]] = "m"
    df.loc[df.sex == 2, ["sex"]] = "f"
    df.loc[df.sex == 0, ["sex"]] = np.nan
    # Date: unparseable values become NaT instead of raising.
    df.birthdate = pd.to_datetime(df["birthdate"], errors = "coerce")
    # Locname: clean, then strip digit runs. The pattern is a regular
    # expression, so say so explicitly and use a raw string.
    df.locname = preprocessing.clean(df.locname)
    df.locname = df.locname.str.replace(r'\d+', '', regex = True)
    # Cleaning can make previously distinct rows identical.
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

def read_sourcedata():
    """Placeholder for the source/query reader; not implemented, returns None."""
    return None

# Load the cleaned extract once; both samples below are slices of it.
df = read_targetdata()
# Target sample: rows 50..1049, re-indexed from 0.
dft = df[50:1050].copy()
dft = dft.reset_index(drop=True)
# Query sample: rows 0..999, so it shares rows 50..999 with the target sample.
dfq = df[:1000].copy()
dfq = dfq.reset_index(drop=True)


## Indexing

In [2]:
# Columns whose joined text is vectorised; passed for both frames in the partition() call below.
columns = ["firstname", "surname"]
# Upper bound of the character n-gram range handed to ngram_vectorize().
ngrams = 2
# Number of clusters (blocking partitions) handed to clustering().
k = 3

def join(v):
    """Join the items of ``v`` with single spaces, skipping items whose str() is "nan"."""
    kept = []
    for item in v:
        if str(item) == "nan":
            continue
        kept.append(item)
    return " ".join(kept)

def ngram_vectorize(df, cols, n):
    """Build character n-gram count vectors from the joined ``cols`` of ``df``.

    Each row's ``cols`` values are merged with join(); a CountVectorizer with
    character n-grams of length 1..n is fitted on those strings.

    Returns:
        (fitted vectorizer, dense count array) for the rows of ``df``.
    """
    texts = df[cols].apply(join, axis=1)
    vectorizer = CountVectorizer(analyzer = "char", ngram_range = (1, n))
    # fit() returns the vectorizer itself, so the calls can be chained.
    X = vectorizer.fit(texts).transform(texts).toarray()
    return vectorizer, X

def clustering(X, k):
    """Fit a MiniBatchKMeans with ``k`` clusters (seed 42) on ``X``.

    Returns:
        (fitted model, cluster label predicted for every row of ``X``).
    """
    model = MiniBatchKMeans(n_clusters = k, random_state = 42)
    model.fit(X)
    return model, model.predict(X)

def partition_target(df, cols, n, k):
    """Vectorise and cluster ``df``, storing the cluster id in ``df["partition"]``.

    ``df`` is modified in place (a "partition" column is added or overwritten).

    Returns:
        (df, fitted vectorizer, fitted k-means model).
    """
    vectorizer, counts = ngram_vectorize(df, cols, n)
    kmeans, labels = clustering(counts, k)
    df["partition"] = labels
    return df, vectorizer, kmeans

def partition_query(df, cols, vectorizer, kmeans):
    """Assign ``df`` rows to partitions using an already fitted vectorizer and model.

    ``df`` is modified in place (a "partition" column is added or overwritten)
    and returned.
    """
    texts = df[cols].apply(join, axis=1)
    counts = vectorizer.transform(texts).toarray()
    df["partition"] = kmeans.predict(counts)
    return df

def partition(dfq, dft, colst, colsq, ngrams, k):
    """Partition the target frame, then place the query frame in the same partitions.

    The vectorizer and cluster model are fitted on ``dft`` (columns ``colst``)
    and reused for ``dfq`` (columns ``colsq``).

    Returns:
        (dfq, dft), each carrying a "partition" column.
    """
    dft, fitted_vectorizer, fitted_kmeans = partition_target(dft, colst, ngrams, k)
    return partition_query(dfq, colsq, fitted_vectorizer, fitted_kmeans), dft

def block(dfq, dft):
    """Return the candidate pairs of ``dfq`` x ``dft`` that share a "partition" value."""
    pair_indexer = rl.Index()
    pair_indexer.block('partition')
    return pair_indexer.index(dfq, dft)

# Adds a "partition" column to both frames; the model is fitted on the target frame.
dfq, dft = partition(dfq, dft, columns, columns, ngrams, k)
# Candidate (query, target) pairs restricted to matching partitions.
pairs = block(dfq, dft)

## Expand

In [335]:
def expand_name(dfs, dfe):
    """Copy the name columns into ``dfe`` and add their phonetic encodings.

    For "firstname" and "surname", ``dfe`` receives the column itself plus
    ``<col>_soundex``, ``<col>_nysiis`` and ``<col>_metaphone`` computed from
    ``dfs``. ``dfe`` is modified in place and returned.
    """
    for col in ("firstname", "surname"):
        dfe[col] = dfs[col]
        for algorithm in ("soundex", "nysiis", "metaphone"):
            dfe[col + "_" + algorithm] = preprocessing.phonetic(dfs[col], method = algorithm)
    return dfe

def expand_birthdate(dfs, dfe):
    """Add parsed, string and year views of the birth date to ``dfe``.

    Args:
        dfs: source frame with a "birthdate" column.
        dfe: frame to extend; modified in place.

    Returns:
        ``dfe`` with "birthdate" (datetime, NaT where unparseable),
        "birthdate_string" (ISO ``YYYY-MM-DD`` text, missing where the date is
        NaT) and "birthyear".
    """
    dfe["birthdate"] = pd.to_datetime(dfs["birthdate"], errors = "coerce")
    # strftime has to be *called*; assigning the bound method stored a
    # function object in every row. Format the coerced column so that inputs
    # which were not datetimes yet work as well.
    dfe["birthdate_string"] = dfe["birthdate"].dt.strftime("%Y-%m-%d")
    dfe["birthyear"] = dfe.birthdate.dt.year
    return dfe

def expand_sex(dfs, dfe):
    """Copy the "sex" column of ``dfs`` into ``dfe`` (in place) and return ``dfe``."""
    sex_column = dfs["sex"]
    dfe["sex"] = sex_column
    return dfe

def expand(dfs):
    """Return a copy of ``dfs`` extended with the name, birth date and sex features."""
    dfe = dfs.copy()
    # Each step reads from the untouched source and writes into the copy.
    for step in (expand_name, expand_birthdate, expand_sex):
        dfe = step(dfs, dfe)
    return dfe

# Rebinds both names to their expanded copies; the originals are no longer reachable.
dft = expand(dft)
dfq = expand(dfq)

## Compare

In [394]:
%%time

def compare_name(comp, col):
    """Register the comparisons for one name column on ``comp``.

    Adds an exact match, a Jaro-Winkler string similarity and exact matches
    on the soundex, nysiis and metaphone encodings of ``col``.

    Returns:
        The same ``comp`` object.
    """
    comp.exact(col, col, label = col)
    comp.string(col, col, method = "jarowinkler", label = col+"_jarowinkler")
    for algorithm in ("soundex", "nysiis", "metaphone"):
        encoded = col + "_" + algorithm
        comp.exact(encoded, encoded, label = encoded)
    return comp

def compare_birthdate(comp):
    """Register the birth date comparisons (date and numeric year) on ``comp``; returns ``comp``."""
    date_col, year_col = "birthdate", "birthyear"
    comp.date(date_col, date_col, label = "birthdate_date")
    # String similarity on the formatted date is currently disabled:
    #comp.string("birthdate_string", "birthdate_string", method = "jarowinkler", label = "birthdate_jarowinkler")
    comp.numeric(year_col, year_col, scale = 1, label = "birthdate_year")
    return comp

def compare_sex(comp):
    """Register an exact comparison on "sex" (missing values scored as NaN); returns ``comp``."""
    field = "sex"
    comp.exact(field, field, missing_value = np.nan, label = field)
    return comp

def compare_location(comp, col = "locname"):
    """Register the comparisons for a location column on ``comp``.

    Adds an exact match, a Jaro-Winkler similarity thresholded at 0.85 and
    exact matches on the ``<col>_soundex``, ``<col>_nysiis`` and
    ``<col>_metaphone`` columns.

    Args:
        comp: comparison object to extend.
        col: name of the location column. The body previously referred to a
            ``col`` that was neither a parameter nor a local; it is now an
            argument with a default, so existing one-argument calls still work.

    Returns:
        The same ``comp`` object.

    Note:
        The phonetic columns for ``col`` must exist in the compared frames;
        expand() in this notebook only creates them for the name columns.
    """
    comp.exact(col, col, label = col)
    comp.string(col, col, method = "jarowinkler", threshold = 0.85, label = col+"_jarowinkler")
    comp.exact(col+"_soundex", col+"_soundex", label = col+"_soundex")
    comp.exact(col+"_nysiis", col+"_nysiis", label = col+"_nysiis")
    comp.exact(col+"_metaphone", col+"_metaphone", label = col+"_metaphone")
    return comp

def compare(pairs, dfq, dft):
    """Compute the comparison vectors for ``pairs`` between ``dfq`` and ``dft``.

    Registers the first name, surname, birth date and sex comparisons (in
    that order) on a parallel ``rl.Compare`` and returns its result.
    """
    comp = rl.Compare(n_jobs = -1)
    for name_col in ("firstname", "surname"):
        comp = compare_name(comp, name_col)
    comp = compare_sex(compare_birthdate(comp))
    return comp.compute(pairs, dfq, dft)

# One row of comparison scores per candidate pair.
dfc = compare(pairs, dfq, dft)

CPU times: user 159 ms, sys: 212 ms, total: 370 ms
Wall time: 11.4 s


In [379]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

def binarize_comparisons(dfc, dfq, dft, expected_hit_ratio = 1):
    """Unimplemented placeholder: accepts its arguments, does nothing and returns None."""
    # TODO: DO IT WITH RANKS EXPECTED VALUES
    # TODO: BEWARE REPEATED VALUES.
    def binarize():
        return None
    return None
    
def preprocess_comparison(df):
    """Drop uninformative columns, impute gaps and rescale the comparison scores.

    Columns in which every row equals the first row are removed; remaining
    missing values are filled with the column's most frequent value and every
    column is min-max scaled.

    Returns:
        A new DataFrame with the surviving columns and the original index.
    """
    # Keep only columns that differ from the first row somewhere.
    varying = (df != df.iloc[0]).any()
    df = df.loc[:, varying]
    values = SimpleImputer(strategy = "most_frequent").fit_transform(np.array(df))
    values = MinMaxScaler().fit_transform(values)
    return pd.DataFrame(values, columns = df.columns, index = df.index)

In [395]:
# Rebinds dfc to its preprocessed version, so re-running this cell preprocesses the already preprocessed frame.
dfc = preprocess_comparison(dfc)

## Feature ensembles

In [396]:
def get_feature_dict(df):
    """Group the column names of ``df`` by the text before their first underscore.

    Returns:
        defaultdict(list) mapping each prefix to its columns, in column order.
    """
    grouped = collections.defaultdict(list)
    for column in df.columns:
        prefix = column.split("_")[0]
        grouped[prefix].append(column)
    return grouped

# Comparison columns grouped by attribute prefix (text before the first "_").
features = get_feature_dict(dfc)

In [397]:
# Ensemble size: number of feature subsets drawn by sample_features().
B = 10

def sample_features(df, B):
    """Draw ``B`` random feature subsets, one column per attribute group.

    The columns of ``df`` are grouped with get_feature_dict(); every subset
    holds one randomly chosen column from each group, in group order. Uses
    the global ``random`` state, so results vary unless it is seeded.

    Returns:
        List of ``B`` lists of column names.
    """
    groups = get_feature_dict(df)
    return [
        [random.choice(candidates) for candidates in groups.values()]
        for _ in range(0, B)
    ]

# Use the ensemble-size constant defined above instead of repeating its value.
ensemble = sample_features(dfc, B)

In [393]:
?SimpleImputer

In [409]:
## Probabilistic linkage

# Fits one ECM classifier on all comparison columns. This `ecm` object is not
# read by any later visible cell; ecm_probabilities() builds its own.
ecm = classifiers.ECMClassifier(binarize = 0.5, max_iter = 1000, atol = 1e-10)
ecm.fit(dfc)

In [417]:
# Lower bound on the linkage probability; 0.0 excludes no non-negative probability.
min_proba = 0.0

def raw_match(feature_vectors):
    """Return, per row, the sum of the comparison scores divided by the number of columns."""
    n_features = feature_vectors.shape[1]
    row_totals = feature_vectors.sum(axis=1)
    return row_totals / n_features

def ecm_probabilities(feature_vectors):
    """Fit a fresh ECM classifier on ``feature_vectors`` and return its ``prob`` output for them."""
    model = classifiers.ECMClassifier(binarize = 0.5, max_iter = 1000, atol = 1e-10)
    model.fit(feature_vectors)
    return model.prob(feature_vectors)

def ensemble_ecm_probabilities(feature_vectors, ensemble):
    """Median ECM probability per pair across the feature subsets in ``ensemble``.

    One ECM model is fitted per column subset; the per-model probabilities
    are placed side by side and reduced with a row-wise median.
    """
    per_member = [ecm_probabilities(feature_vectors[cols]) for cols in ensemble]
    return pd.concat(per_member, axis = 1).median(axis = 1)

def linkage(feature_vectors, dfq, dft, ensemble):
    """Score every candidate pair and return a readable result table.

    Args:
        feature_vectors: comparison scores indexed by a two-level
            (query index, target index) MultiIndex.
        dfq: query frame; its index labels are the first index level.
        dft: target frame; its index labels are the second index level.
        ensemble: list of column subsets for ensemble_ecm_probabilities().

    Returns:
        DataFrame with the pair indices, the raw score, the ensemble
        probability and the name/birth date of both records. Rows whose
        probability is below the module-level ``min_proba`` are dropped and
        the rest are sorted by query index, then by descending raw score.
    """
    raws  = raw_match(feature_vectors)
    probas = ensemble_ecm_probabilities(feature_vectors, ensemble)
    # Read the index *labels* and look the records up by label. The previous
    # `index.labels` attribute no longer exists in current pandas, and level
    # codes only coincide with row positions when every record has a pair.
    index_q = probas.index.get_level_values(0)
    index_t = probas.index.get_level_values(1)
    records_q = dfq.loc[index_q]
    records_t = dft.loc[index_t]
    dfl = pd.DataFrame(data = {"index.q": index_q.values,
                               "index.t": index_t.values,
                               "raw": raws.values,
                               "proba": probas.values,
                               "firstname.q": records_q["firstname"].values,
                               "surname.q": records_q["surname"].values,
                               "birthdate.q": records_q["birthdate"].values,
                               "firstname.t": records_t["firstname"].values,
                               "surname.t": records_t["surname"].values,
                               "birthdate.t": records_t["birthdate"].values})
    dfl = dfl[["index.q", "index.t", "raw", "proba", "firstname.q", "surname.q", "birthdate.q", "firstname.t", "surname.t", "birthdate.t"]]
    # The threshold defined alongside this function is `min_proba`; the
    # filter used to reference an undefined `min_prob`.
    dfl = dfl[dfl.proba >= min_proba]
    dfl = dfl.sort_values(by = ["index.q", "raw"], ascending = [True, False])
    dfl = dfl.reset_index(drop = True)
    return dfl

# Final linkage table built from the preprocessed comparison vectors.
dfl = linkage(dfc, dfq, dft, ensemble)

In [419]:
dfl.sort_values(by = "proba", ascending = False)

Unnamed: 0,index.q,index.t,raw,proba,firstname.q,surname.q,birthdate.q,firstname.t,surname.t,birthdate.t
290736,837,420,1.000000,9.970982e-01,abaham,mwale,2019-03-23,abaham,mwale,2019-03-23
247382,712,662,1.000000,9.970982e-01,charity,kayamba,1981-01-01,charity,kayamba,1981-01-01
344423,990,940,1.000000,9.970982e-01,emelda,ngulube,2018-12-04,emelda,ngulube,2018-12-04
344422,990,44,1.000000,9.970982e-01,emelda,ngulube,2018-12-04,emelda,ngulube,2018-12-04
287433,827,777,1.000000,9.970982e-01,miracle,musangu,2019-03-27,miracle,musangu,2019-03-27
220349,636,586,1.000000,9.970982e-01,maureen,mwanza,1990-06-28,maureen,mwanza,1990-06-28
254161,732,682,1.000000,9.970982e-01,shimonda,violet,2019-05-04,shimonda,violet,2019-05-04
123294,354,304,1.000000,9.970982e-01,gabriel,phiri,2018-08-05,gabriel,phiri,2018-08-05
153455,441,391,1.000000,9.970982e-01,patrick,mbewe,1974-01-01,patrick,mbewe,1974-01-01
50285,144,94,1.000000,9.970982e-01,stone,sakala,1955-08-21,stone,sakala,1955-08-21


In [415]:
# NOTE(review): scratch cell. It needs a top-level `probas` sequence, but the
# visible cells only bind `probas` inside functions, so this fails on a fresh kernel.
probas = pd.concat(probas, axis = 1)

In [416]:
probas.median(axis = 1)

0    1      5.935029e-21
     3      1.659606e-16
     7      5.935029e-21
     11     5.935029e-21
     15     5.935029e-21
     17     5.935029e-21
     19     5.935029e-21
     20     5.935029e-21
     26     3.428020e-15
     28     1.659606e-16
     30     5.935029e-21
     31     3.428020e-15
     35     1.659606e-16
     38     1.659606e-16
     46     5.935029e-21
     54     5.935029e-21
     60     5.935029e-21
     67     5.935029e-21
     68     3.428020e-15
     70     3.428020e-15
     71     1.659606e-16
     74     5.935029e-21
     78     5.935029e-21
     79     1.659606e-16
     81     5.935029e-21
     84     3.428020e-15
     85     3.428020e-15
     96     5.935029e-21
     100    5.935029e-21
     101    3.428020e-15
                ...     
989  877    7.117140e-16
     880    1.659606e-16
     884    7.117140e-16
     885    7.117140e-16
     896    5.935029e-21
     905    5.935029e-21
     911    5.935029e-21
     914    5.935029e-21
     916    5.935029e-21


In [330]:
# Cosine-distance threshold handed to generate_pool() by get_features_schema().
p = 0.8

import itertools
from collections import defaultdict
from scipy import spatial

def get_lowest(df, cols):
    """Return the pair of columns in ``cols`` with the largest cosine distance.

    Despite the name, the pair kept is the one whose cosine *distance* is
    greatest. Ties keep the earliest pair in ``itertools.combinations`` order.

    Args:
        df: frame holding the columns.
        cols: column names to compare pairwise.

    Returns:
        Tuple ``(f1, f2)`` of column names. When no pair has a positive
        distance (identical columns, or NaN distances) the first pair is
        returned instead of the non-column sentinels ``(0, 0)``, which made
        callers fail when removing them from their column list. ``(0, 0)`` is
        still returned when ``cols`` has fewer than two entries.
    """
    best_pair = None
    best_distance = 0
    for first, second in itertools.combinations(cols, 2):
        if best_pair is None:
            # Fallback so that a real pair is always returned.
            best_pair = (first, second)
        distance = spatial.distance.cosine(df[first].values, df[second].values)
        if distance > best_distance:
            best_distance = distance
            best_pair = (first, second)
    if best_pair is None:
        return 0, 0
    return best_pair

def generate_pool(df, p):
    """Select, per attribute group, a pool of mutually distant comparison columns.

    Columns are grouped by the text before their first underscore. A group
    with one column keeps it. Otherwise the pool starts with the pair
    returned by get_lowest(); each remaining column is added when its cosine
    distance to every column already in the pool exceeds ``p``.

    Args:
        df: comparison vectors, one column per comparison.
        p: cosine-distance threshold for admitting further columns.

    Returns:
        defaultdict(list) mapping group name to the selected column names.
    """
    features = defaultdict(list)
    features_names = set([x.split("_")[0] for x in df.columns])
    for feature in sorted(features_names):
        # Group by exact prefix. A substring test would also pull in columns
        # of other groups whose names merely contain this group's name.
        vf = [col for col in df.columns if col.split("_")[0] == feature]
        if len(vf) == 1:
            features[feature] = vf
            continue
        f1, f2 = get_lowest(df, vf)
        V = [f1, f2]
        features[feature].append(f1)
        features[feature].append(f2)
        vf.remove(f1)
        vf.remove(f2)
        while vf:
            f = vf.pop()
            distant = 0
            for item in V:
                num = spatial.distance.cosine(df[f].values, df[item].values)
                # NOTE(review): `f not in features` tests the column name
                # against the *group* names; confirm that is intended.
                if num > p and f not in features:
                    distant += 1
            if distant == len(V):
                V.append(f)
                features[feature].append(f)
    return features

def convert_to_features_schema(features):
    """Return every combination that takes one column from each feature group.

    Args:
        features: mapping of group name to a list of column names.

    Returns:
        List of tuples, the Cartesian product of the groups in mapping order.
    """
    groups = [columns for columns in dict(features).values()]
    return [combo for combo in itertools.product(*groups)]

def get_features_schema(df):
    """Build the list of feature schemas for the comparison vectors ``df``.

    The index of ``df`` is replaced by 0..n-1, a feature pool is generated
    with the module-level threshold ``p`` and expanded into all
    one-column-per-group combinations.

    Returns:
        List of tuples of column names.
    """
    # Work on the argument; the body used to read a global `feature_vectors`
    # and silently ignored `df`.
    dfv = df.reset_index(drop = True)
    features = generate_pool(dfv, p)
    features_schema = convert_to_features_schema(features)
    return features_schema

# NOTE(review): no visible cell assigns a top-level `feature_vectors`, so this
# depends on leftover kernel state -- presumably the comparison frame; confirm.
features_schema = get_features_schema(feature_vectors)

In [None]:
def calculate_weights(df, features, Xm, Xu, w):
    """Derive feature weights from the current match / non-match seeds.

    For every feature the total absolute deviation of the match seeds from 1
    plus that of the non-match seeds from 0 is computed. Features with zero
    deviation share the weight equally and all others get 0; if no feature
    has zero deviation, weights are proportional to the inverse deviation
    (rounded to 5 decimals).

    Args:
        df: comparison vectors.
        features: column names to weight, in the order of ``w``.
        Xm: index labels of the match seeds.
        Xu: index labels of the non-match seeds.
        w: current weights; only its shape is used.

    Returns:
        numpy array of new weights with the shape of ``w``.
    """
    # Index with lists (set indexers are rejected by current pandas) and
    # select the seed rows once rather than once per feature.
    matches = df.loc[list(Xm)]
    nonmatches = df.loc[list(Xu)]
    ls = []
    djs = []
    for index, f in enumerate(features):
        # Absolute deviations, as in the seed selection: signed sums let the
        # (negative) match term cancel the (positive) non-match term.
        match = (matches[f] - 1).abs().sum(axis = 0)
        notmatch = (nonmatches[f] - 0).abs().sum(axis = 0)
        dj = match + notmatch
        djs.append(dj)
        if dj == 0:
            ls.append(index)
    if len(ls) > 0:
        w = np.zeros(w.shape)
        w[ls] = 1 / len(ls)
    else :
        s = sum(1 / d for d in djs)
        djs = [round(1 / (dj*s), 5) for dj in djs]
        w = np.asarray(djs).reshape(w.shape)
    return w

def feature_weights(df, Mm, Mu, e, w):
    # Iteratively selects match seeds (Xm) and non-match seeds (Xu) from `df`
    # and re-estimates the feature weights until no weight changes by more
    # than `e`.
    #   df : comparison vectors, one row per candidate pair
    #   Mm : number of match seeds to collect
    #   Mu : number of non-match seeds to collect
    #   e  : convergence tolerance on the absolute weight change
    #   w  : initial feature weights
    # Returns the two sets of index labels (Xm, Xu) from the last selection.
    # NOTE(review): the fill loops cannot terminate when fewer than Mm (or Mu)
    # eligible rows exist.
    # NOTE(review): with `w` of shape (z, 1), `t` is 2-D; confirm that
    # `[t<=tm]` filters rows as intended.
    Xm=set()
    Xu=set()
    tm, tu = 0, 0
    while(len(Xm)<Mm):
        # Weighted distance of each not-yet-selected row from the all-ones vector.
        t=np.dot(abs(df[~df.index.isin(Xm)].values-1),w)
        Xm.update(df[~df.index.isin(Xm)][t<=tm].head(Mm-len(Xm)).index) ### fill the seed until we reach Mm without repeating
        # Relax the distance threshold for the next pass.
        tm+=0.05
    while(len(Xu)<Mu):
        # Weighted distance of each not-yet-selected row from the all-zeros vector.
        t=np.dot(abs(df[~df.index.isin(Xu)].values-0),w)
        inde=set(df[(~df.index.isin(Xu))][t<=tu].head(Mu-len(Xu)).index) ### make sure that no matching point is selected for not matching point
        Xu.update(inde-Xm)
        tu+=0.05
    wnew=calculate_weights(df,df.columns,Xm,Xu,w)
    # Repeat selection with the updated weights until they stabilise.
    while( np.array(abs(wnew-w)>e).any()):
        Xm=set()
        Xu=set()
        tm,tu=0,0
        w=wnew
        while(len(Xm)<Mm):
            t=np.dot(abs(df[~df.index.isin(Xm)].values-1),w)
            Xm.update(df[~df.index.isin(Xm)][t<=tm].head(Mm-len(Xm)).index)
            tm+=0.05
        while(len(Xu)<Mu):
            t=np.dot(abs(df[~df.index.isin(Xu)].values-0),w)
            inde=set(df[(~df.index.isin(Xu))][t<=tu].head(Mu-len(Xu)).index)
            Xu.update(inde-Xm)
            tu+=0.05
        wnew=calculate_weights(df,df.columns,Xm,Xu,w)
    return Xm,Xu

## Probabilities

In [185]:
dfl[dfl.proba > 0.5].sort_values(by = "proba")

Unnamed: 0,index.q,index.t,raw,proba,firstname.q,surname.q,birthdate.q,firstname.t,surname.t,birthdate.t
328293,947,901,0.6,0.504688,angel,chisha,2018-12-05,angela,simulungwe,2018-12-05
328291,947,49,0.6,0.504688,angel,chisha,2018-12-05,angela,simulungwe,2018-12-05
33844,99,106,0.6,0.504688,angela,simulungwe,2018-12-05,angel,chisha,2018-12-05
329858,951,106,0.6,0.504688,angela,simulungwe,2018-12-05,angel,chisha,2018-12-05
329859,951,897,0.6,0.504688,angela,simulungwe,2018-12-05,angel,chisha,2018-12-05
54817,156,901,0.6,0.504688,angel,chisha,2018-12-05,angela,simulungwe,2018-12-05
33845,99,897,0.6,0.504688,angela,simulungwe,2018-12-05,angel,chisha,2018-12-05
54815,156,49,0.6,0.504688,angel,chisha,2018-12-05,angela,simulungwe,2018-12-05
4779,15,773,0.6,0.638794,natasha,mungandu,2018-11-16,aron,mungandu,2018-11-16
78337,223,101,0.6,0.638794,maureen,tembo,1974-01-01,everisto,tembo,1974-01-01


In [153]:
clf = classifiers.ECMClassifier()

In [95]:
clf = rl.ECMClassifier(binarize=0.5, max_iter=100, atol = 1e-4)
clf.fit(feature_vectors)

In [187]:
feature_vectors

Unnamed: 0,Unnamed: 1,firstname,firstname_jarowinkler,firstname_soundex,firstname_nysiis,firstname_metaphone,surname,surname_jarowinkler,surname_soundex,surname_nysiis,surname_metaphone,birthdate,birthyear,sex
0,1,0,0.626852,0,0,0,0,0.666667,0,0,0,0.0,0.0,0
0,3,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
0,7,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
0,11,0,0.555556,0,0,0,0,0.508333,0,0,0,0.0,0.0,0
0,15,0,0.000000,0,0,0,0,0.441667,0,0,0,0.0,0.0,0
0,17,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
0,19,0,0.481481,0,0,0,0,0.430556,0,0,0,0.0,0.0,0
0,20,0,0.587302,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
0,26,0,0.569444,0,0,0,0,0.430556,0,0,0,0.0,0.0,1
0,28,0,0.351852,0,0,0,0,0.550000,0,0,0,0.0,0.0,1


In [55]:
sums = feature_vectors.sum(axis = 1)

In [73]:
clf = rl.ECMClassifier()

In [89]:
clf.fit(feature_vectors)

  feature_log_prob_ = np.log(safe_sparse_dot(g_freq.T, X_unique_bin))
  return umr_maximum(a, axis, None, out, keepdims)


In [90]:
?rl.ECMClassifier

In [295]:
# Cosine-distance threshold handed to generate_pool(); same value as the earlier `p` cell.
p = 0.8

import itertools
from collections import defaultdict
from scipy import spatial

def get_lowest(df, cols):
    """Return the pair of columns in ``cols`` with the largest cosine distance.

    Despite the name, the pair kept is the one whose cosine *distance* is
    greatest. Ties keep the earliest pair in ``itertools.combinations`` order.

    Args:
        df: frame holding the columns.
        cols: column names to compare pairwise.

    Returns:
        Tuple ``(f1, f2)`` of column names. When no pair has a positive
        distance (identical columns, or NaN distances) the first pair is
        returned instead of the non-column sentinels ``(0, 0)``, which made
        callers fail when removing them from their column list. ``(0, 0)`` is
        still returned when ``cols`` has fewer than two entries.
    """
    best_pair = None
    best_distance = 0
    for first, second in itertools.combinations(cols, 2):
        if best_pair is None:
            # Fallback so that a real pair is always returned.
            best_pair = (first, second)
        distance = spatial.distance.cosine(df[first].values, df[second].values)
        if distance > best_distance:
            best_distance = distance
            best_pair = (first, second)
    if best_pair is None:
        return 0, 0
    return best_pair

def generate_pool(df, p):
    """Select, per attribute group, a pool of mutually distant comparison columns.

    Columns are grouped by the text before their first underscore. A group
    with one column keeps it. Otherwise the pool starts with the pair
    returned by get_lowest(); each remaining column is added when its cosine
    distance to every column already in the pool exceeds ``p``.

    Args:
        df: comparison vectors, one column per comparison.
        p: cosine-distance threshold for admitting further columns.

    Returns:
        defaultdict(list) mapping group name to the selected column names.
    """
    features = defaultdict(list)
    features_names = set([x.split("_")[0] for x in df.columns])
    for feature in sorted(features_names):
        # Group by exact prefix. A substring test would also pull in columns
        # of other groups whose names merely contain this group's name.
        vf = [col for col in df.columns if col.split("_")[0] == feature]
        if len(vf) == 1:
            features[feature] = vf
            continue
        f1, f2 = get_lowest(df, vf)
        V = [f1, f2]
        features[feature].append(f1)
        features[feature].append(f2)
        vf.remove(f1)
        vf.remove(f2)
        while vf:
            f = vf.pop()
            distant = 0
            for item in V:
                num = spatial.distance.cosine(df[f].values, df[item].values)
                # NOTE(review): `f not in features` tests the column name
                # against the *group* names; confirm that is intended.
                if num > p and f not in features:
                    distant += 1
            if distant == len(V):
                V.append(f)
                features[feature].append(f)
    return features

def convert_to_features_schema(features):
    """Return every combination that takes one column from each feature group.

    Args:
        features: mapping of group name to a list of column names.

    Returns:
        List of tuples, the Cartesian product of the groups in mapping order.
    """
    groups = [columns for columns in dict(features).values()]
    return [combo for combo in itertools.product(*groups)]

def get_features_schema(df):
    """Build the list of feature schemas for the comparison vectors ``df``.

    The index of ``df`` is replaced by 0..n-1, a feature pool is generated
    with the module-level threshold ``p`` and expanded into all
    one-column-per-group combinations.

    Returns:
        List of tuples of column names.
    """
    # Work on the argument; the body used to read a global `feature_vectors`
    # and silently ignored `df`.
    dfv = df.reset_index(drop = True)
    features = generate_pool(dfv, p)
    features_schema = convert_to_features_schema(features)
    return features_schema

In [296]:
# Same three steps as get_features_schema(), run at top level so that `dfv`
# and `features` stay available to the cells below.
# NOTE(review): `feature_vectors` is not assigned by any visible cell.
dfv = feature_vectors.reset_index(drop = True)
features = generate_pool(dfv, p)
features_schema = convert_to_features_schema(features)

In [297]:
def calculate_weights(df, features, Xm, Xu, w):
    """Derive feature weights from the current match / non-match seeds.

    For every feature the total absolute deviation of the match seeds from 1
    plus that of the non-match seeds from 0 is computed. Features with zero
    deviation share the weight equally and all others get 0; if no feature
    has zero deviation, weights are proportional to the inverse deviation
    (rounded to 5 decimals).

    Args:
        df: comparison vectors.
        features: column names to weight, in the order of ``w``.
        Xm: index labels of the match seeds.
        Xu: index labels of the non-match seeds.
        w: current weights; only its shape is used.

    Returns:
        numpy array of new weights with the shape of ``w``.
    """
    # Index with lists (set indexers are rejected by current pandas) and
    # select the seed rows once rather than once per feature.
    matches = df.loc[list(Xm)]
    nonmatches = df.loc[list(Xu)]
    ls = []
    djs = []
    for index, f in enumerate(features):
        # Absolute deviations, as in the seed selection: signed sums let the
        # (negative) match term cancel the (positive) non-match term.
        match = (matches[f] - 1).abs().sum(axis = 0)
        notmatch = (nonmatches[f] - 0).abs().sum(axis = 0)
        dj = match + notmatch
        djs.append(dj)
        if dj == 0:
            ls.append(index)
    if len(ls) > 0:
        w = np.zeros(w.shape)
        w[ls] = 1 / len(ls)
    else :
        s = sum(1 / d for d in djs)
        djs = [round(1 / (dj*s), 5) for dj in djs]
        w = np.asarray(djs).reshape(w.shape)
    return w

def automatic_seed_selection(df, Mm, Mu, e, w):
    # Iteratively selects match seeds (Xm) and non-match seeds (Xu) from `df`
    # and re-estimates the feature weights until no weight changes by more
    # than `e`.
    #   df : comparison vectors, one row per candidate pair
    #   Mm : number of match seeds to collect
    #   Mu : number of non-match seeds to collect
    #   e  : convergence tolerance on the absolute weight change
    #   w  : initial feature weights
    # Returns the two sets of index labels (Xm, Xu) from the last selection.
    # NOTE(review): the fill loops cannot terminate when fewer than Mm (or Mu)
    # eligible rows exist.
    # NOTE(review): with `w` of shape (z, 1), `t` is 2-D; confirm that
    # `[t<=tm]` filters rows as intended.
    Xm=set()
    Xu=set()
    tm,tu=0,0
    while(len(Xm)<Mm):
        # Weighted distance of each not-yet-selected row from the all-ones vector.
        t=np.dot(abs(df[~df.index.isin(Xm)].values-1),w)
        Xm.update(df[~df.index.isin(Xm)][t<=tm].head(Mm-len(Xm)).index) ### fill the seed until we reach Mm without repeating
        # Relax the distance threshold for the next pass.
        tm+=0.05
    while(len(Xu)<Mu):
        # Weighted distance of each not-yet-selected row from the all-zeros vector.
        t=np.dot(abs(df[~df.index.isin(Xu)].values-0),w)
        inde=set(df[(~df.index.isin(Xu))][t<=tu].head(Mu-len(Xu)).index) ### make sure that no matching point is selected for not matching point
        Xu.update(inde-Xm)
        tu+=0.05
    wnew=calculate_weights(df,df.columns,Xm,Xu,w)
    # Repeat selection with the updated weights until they stabilise.
    while( np.array(abs(wnew-w)>e).any()):
        Xm=set()
        Xu=set()
        tm,tu=0,0
        w=wnew
        while(len(Xm)<Mm):
            t=np.dot(abs(df[~df.index.isin(Xm)].values-1),w)
            Xm.update(df[~df.index.isin(Xm)][t<=tm].head(Mm-len(Xm)).index)
            tm+=0.05
        while(len(Xu)<Mu):
            t=np.dot(abs(df[~df.index.isin(Xu)].values-0),w)
            inde=set(df[(~df.index.isin(Xu))][t<=tu].head(Mu-len(Xu)).index)
            Xu.update(inde-Xm)
            tu+=0.05
        wnew=calculate_weights(df,df.columns,Xm,Xu,w)
    return Xm,Xu

In [298]:
# Number of match seeds requested from automatic_seed_selection().
Mm = 200
# Number of non-match seeds requested from automatic_seed_selection().
Mu = 8000
# Convergence tolerance on the per-feature weight change.
e  = 0.5

def calculate_Xm_Xu(df, features):
    """Run the seed selection once per feature group.

    For each group in ``features`` the selection starts from uniform weights
    over the group's columns and uses the module-level ``Mm``, ``Mu`` and ``e``.

    Returns:
        (Xm, Xu): two lists with one seed set per group, in group order.
    """
    Xm, Xu = [], []
    for group_columns in features.values():
        n_columns = len(group_columns)
        uniform = np.full((n_columns, 1), 1. / n_columns)
        match_seeds, nonmatch_seeds = automatic_seed_selection(df[group_columns], Mm, Mu, e, uniform)
        Xm.append(match_seeds)
        Xu.append(nonmatch_seeds)
    return Xm, Xu

In [299]:
# One (match seeds, non-match seeds) pair per feature group in `features`.
Xm, Xu = calculate_Xm_Xu(dfv, features)

In [300]:
# Union of the match-seed sets of all groups; calculate_Q() uses it as its universe.
ALL = set().union(*Xm)

def calculate_Q(set0, set1, universe = None):
    """Q statistic of agreement between two seed sets.

    Q = (both * neither - only0 * only1) / (both * neither + only0 * only1),
    where the counts are the numbers of elements in both sets, in neither
    set (relative to ``universe``), only in ``set0`` and only in ``set1``.

    Args:
        set0, set1: the sets to compare.
        universe: all elements under consideration; defaults to the
            module-level ``ALL``.

    Returns:
        float Q value.

    Raises:
        ZeroDivisionError: when both products are zero.
    """
    if universe is None:
        universe = ALL
    both = set0.intersection(set1)
    neither = universe - (set0.union(set1))
    only0 = set0 - set1
    # Elements exclusive to set1; this used to repeat `set0 - set1`.
    only1 = set1 - set0
    agree = len(both) * len(neither)
    disagree = len(only0) * len(only1)
    return float(agree - disagree) / (agree + disagree)

In [301]:
# Q statistic for every unordered pair of per-group match-seed sets,
# stored as (Q, first group position, second group position).
Qs = []
for f in itertools.combinations(range(0,len(Xm)),2):
    Qs.append((calculate_Q(Xm[f[0]],Xm[f[1]]),f[0],f[1]))

In [323]:
from frameworks.SelfLearning import *
from sklearn.linear_model import LogisticRegression

# One self-learning logistic-regression model per feature schema;
# X_features[i] holds the column list that models[i] is trained on.
models = []
X_features = []
for i, schema in enumerate(features_schema):
    model = SelfLearningModel(LogisticRegression(solver = "lbfgs"))
    models.append(model)
    X_features += [list(schema)]

In [327]:
%%time
# Labels each model's column: 1 for its match seeds, 0 for its non-match
# seeds, -1 for everything else, then fits the model on its schema columns.
# NOTE(review): `models` has one entry per feature schema, while `Xm`/`Xu`
# have one entry per feature group, so `Xm[i]` runs out of range once `i`
# reaches the number of groups -- the IndexError recorded below.
for i, model in enumerate(models):
    X = dfv[X_features[i]].values
    name = 'proba_' + str(i)
    dfv[name] = -1
    dfv.loc[Xm[i], name] = 1
    dfv.loc[Xu[i], name] = 0
    y = dfv[name].values
    model.fit(X, y)
    # Disabled: replace the -1 placeholders with predicted probabilities.
    #dfv.loc[dfv[name] == -1, name] = model.predict_proba(dfv[dfv[name] == -1][X_features[i]].values)

IndexError: list index out of range

In [328]:
dfv

Unnamed: 0,firstname,firstname_jarowinkler,firstname_soundex,firstname_nysiis,firstname_metaphone,surname,surname_jarowinkler,surname_soundex,surname_nysiis,surname_metaphone,...,match_0,match_1,match_2,match_3,match_4,proba_0,proba_1,proba_2,proba_3,proba_4
0,0,0.626852,0,0,0,0,0.666667,0,0,0,...,0,0,0,0,-1,-1,0,-1,0,-1
1,0,0.000000,0,0,0,0,0.550000,0,0,0,...,0,0,0,0,-1,-1,0,0,0,-1
2,0,0.000000,0,0,0,0,0.550000,0,0,0,...,0,0,0,0,-1,-1,0,0,0,-1
3,0,0.555556,0,0,0,0,0.508333,0,0,0,...,0,0,0,0,-1,-1,0,-1,0,-1
4,0,0.000000,0,0,0,0,0.441667,0,0,0,...,0,0,0,0,-1,-1,0,0,0,-1
5,0,0.000000,0,0,0,0,0.550000,0,0,0,...,0,0,0,0,-1,-1,0,0,0,-1
6,0,0.481481,0,0,0,0,0.430556,0,0,0,...,0,0,0,0,-1,-1,0,-1,0,-1
7,0,0.587302,0,0,0,0,0.550000,0,0,0,...,0,0,0,0,-1,-1,0,-1,0,-1
8,0,0.569444,0,0,0,0,0.430556,0,0,0,...,0,1,0,0,-1,-1,1,-1,0,-1
9,0,0.351852,0,0,0,0,0.550000,0,0,0,...,0,1,0,0,-1,-1,1,-1,0,-1


In [308]:
dfv

Unnamed: 0,firstname,firstname_jarowinkler,firstname_soundex,firstname_nysiis,firstname_metaphone,surname,surname_jarowinkler,surname_soundex,surname_nysiis,surname_metaphone,birthdate_date,birthdate_year,sex
0,0,0.626852,0,0,0,0,0.666667,0,0,0,0.0,0.0,0
1,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
2,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
3,0,0.555556,0,0,0,0,0.508333,0,0,0,0.0,0.0,0
4,0,0.000000,0,0,0,0,0.441667,0,0,0,0.0,0.0,0
5,0,0.000000,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
6,0,0.481481,0,0,0,0,0.430556,0,0,0,0.0,0.0,0
7,0,0.587302,0,0,0,0,0.550000,0,0,0,0.0,0.0,0
8,0,0.569444,0,0,0,0,0.430556,0,0,0,0.0,0.0,1
9,0,0.351852,0,0,0,0,0.550000,0,0,0,0.0,0.0,1


In [292]:
features_schema

[('surname', 'sex', 'firstname', 'birthdate_date'),
 ('surname', 'sex', 'firstname', 'birthdate_year'),
 ('surname', 'sex', 'firstname_jarowinkler', 'birthdate_date'),
 ('surname', 'sex', 'firstname_jarowinkler', 'birthdate_year'),
 ('surname_jarowinkler', 'sex', 'firstname', 'birthdate_date'),
 ('surname_jarowinkler', 'sex', 'firstname', 'birthdate_year'),
 ('surname_jarowinkler', 'sex', 'firstname_jarowinkler', 'birthdate_date'),
 ('surname_jarowinkler', 'sex', 'firstname_jarowinkler', 'birthdate_year')]

In [None]:
feature