In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import fuzzy
import datetime
import re
import math
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
#df_in = pd.read_excel('FILE DESTINATION') or pd.read_csv('FILE DESTINATION')
#if reading in another file to be used for testing
#Can use other csv/excel files for training as well (replace cell below)

In [2]:
#change destination to whereever the provided Patient Matching Data is stored
df = pd.read_excel('/Users/christopherpan 1/Desktop/LAHacks/Patient Matching Data.xlsx',sep=",")

In [3]:
test = df.loc[:, ['GroupID', 'First Name', 'Last Name', 'Date of Birth', 'Sex', 
                  'Current Street 1', 'Current Street 2', 'Current Zip Code']]

In [4]:
arr = test.values

In [5]:
#chose to use first name, last name, DOB, sex, and address (st 1 + st 2 + zip code)
#converted names to soundex tokens
#stemmed and lemmatized address
soundex = fuzzy.Soundex(4)
stemmer = PorterStemmer()
for row in arr:
    if isinstance(row[2], str):
        row[1] = soundex(row[1].lower())
        row[2] = soundex(row[2].lower())
    else:
        row[1] = soundex(row[1].lower())
    row[1] = row[1].replace('%d','')
    if isinstance(row[3], datetime.datetime):
        row[3] = row[3].strftime("%m/%d/%Y")
    if isinstance(row[4], str):
        row[4] = row[4][0]
    else:
        row[4] = 'U'
    if isinstance(row[5], str):
        row[5] = word_tokenize(row[5])
        str_lst = list(row[5])
        for i in range(len(str_lst)):
            str_lst[i] = stemmer.stem(str_lst[i])
        row[5] = "".join(str_lst)
        if isinstance(row[6], str):
            row[6] = word_tokenize(row[6])
            str_lst = list(row[6])
            for i in range(len(str_lst)):
                str_lst[i] = stemmer.stem(str_lst[i])
            row[6] = "".join(str_lst)
            row[5] += row[6]
        if not math.isnan(row[7]):
            row[5] += str(int(float(row[7])))

In [6]:
arr_desired = np.delete(arr, [6, 7], 1)

In [7]:
#creates data to be used for training by comparing each person with others
def create(arr, groups, total):
    if len(groups) == 0:
        groups.append(compare(arr, arr))
        return
    add = []
    for lst in total:
        add.append(compare(arr, lst))
    groups.extend(add)

In [8]:
#edit distance
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

In [9]:
#returns array of comparisons
def compare(arr, person):
    first_name = levenshtein(arr[1], person[1])
    first_other = levenshtein(arr[1], str(person[2]))
    last_name = levenshtein(str(arr[2]), str(person[2]))
    last_other = levenshtein(str(arr[2]), str(person[1]))
    if first_other > first_name:
        first_name = first_other
        lsat_name = last_other
    bday = levenshtein(str(arr[3]), str(person[3]))
    gender = levenshtein(arr[4], person[4])
    address = levenshtein(str(arr[5]), str(person[5]))  
    same = 0 if arr[0] == person[0] else 1
    return [first_name, last_name, bday, gender, address, same]

In [10]:
groups = []
for row in arr_desired:
    create(row, groups, arr_desired)

In [11]:
np_groups = np.array(groups)

In [12]:
x = np_groups[:,:5]
y = np_groups[:,5]

In [13]:
#originally split into test and train split
#after verifying accuracy swapped to using entire dataset for training
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 420)
x_train, y_train = x, y

In [14]:
def match(arr, groups):
    if len(groups) == 0:
        groups[0] = [arr]
        return
    for key in groups:
        add = []
        for lst in groups[key]:
            add.append(compare(arr, lst)[:5])
        #originally used logistic regression instead of SVM
        #Use of SVM could have lead to overfitting 
        #which would occur less in logistic regression model 
        #at the cost of accuracy
        #log_n = logisticRegr.predict(np.array(add))
        log_n = clf.predict(np.array(add))
        if (log_n == 0).sum() > log_n.size/2:
            groups[key].append(arr)
            return
    groups[len(groups)] = [arr]

In [15]:
clf = SVC(gamma = 'auto')
clf.fit(x,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [16]:
svc_groups = {}
for row in arr_desired:
    match(row, svc_groups)

In [17]:
#printing grouping results (same #'s should be in a group)
seen = {}
for key in svc_groups:
    for pp in svc_groups[key]:
        print(pp[0], end =' ') 
    print()

1 1 1 1 1 
2 2 
3 3 3 3 3 3 
4 4 
5 5 5 5 
6 
7 7 7 
8 8 8 
9 9 
10 10 
11 11 11 11 
12 13 
14 
15 15 15 
16 
17 17 17 17 
18 18 
19 19 19 19 
20 20 20 20 20 
21 21 21 21 
22 
23 23 23 23 23 23 
24 24 
25 25 25 25 25 
26 26 26 26 
27 27 27 27 27 
28 28 28 
29 29 29 29 29 29 
30 30 
31 31 31 31 31 
32 32 32 
33 33 33 33 
34 34 
35 35 35 35 35 
36 
37 37 37 37 
38 38 38 
39 39 39 39 39 
40 40 40 
41 41 41 41 
42 42 42 
42 
43 43 43 43 43 43 
44 44 44 
45 45 45 45 
46 46 46 46 
47 47 47 47 
48 48 48 
49 49 49 49 
50 
51 51 
52 52 52 
53 53 53 53 
54 54 54 
55 55 55 55 
56 
57 57 57 
58 58 58 58 
59 59 59 59 
60 60 
61 
62 
63 
64 65 


In [18]:
col_add = []
for key in svc_groups:
    for person in svc_groups[key]:
        col_add.append(key)

In [19]:
col_add = np.array(col_add)
df.insert(1, 'Predicted GroupID', col_add)

In [20]:
#change destination
df.to_csv('/Users/christopherpan 1/Desktop/LAHacks/predicted_matches.csv', index = False, header = True)
#can use df_in (the test/other data)