# MUSE CONSOLIDATION TO SITE 1 WORK

## Initiation

In [None]:
#Import necessary packages and create sqlite connection to database

import sqlite3 as sq
import pandas as pd
import numpy as np
import Levenshtein as lev
#from phonetics import soundex

cnx = sq.connect('test.db')
cur = cnx.cursor()

In [None]:
# List every table currently defined in the SQLite database
table_query = "SELECT name FROM sqlite_master WHERE type='table';"
cur.execute(table_query).fetchall()

## Crosswalk Creation/Validation

In [None]:
# Pull the stored Epic master list and the MUSE "no mapping" list from the local db
epicpatientlistfull = pd.read_sql_query("""SELECT * FROM masterpatientlist;""", cnx)
musepatientlist = pd.read_sql_query("""SELECT * FROM musenomapping;""", cnx)

# Keep only the first row seen for each new_mrn
epicpatientlist = epicpatientlistfull.drop_duplicates(subset='new_mrn', keep='first')

for label, frame in [("epicpatientlistfull.shape ", epicpatientlistfull),
                     ("epicpatientlist.shape ", epicpatientlist),
                     ("musepatientlist.shape ", musepatientlist)]:
    print(label, frame.shape)

In [None]:
epicpatientlistfull.info()

In [None]:
epicpatientlist.info()

In [None]:
epicpatientlist['uid'][(epicpatientlist['uid'].duplicated()) & (epicpatientlist['uid']!='MISSING')]

In [None]:
musepatientlist.info()

In [None]:
epicpatientlist.head()

In [None]:
musepatientlist.head()

In [None]:
# Matching on name/DOB/gender is only safe when that combination is unique in the Epic list,
# so every row that shares all four values with another row is removed (no copy is kept)
identity_columns = ['last_name', 'first_name', 'birth_date', 'gender']
is_ambiguous = epicpatientlist.duplicated(subset=identity_columns, keep=False)
epicpatientlist = epicpatientlist[~is_ambiguous]
epicpatientlist.shape

In [None]:
# Passes that match on fewer fields need patients whose first/last name pair is unique:
# collapse to one row per new_mrn, then remove every name pair that occurs more than once
one_row_per_mrn = epicpatientlist.drop_duplicates(subset='new_mrn', keep='first')
uniqueepicpatientlist = one_row_per_mrn.drop_duplicates(subset=['last_name', 'first_name'], keep=False)
uniqueepicpatientlist.shape

In [None]:
# Candidate birth year built by prefixing '18' to the text after the comma in the MUSE first name
musepatientlist['DOB_year_1800_from_first_name'] = '18' + musepatientlist['FirstName_Split_2']

In [None]:
# Alternate ID: the first three characters of PatientID swapped for '000'
id_tail = musepatientlist['PatientID'].str.slice(start=3)
musepatientlist['PatientID_replace888'] = '000' + id_tail

In [None]:
def _run_match_pass(name, epic, candidates, left_on, right_on, remaining_name,
                    pool=None, output_dir="C:\\Users\\username\\Desktop\\"):
    """Run one matching pass between an Epic list and the still-unmatched MUSE patients.

    Parameters
    ----------
    name : label written to the 'list' column, used as the CSV file name and in the printout.
    epic : Epic-side frame to merge from.
    candidates : MUSE patients that are still unmatched before this pass.
    left_on, right_on : merge keys on the Epic and MUSE side respectively.
    remaining_name : variable name of the returned remainder (only used in the printout).
    pool : optional subset of ``candidates`` to merge against; defaults to all of them.
    output_dir : folder prefix for the exported CSV.

    Returns
    -------
    (matched, remaining) : the merged rows, and ``candidates`` minus every PatientID
    that was matched in this pass.
    """
    if pool is None:
        pool = candidates
    matched = epic.merge(pool, left_on=left_on, right_on=right_on, sort=False)
    matched['list'] = name
    matched.to_csv(output_dir + name + ".csv")
    # The remainder is always taken from the full candidate list, not from the filtered pool
    remaining = candidates[~candidates['PatientID'].isin(matched['PatientID'])]
    print(name + ' shape = ', matched.shape)
    print(remaining_name + ' shape = ', remaining.shape)
    return matched, remaining


# Pass 1: Epic current MRN equals MUSE PatientID
currentmrnmatch, musepatientlist2 = _run_match_pass(
    'currentmrnmatch', epicpatientlist, musepatientlist,
    left_on='current_mrn', right_on='PatientID', remaining_name='musepatientlist2')

# Pass 2: Epic legacy MRN equals MUSE PatientID (uses the full, non-deduplicated Epic list)
legacymrnmatch, musepatientlist3 = _run_match_pass(
    'legacymrnmatch', epicpatientlistfull, musepatientlist2,
    left_on='legacy_mrn', right_on='PatientID', remaining_name='musepatientlist3')

# Pass 3: Epic uid equals MUSE PatientID
uidmatch, musepatientlist4 = _run_match_pass(
    'uidmatch', epicpatientlist, musepatientlist3,
    left_on='uid', right_on='PatientID', remaining_name='musepatientlist4')

# Pass 4: exact match on first name, last name, date of birth and gender
exact4match, musepatientlist5 = _run_match_pass(
    'exact4match', epicpatientlist, musepatientlist4,
    left_on=['first_name', 'last_name', 'birth_date', 'gender'],
    right_on=['FirstName_Split_1', 'LastName', 'dateofbirth_d', 'Gender'],
    remaining_name='musepatientlist5')

# Pass 5: name + gender + 19xx birth year taken from the MUSE first-name field
match4withsplitbirthyear, musepatientlist6 = _run_match_pass(
    'match4withsplitbirthyear', uniqueepicpatientlist, musepatientlist5,
    left_on=['first_name', 'last_name', 'gender', 'birthyear'],
    right_on=['FirstName_Split_1', 'LastName', 'Gender', 'DOB_year_from_first_name'],
    remaining_name='musepatientlist6',
    pool=musepatientlist5[musepatientlist5['FirstName_Split_2'].notnull()])

# Pass 6: MUSE rows with Gender == 9, matched on name + date of birth
match3missinggender, musepatientlist7 = _run_match_pass(
    'match3missinggender', epicpatientlist, musepatientlist6,
    left_on=['first_name', 'last_name', 'birth_date'],
    right_on=['FirstName_Split_1', 'LastName', 'dateofbirth_d'],
    remaining_name='musepatientlist7',
    pool=musepatientlist6[musepatientlist6['Gender']==9])

# Pass 7: MUSE rows with Gender == 9, matched on name + 19xx birth year from the first name
match3missinggenderyearsplit, musepatientlist8 = _run_match_pass(
    'match3missinggenderyearsplit', uniqueepicpatientlist, musepatientlist7,
    left_on=['first_name', 'last_name', 'birthyear'],
    right_on=['FirstName_Split_1', 'LastName', 'DOB_year_from_first_name'],
    remaining_name='musepatientlist8',
    pool=musepatientlist7[musepatientlist7['Gender']==9])

# Pass 8: MUSE rows carrying the 1700-01-01 placeholder DOB, matched on name + gender
match3missingdobunique, musepatientlist9 = _run_match_pass(
    'match3missingdobunique', uniqueepicpatientlist, musepatientlist8,
    left_on=['first_name', 'last_name', 'gender'],
    right_on=['FirstName_Split_1', 'LastName', 'Gender'],
    remaining_name='musepatientlist9',
    pool=musepatientlist8[musepatientlist8['dateofbirth_d'] == '1700-01-01 00:00:00'])

# Pass 9: name + gender + 18xx birth year taken from the MUSE first-name field
match4withsplitbirthyear1800, musepatientlist10 = _run_match_pass(
    'match4withsplitbirthyear1800', uniqueepicpatientlist, musepatientlist9,
    left_on=['first_name', 'last_name', 'gender', 'birthyear'],
    right_on=['FirstName_Split_1', 'LastName', 'Gender', 'DOB_year_1800_from_first_name'],
    remaining_name='musepatientlist10')

# Pass 10: Epic uid equals the MUSE PatientID with its first three characters replaced by '000'
uidmatch_replace888, musepatientlist11 = _run_match_pass(
    'uidmatch_replace888', epicpatientlist, musepatientlist10,
    left_on='uid', right_on='PatientID_replace888', remaining_name='musepatientlist11')

In [None]:
# Stack the output of every matching pass (in the order the passes ran) under a fresh 0..n-1 index
match_frames = [
    currentmrnmatch,
    legacymrnmatch,
    uidmatch,
    exact4match,
    match4withsplitbirthyear,
    match3missinggender,
    match3missinggenderyearsplit,
    match3missingdobunique,
    match4withsplitbirthyear1800,
    uidmatch_replace888,
]
new_mappings = pd.concat(match_frames, ignore_index=True)

In [None]:
new_mappings.shape

In [None]:
new_mappings['list'][new_mappings['PatientID'].duplicated()].value_counts()

In [None]:
new_mappings['PatientID'].duplicated().any()

In [None]:
new_mappings.head()

In [None]:
new_mappings.info()

In [None]:
new_mappings['legacy_pat_full_name'] = new_mappings['legacy_pat_last_name'] + ',' + new_mappings['legacy_pat_first_name']

In [None]:
# Columns kept for the deliverable, in output order.
# 'PatientID' is listed twice on purpose: it is selected once in the first position
# and once more at the start of the MUSE-side fields.
mapping_columns = [
    'PatientID', 'current_mrn', 'legacy_mrn', 'new_mrn', 'last_name', 'first_name',
    'birth_date', 'gender', 'legacy_pat_first_name', 'legacy_pat_last_name', 'legacy_pat_full_name',
    'PatientID', 'ExtraOrderData4', 'LastName', 'FirstName', 'dateofbirth_d', 'Gender', '#_of_Tests',
    'LastTestDate', 'PatientIDLength', 'Race', 'SiteName', 'list',
]
updated_new_mappings = new_mappings.loc[:, mapping_columns]

In [None]:
updated_new_mappings.info()

In [None]:
updated_new_mappings.columns

In [None]:
# The rename below is positional, so these labels must follow exactly the order in which
# the columns were selected from new_mappings: legacy first name, legacy LAST name, then
# legacy FULL name. (The previous list had the last two swapped, which labelled the
# last-name data as full name and vice versa.)
standard_columns = ['InputPatientID','CurrentPatientID','legacy_mrn','NewPatientID','INPUTLastName',
                    'INPUTFirstName','INPUTDOB','INPUTGender','legacy_pat_first_name','legacy_pat_last_name','legacy_pat_full_name',
                    'MUSEPatientID','ExtraOrderData4','MUSELastName','MUSEFirstName','dateofbirth_d','Gender','#_of_Tests',
                    'LastTestDate','PatientIDLength','Race','SiteName','List']
updated_new_mappings.columns = standard_columns

In [None]:
updated_new_mappings.info()

In [None]:
#Found one study where there was a space. Manually adding that mapping to the updated mapping list
#by cloning the row with index label 14485 and overriding the MUSE-side fields.
record_to_copy = updated_new_mappings.loc[14485,:]
new_record = record_to_copy.copy()
new_record['InputPatientID'] = ' 99'
new_record['MUSEPatientID'] = ' 99'
new_record['MUSELastName'] = '99'
new_record['LastTestDate'] = '99'
new_record['PatientIDLength'] = 12
new_record['List'] = 'manualintervention'
# DataFrame.append was removed in pandas 2.0; build a one-row frame from the Series and
# concatenate instead (infer_objects restores column dtypes lost by the object-typed transpose).
new_row = new_record.to_frame().T.infer_objects()
updated_new_mappings = pd.concat([updated_new_mappings, new_row], ignore_index=True)

In [None]:
updated_new_mappings.to_csv('C:\\Users\\username\\Desktop\\updated_new_mappings.csv')
print('New mapping shape: ',updated_new_mappings.shape)

In [None]:
# MUSE patients whose PatientID does not appear anywhere in the mapping table
already_mapped = musepatientlist['PatientID'].isin(updated_new_mappings['MUSEPatientID'])
updated_no_mappings = musepatientlist[~already_mapped]
print('No mapping shape: ',updated_no_mappings.shape)
updated_no_mappings.to_csv('C:\\Users\\username\\Desktop\\updated_no_mappings.csv')

In [None]:
# Reconciliation: mapped + unmapped rows compared with the original MUSE row count
original_count = musepatientlist.shape[0]
mapped_count = updated_new_mappings.shape[0]
unmapped_count = updated_no_mappings.shape[0]
print('Original rows: ', original_count)
print('New combined rows: ', mapped_count + unmapped_count)
print('New mappings: ', mapped_count)
print('Still needs mapping: ', unmapped_count)

In [None]:
updated_new_mappings.MUSEPatientID.duplicated().any()

In [None]:
updated_no_mappings.PatientID.duplicated().any()

## Master Patient List Cleaning

In [None]:
# Read the first 13 columns of the Epic extract, keeping the three MRN columns as strings.
# The separator is deliberately the two-character regex r'\,' (a raw string, so Python no
# longer warns about an invalid escape sequence); it still splits on every comma, and the
# resulting displaced fields are repaired in the cells that follow.
masterpatientlist = pd.read_csv('MUSE_MRN_Patient_List.txt',
                                sep=r'\,',lineterminator='\n',
                                usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12],
                                converters={'current_mrn':str, 'new_mrn':str, 'legacy_mrn':str},
                                engine = 'python'
)


In [None]:
# #Use if need to modify masterpatientlist stored to SQLite DB
# masterpatientlist = epicpatientlist

In [None]:
masterpatientlist.info()

In [None]:
masterpatientlist.head()

In [None]:
masterpatientlist['current_mrn'].str.isalpha().any()

### Fix Patients with Commas in Current First/Last Name Field

In [None]:
masterpatientlist.columns

In [None]:
# Name suffixes / credentials that may appear as a stray comma-separated field.
# (A missing comma previously fused 'JR.' and 'Sr' into the single entry 'JR.Sr'.)
commonsuffix = ['Jr','Jr.','JR','JR.','Sr','Sr.','SR','SR.','RN','MD','M.D.','MD.']

In [None]:
## Every statement below targets rows whose current_mrn is purely alphabetic
## (presumably the marker of a row displaced by an extra comma in a name field - confirm
## against the raw file). shift(-1, axis=1) moves each value in the selected column slice
## one column to the left: the value in the first selected column is discarded and the last
## column becomes NaN. The statements are order-dependent; do not reorder them.

##****Shifts Columns for patients that had a comma at end of their first name
masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['last_name'].isnull()),'last_name':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['last_name'].isnull()),'last_name':].shift(-1, axis =1) 

##****Shifts Columns for patients that had a comma at beginning of their first name
masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['first_name'].isnull()),'first_name':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['first_name'].isnull()),'first_name':].shift(-1, axis =1) 

##****Shifts Columns for patients that had a comma at end of their last name
masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['birth_date'].isnull()), 'birth_date':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['birth_date'].isnull()), 'birth_date':].shift(-1, axis=1)

##***Shifts Columns for patients that had a common suffix in their first name with a comma.
##*** Removes the suffix as would be expected in downstream systems
masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['last_name'].str.strip().isin(commonsuffix)),'last_name':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['last_name'].str.strip().isin(commonsuffix)),'last_name':].shift(-1,axis=1)

##***Shifts Columns for patients that had a common suffix in their last name with a comma.
##*** Removes the suffix as would be expected in downstream systems
masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['birth_date'].str.strip().isin(commonsuffix)),'birth_date':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()) & (masterpatientlist['birth_date'].str.strip().isin(commonsuffix)),'birth_date':].shift(-1,axis=1)

##***NEEDS TO BE DONE LAST. Shifts remaining patients that had comma in middle of first or last name
##*** May lose some information but manual checks would catch what is occurring easily
##*** NOTE: unlike the statements above, this one applies to EVERY row that is still flagged,
##*** with no additional condition, and overwrites whatever is in birth_date.
masterpatientlist.loc[masterpatientlist['current_mrn'].str.isalpha(),'birth_date':] =\
    masterpatientlist.loc[(masterpatientlist['current_mrn'].str.isalpha()),'birth_date':].shift(-1, axis = 1)

In [None]:
masterpatientlist['current_mrn'].str.isalpha().any()

### Fix issue due to Comma in Legacy Pat Full Name Field

In [None]:
masterpatientlist.columns

In [None]:
#The legacy full name ("Last, First") contains a comma, so on import it was split across the
#full-name and last-name fields. The imported last-name field is therefore not needed.
masterpatientlist.drop(columns='legacy_pat_last_name', inplace=True)

In [None]:
#Rename columns to reflect how values were actually imported:
#the column labelled legacy_pat_full_name becomes legacy_pat_last_name
masterpatientlist.rename(columns={'legacy_pat_full_name':'legacy_pat_last_name'}, inplace=True)

In [None]:
masterpatientlist.columns

### Standardize Data to match MUSE Extract Data

In [None]:
#Drop rows (not columns) with a missing NM MRN (new_mrn) value as these are of no use/not valid
masterpatientlist.dropna(subset=['new_mrn'],inplace=True)

In [None]:
#Fill in NA with Default Values
#The result is assigned back to the column: calling fillna(..., inplace=True) on a column
#selected with df[col] is deprecated chained assignment and does not update the frame when
#pandas Copy-on-Write is active.
epic_defaults = {
    'first_name': 'MISSING',
    'last_name': 'MISSING',
    'birth_date': '1889-01-01 00:00:00.00',
    'gender': 'U',
    'current_mrn': '99',
    'new_mrn': '99',
    'legacy_mrn': 'NA',
    'legacy_pat_last_name': 'NA',
    'legacy_pat_first_name': 'NA',
}
for column, default in epic_defaults.items():
    masterpatientlist[column] = masterpatientlist[column].fillna(default)

In [None]:
#Change capitalization to UPPERCASE for all name fields
for name_column in ('first_name', 'last_name', 'legacy_pat_last_name', 'legacy_pat_first_name'):
    masterpatientlist[name_column] = masterpatientlist[name_column].str.upper()

In [None]:
#Remove any extra whitespace in dataframe
#Only object (string) columns are touched: the .str accessor raises AttributeError on a
#numeric column. This mirrors the guard used when the MUSE list is stripped.
for col in list(masterpatientlist.columns):
    if masterpatientlist[col].dtype == 'O':
        masterpatientlist[col] = masterpatientlist[col].str.strip()

In [None]:
#Replace Gender with M=0, F=1, U = 99
#Assigned back to the column rather than using replace(..., inplace=True) on df['gender'],
#which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
masterpatientlist['gender'] = (
    masterpatientlist['gender']
    .replace('M', 0)
    .replace('F', 1)
    .replace('U', 99)
)

In [None]:
#Birth year = the text before the first '-' in birth_date; compared later against the
#birth year embedded in MUSE patient names
birth_date_parts = masterpatientlist['birth_date'].str.split('-', n=1)
masterpatientlist['birthyear'] = birth_date_parts.str.get(0)

In [None]:
masterpatientlist.info()

In [None]:
masterpatientlist.head()

In [None]:
masterpatientlist.tail()

In [None]:
masterpatientlist = epicpatientlistfull

In [None]:
masterpatientlist2 = epicpatientlist

In [None]:
masterpatientlist['uid'][(masterpatientlist['uid'].duplicated()) & (masterpatientlist['uid']!='MISSING')].shape

In [None]:
# A uid that repeats in masterpatientlist2 (ignoring the 'MISSING' placeholder) cannot
# identify a single patient, so every masterpatientlist row carrying such a uid is reset
repeated = masterpatientlist2['uid'].duplicated()
has_real_uid = masterpatientlist2['uid'] != 'MISSING'
ambiguous_uids = masterpatientlist2['uid'][repeated & has_real_uid]
masterpatientlist.loc[masterpatientlist['uid'].isin(ambiguous_uids), 'uid'] = 'MISSING'

In [None]:
masterpatientlist['birth_date'] = pd.to_datetime(masterpatientlist['birth_date'])

### Write Cleaned Patient List to DB for quick retrieval in future

In [None]:
masterpatientlist.to_sql("masterpatientlist",cnx,if_exists="replace", index=False)

## MUSE Lists Cleaning

In [None]:
#Temporary dataframe read directly from Excel, for the first load or after any change to the Excel document
#PatientID is forced to str by the converter (presumably to keep leading zeros/spaces intact - confirm)
nomapping = pd.read_excel (r'North_Centra_WestConsolidationTestResults_Feb_2019.xlsx', 
                           sheet_name='No Mapping',
                           converters={'PatientID':str}
                          )
nomapping.shape

In [None]:
nomapping = musepatientlist.copy()
nomapping.shape

In [None]:
nomapping.info()

In [None]:
nomapping.columns

In [None]:
nomapping.head()

### Standardize Data to match Master Patient List Data

In [None]:
#Trim leading/trailing whitespace from every object-typed (text) column
text_columns = [name for name in nomapping.columns if nomapping[name].dtype == 'O']
for name in text_columns:
    nomapping[name] = nomapping[name].str.strip()

In [None]:
#Changed Patient List instead to Reflect MUSE info Rename Gender to M and F M = 0, F = 1
#nomapping.Gender.replace(0,'M',inplace=True)
#nomapping.Gender.replace(1,'F',inplace=True)

In [None]:
# Normalise the placeholder DOB to the three-digit-millisecond spelling.
# Assigned back instead of replace(..., inplace=True) on a selected column, which is
# deprecated chained assignment and has no effect under pandas Copy-on-Write.
nomapping['dateofbirth_d'] = nomapping['dateofbirth_d'].replace('1700-01-01 00:00:00.00','1700-01-01 00:00:00.000')

In [None]:
#Fill in NA with default values. These deliberately differ from the Epic defaults.
#Results are assigned back to each column: fillna(..., inplace=True) on df[col] is deprecated
#chained assignment and does not update the frame under pandas Copy-on-Write.
muse_defaults = {
    'FirstName': 'NONE',
    'LastName': 'NONE',
    'dateofbirth_d': '1700-01-01 00:00:00.000',
    'Gender': '9',
    'PatientID': '88',
}
for column, default in muse_defaults.items():
    nomapping[column] = nomapping[column].fillna(default)

In [None]:
#Upper-case both MUSE name fields so comparisons are case-insensitive
for name_column in ('FirstName', 'LastName'):
    nomapping[name_column] = nomapping[name_column].str.upper()

In [None]:
#Split FirstName at its first comma: the part before is the usable first name, the part
#after is kept separately to capture patients that have a birth year in their name
first_name_parts = nomapping['FirstName'].str.partition(",")
nomapping['FirstName_Split_1'] = first_name_parts[0].str.strip()
nomapping['FirstName_Split_2'] = first_name_parts[2].str.strip()
nomapping['DOB_year_from_first_name'] = '19' + nomapping['FirstName_Split_2']

In [None]:
#Change Gender to int value (missing values were filled with the string '9' beforehand)
nomapping['Gender'] = nomapping['Gender'].astype(int)

In [None]:
nomapping.info()

In [None]:
nomapping.head()

In [None]:
nomapping.tail()

In [None]:
nomapping = musepatientlist

In [None]:
nomapping.info()

In [None]:
nomapping['dateofbirth_d'] = pd.to_datetime(nomapping['dateofbirth_d'])

### Write Cleaned Patient List to DB for quick retrieval in future

In [None]:
nomapping.to_sql("musenomapping",cnx,if_exists="replace", index=False)

## Alternative Epic List with uid Mapping

The Prosolv and CLSMA Xcelera migrations had crosswalks that include uid. All of these crosswalks are imported here to check against the MUSE no-mapping list, which appears to use a uid as the patient ID for a number of patients.

### CLSMA Mapping

In [None]:
# Load the CLSMA crosswalk, keeping the identifier columns as strings.
# The keyword is `header` (row 0 holds the column names); `headers` is not a read_excel
# parameter and raises TypeError on current pandas.
clsmapatientlist = pd.read_excel("clsmauid.xlsx",
                                dtype = {'uid':str, 'MRN':str, '_MRN':str},
                                index_col=None,
                                header=0)
clsmapatientlist.shape

In [None]:
clsmapatientlist.head()

In [None]:
clsmapatientlist.info()

In [None]:
clsmapatientlist.columns

In [None]:
# Bring the CLSMA uid into the '000' + digits form (dashes removed)
clsmapatientlist['uid'] = '000' + clsmapatientlist['uid'].str.replace('-','')
# PATIENT_NAME is split at its first comma: text before -> Last_Name, text after -> First_Name
clsma_name_parts = clsmapatientlist['PATIENT_NAME'].str.partition(',')
clsmapatientlist['First_Name'] = clsma_name_parts[2]
clsmapatientlist['Last_Name'] = clsma_name_parts[0]
# Assigned back instead of replace(..., inplace=True) on df['SEX'] (deprecated chained
# assignment; a no-op under pandas Copy-on-Write).
# NOTE(review): the codes here are the strings '1'/'0', while the Prosolv and Epic lists use
# the integers 1/0 - confirm whether that difference is intended.
clsmapatientlist['SEX'] = clsmapatientlist['SEX'].replace('Female','1').replace('Male','0')

### Prosolv Mappings

In [None]:
# Current-MRN sheet of the Prosolv crosswalk; uid and MRN are read as strings.
# `header` (not `headers`) is the read_excel keyword for the row holding the column names.
prosolvmrn1 = pd.read_excel('Prosolv_Mapping.xlsx'
                            ,sheet_name = 'Current  MRN mapping'
                            ,header = 0
                            ,usecols = ['Clarity_uid','Clarity_MRN','Clarity_PAT_NAME','Clarity_DOB','Clarity_SEX']
                            ,dtype = {'Clarity_uid':str,'Clarity_MRN':str}
                            )
prosolvmrn1.shape

In [None]:
prosolvmrn1.head()

In [None]:
prosolvmrn1.info()

In [None]:
prosolvmrn1.columns

In [None]:
# Placeholder uids made of one repeated digit ('000-00-0000' ... '999-99-9999'); never a real patient
invaliduid = [digit * 3 + '-' + digit * 2 + '-' + digit * 4 for digit in '0123456789']

In [None]:
# Remove rows whose uid is a known placeholder or missing
prosolvmrn1.drop(index=prosolvmrn1[prosolvmrn1['Clarity_uid'].isin(invaliduid)].index.values, inplace=True)
prosolvmrn1.dropna(subset=['Clarity_uid'], inplace=True)
# Clarity_PAT_NAME is split at its first comma: before -> Last_Name, after -> First_Name
pat_name_parts = prosolvmrn1['Clarity_PAT_NAME'].str.partition(',')
prosolvmrn1['First_Name'] = pat_name_parts[2]
prosolvmrn1['Last_Name'] = pat_name_parts[0]
# M -> 0, F -> 1. Assigned back instead of replace(..., inplace=True) on a selected column,
# which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
prosolvmrn1['Clarity_SEX'] = prosolvmrn1['Clarity_SEX'].replace('M',0).replace('F',1)
# uid in the '000' + digits form (dashes removed)
prosolvmrn1['Clarity_uid'] = '000' + prosolvmrn1['Clarity_uid'].str.replace('-','')
# One row per MRN (first occurrence wins)
prosolvmrn1.drop_duplicates(subset='Clarity_MRN',inplace=True)

In [None]:
# Historical-MRN sheet of the Prosolv crosswalk; uid and MRN are read as strings.
# `header` (not `headers`) is the read_excel keyword for the row holding the column names.
prosolvmrn2 = pd.read_excel('Prosolv_Mapping.xlsx'
                            ,sheet_name = 'Historical  MRN Mapping'
                            ,header = 0
                            ,usecols = ['Clarity_uid','Clarity_MRN','Clarity_PAT_NAME','Clarity_DOB','Clarity_SEX']
                            ,dtype = {'Clarity_uid':str,'Clarity_MRN':str})
prosolvmrn2.shape

In [None]:
prosolvmrn2.head()

In [None]:
prosolvmrn2.info()

In [None]:
prosolvmrn2.columns

In [None]:
# Remove rows whose uid is a known placeholder or missing
prosolvmrn2.drop(index=prosolvmrn2[prosolvmrn2['Clarity_uid'].isin(invaliduid)].index.values, inplace=True)
prosolvmrn2.dropna(subset=['Clarity_uid'], inplace=True)
# Clarity_PAT_NAME is split at its first comma: before -> Last_Name, after -> First_Name
pat_name_parts = prosolvmrn2['Clarity_PAT_NAME'].str.partition(',')
prosolvmrn2['First_Name'] = pat_name_parts[2]
prosolvmrn2['Last_Name'] = pat_name_parts[0]
# M -> 0, F -> 1. Assigned back instead of replace(..., inplace=True) on a selected column,
# which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
prosolvmrn2['Clarity_SEX'] = prosolvmrn2['Clarity_SEX'].replace('M',0).replace('F',1)
# uid in the '000' + digits form (dashes removed)
prosolvmrn2['Clarity_uid'] = '000' + prosolvmrn2['Clarity_uid'].str.replace('-','')
# One row per MRN (first occurrence wins)
prosolvmrn2.drop_duplicates(subset='Clarity_MRN',inplace=True)

In [None]:
# Follow-up mapping sheet ('5.20.2019 Mapping'); uid and MRN are read as strings.
# `header` (not `headers`) is the read_excel keyword for the row holding the column names.
prosolvmrn3 = pd.read_excel('RemainingToBeMapped.xlsx'
                            ,sheet_name = '5.20.2019 Mapping'
                            ,header = 0
                            ,usecols = ['Clarity_uid','Clarity_MRN','Clarity_PAT_NAME','Clarity_DOB','Clarity_SEX']
                            ,dtype = {'Clarity_uid':str,'Clarity_MRN':str}
                            )
prosolvmrn3.shape

In [None]:
prosolvmrn3.head()

In [None]:
prosolvmrn3.info()

In [None]:
prosolvmrn3.columns

In [None]:
# Remove rows whose uid is a known placeholder or missing
prosolvmrn3.drop(index=prosolvmrn3[prosolvmrn3['Clarity_uid'].isin(invaliduid)].index.values, inplace=True)
prosolvmrn3.dropna(subset=['Clarity_uid'], inplace=True)
# Clarity_PAT_NAME is split at its first comma: before -> Last_Name, after -> First_Name
pat_name_parts = prosolvmrn3['Clarity_PAT_NAME'].str.partition(',')
prosolvmrn3['First_Name'] = pat_name_parts[2]
prosolvmrn3['Last_Name'] = pat_name_parts[0]
# M -> 0, F -> 1. Assigned back instead of replace(..., inplace=True) on a selected column,
# which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
prosolvmrn3['Clarity_SEX'] = prosolvmrn3['Clarity_SEX'].replace('M',0).replace('F',1)
# uid in the '000' + digits form (dashes removed)
prosolvmrn3['Clarity_uid'] = '000' + prosolvmrn3['Clarity_uid'].str.replace('-','')
# One row per MRN (first occurrence wins)
prosolvmrn3.drop_duplicates(subset='Clarity_MRN',inplace=True)

In [None]:
epicpatientlist.columns

In [None]:
prosolvmrn1.columns

In [None]:
clsmapatientlist.columns

In [None]:
epicpatientlist2 = epicpatientlist.copy()

In [None]:
# Left-join every uid crosswalk onto the Epic list. The order of these merges matters:
# merge suffixes are only applied to column names present on BOTH sides, and the final column
# names depend on that (assumes epicpatientlist2 starts without MRN/uid/Clarity_* columns).
# 1) CLSMA by new_mrn -> adds 'MRN', 'uid'
epicpatientlist2 = epicpatientlist2.merge(clsmapatientlist[['MRN','uid']], left_on='new_mrn', right_on='MRN', how='left', sort=False)
# 2) Prosolv sheet 1 by current_mrn -> adds 'Clarity_MRN', 'Clarity_uid' (no overlap yet, so no suffix)
epicpatientlist2 = epicpatientlist2.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = 'current_mrn', right_on='Clarity_MRN', how='left', sort=False)
# 3) Prosolv sheet 1 by legacy_mrn -> the overlap renames step 2's columns to *1_current and adds *1_legacy
epicpatientlist2 = epicpatientlist2.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = 'legacy_mrn', right_on='Clarity_MRN', how='left', sort=False, suffixes = ('1_current','1_legacy'))
# 4) Prosolv sheet 2 by current_mrn -> no unsuffixed Clarity_* columns remain on the left, so the
#    suffixes are unused here and the new columns arrive as plain 'Clarity_MRN', 'Clarity_uid'
epicpatientlist2 = epicpatientlist2.merge(prosolvmrn2[['Clarity_MRN','Clarity_uid']], left_on = 'current_mrn', right_on='Clarity_MRN', how='left', sort=False, suffixes = ('1_current','2_current'))
# 5) Prosolv sheet 2 by legacy_mrn -> the overlap renames step 4's columns to *2_current and adds *2_legacy
epicpatientlist2 = epicpatientlist2.merge(prosolvmrn2[['Clarity_MRN','Clarity_uid']], left_on = 'legacy_mrn', right_on='Clarity_MRN', how='left', sort=False, suffixes = ('2_current','2_legacy'))
# 6) Follow-up sheet by new_mrn -> adds plain 'Clarity_MRN', 'Clarity_uid' (these come from prosolvmrn3)
epicpatientlist2 = epicpatientlist2.merge(prosolvmrn3[['Clarity_MRN','Clarity_uid']], left_on = 'new_mrn', right_on='Clarity_MRN', how='left', sort=False)

In [None]:
epicpatientlist2.columns

In [None]:
# Start every patient at the 'MISSING' placeholder before the crosswalk values are filled in.
# NOTE(review): this overwrites any existing 'uid' column, including the one merged in from
# clsmapatientlist[['MRN','uid']], so the CLSMA uids are discarded here - confirm this is intended.
epicpatientlist2['uid'] = 'MISSING'

In [None]:
# Populate uid from the Prosolv crosswalk columns, highest priority first.
# The unsuffixed 'Clarity_uid' is applied wherever it has a value, regardless of the current uid.
has_primary_uid = epicpatientlist2['Clarity_uid'].notna()
epicpatientlist2.loc[has_primary_uid, 'uid'] = epicpatientlist2.loc[has_primary_uid, 'Clarity_uid']

# The remaining sources only fill rows that are still 'MISSING', in this order.
fallback_uid_columns = ['Clarity_uid1_current', 'Clarity_uid2_legacy',
                        'Clarity_uid1_legacy', 'Clarity_uid2_current']
for fallback_column in fallback_uid_columns:
    fillable = epicpatientlist2[fallback_column].notna() & (epicpatientlist2['uid'] == 'MISSING')
    epicpatientlist2.loc[fillable, 'uid'] = epicpatientlist2.loc[fillable, fallback_column]

# NOTE(review): the original cell ended with a statement that copied 'uid' onto itself for rows
# where uid == 'MISSING'. That is a no-op and has been removed. It presumably was meant to fall
# back to the CLSMA uid, but that column is overwritten with 'MISSING' before this cell runs,
# so there is nothing left to fall back to - confirm whether CLSMA uids should be used.

In [None]:
epicpatientlist2.loc[:,'uid':'Clarity_uid'][epicpatientlist2['uid'] != 'MISSING']

In [None]:
# Remove the helper columns added by the crosswalk merges.
# NOTE(review): 'uid' is in the drop list, so the uid values filled in above are not carried
# into epicpatientlist, although the note below this cell says the result includes uid.
# Confirm whether 'uid' should be dropped here.
epicpatientlist = epicpatientlist2.drop(columns=['MRN', 'uid',
       'Clarity_MRN1_current', 'Clarity_uid1_current', 'Clarity_MRN1_legacy',
       'Clarity_uid1_legacy', 'Clarity_MRN2_current', 'Clarity_uid2_current',
       'Clarity_MRN2_legacy', 'Clarity_uid2_legacy', 'Clarity_MRN',
       'Clarity_uid'])

In [None]:
epicpatientlist.shape

The resulting epicpatientlist is copied to masterpatientlist and written to the DB as the master copy, which includes uid.

# Testing Code Below - Stop Here

In [None]:
# Scratch copy of the Epic extract load. The separator is the two-character regex r'\,'
# written as a raw string, so Python no longer warns about an invalid escape sequence;
# the value passed to pandas is unchanged.
test = pd.read_csv('MUSE_MRN_Patient_List.txt',
                                sep=r'\,',lineterminator='\n',
                                usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12],
                                converters={'current_mrn':str, 'new_mrn':str, 'legacy_mrn':str},
                                engine = 'python' #nrows = 10000
)

In [None]:
test.info()

In [None]:
test['current_mrn'].value_counts()

In [None]:
test['new_mrn'].value_counts()

In [None]:
test.columns

In [None]:
test['legacy_mrn'].value_counts(dropna=False)

In [None]:
test['legacy_mrn'].str.isalpha().value_counts(dropna=False)

In [None]:
test['new_mrn'].isna().any()

In [None]:
test['new_mrn'].str.isalpha().value_counts(dropna=False)

In [None]:
test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].isnull())]

In [None]:
test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].str.strip().isin(commonsuffix)),'last_name':]

In [None]:
test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].str.strip().isin(commonsuffix)),'birth_date':]

In [None]:
test.loc[(test['current_mrn'].str.isalpha()) & (test['first_name'].isnull())]

In [None]:
# Name suffixes / credentials that may appear as a stray comma-separated field.
# (A missing comma previously fused 'JR.' and 'Sr' into the single entry 'JR.Sr'.)
commonsuffix = ['Jr','Jr.','JR','JR.','Sr','Sr.','SR','SR.','RN','MD','M.D.','MD.']

In [None]:
## Scratch version of the column-shift repairs, applied to the `test` frame.
## Every statement targets rows whose current_mrn is purely alphabetic; shift(-1, axis=1)
## moves each value in the selected column slice one column to the left (the first selected
## value is discarded, the last column becomes NaN). The statements are order-dependent.

##****Shifts Columns for patients that had a comma at end of their first name
test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].isnull()),'last_name':] =\
    test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].isnull()),'last_name':].shift(-1, axis =1) 

##****Shifts Columns for patients that had a comma at beginning of their first name
test.loc[(test['current_mrn'].str.isalpha()) & (test['first_name'].isnull()),'first_name':] =\
    test.loc[(test['current_mrn'].str.isalpha()) & (test['first_name'].isnull()),'first_name':].shift(-1, axis =1) 

##****Shifts Columns for patients that had a comma at end of their last name
test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].isnull()), 'birth_date':] =\
    test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].isnull()), 'birth_date':].shift(-1, axis=1)

##***Shifts Columns for patients that had a common suffix in their first name with a comma. Removes the suffix as would be
##***expected in downstream systems
test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].str.strip().isin(commonsuffix)),'last_name':] =\
    test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].str.strip().isin(commonsuffix)),'last_name':].shift(-1,axis=1)

##***Shifts Columns for patients that had a common suffix in their last name with a comma. Removes the suffix as would be
##***expected in downstream systems
test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].str.strip().isin(commonsuffix)),'birth_date':] =\
    test.loc[(test['current_mrn'].str.isalpha()) & (test['birth_date'].str.strip().isin(commonsuffix)),'birth_date':].shift(-1,axis=1)

##***NEEDS TO BE DONE LAST. Shifts remaining patients that had comma in middle of first or last name
##*** May lose some information but manual checks would catch what is occurring easily
test.loc[test['current_mrn'].str.isalpha(),'birth_date':] =\
    test.loc[(test['current_mrn'].str.isalpha()),'birth_date':].shift(-1, axis = 1)

In [None]:
test.loc[(test['current_mrn'].str.isalpha()) & (test['last_name'].isnull())] 

In [None]:
test.iloc[389067,:]

In [None]:
test['legacy_pat_full_name'].fillna('NA').str.isdigit().value_counts(dropna=False)

In [None]:
test.loc[test['legacy_pat_full_name'].fillna('NA').str.isdigit()]

In [None]:
test.loc[test['legacy_mrn'].notnull()]

In [None]:
test.drop(columns='legacy_pat_last_name', inplace=True)

In [None]:
test.columns

In [None]:
test.rename(columns={'legacy_pat_full_name':'legacy_pat_last_name'}, inplace=True)

In [None]:
test['first_name'].str.strip().str.partition(" ")[2].str.len()

In [None]:
test.loc[test['first_name'].str.strip().str.partition(" ")[2].str.len()> 1]

In [None]:
test.new_mrn.isnull().any()

In [None]:
test.new_mrn.isnull().sum()

In [None]:
test.loc[test.new_mrn.isnull()]

In [None]:
test.dropna(subset=['new_mrn'],inplace=True)

In [None]:
test.shape

In [None]:
test.columns

In [None]:
test.info()

In [None]:
# Swap the '1111-01-01' placeholder for the '1889-01-01' default birth date.
# Assigned back to the column: replace(..., inplace=True) on a column reached through
# attribute access is chained assignment and does not update the frame under Copy-on-Write.
test['birth_date'] = test['birth_date'].replace('1111-01-01 00:00:00.00', '1889-01-01 00:00:00.00')

In [None]:
# Fill missing values with the Epic-side defaults.
# Results are assigned back to each column: fillna(..., inplace=True) on df[col] is deprecated
# chained assignment and does not update the frame under pandas Copy-on-Write.
test_defaults = {
    'first_name': 'MISSING',
    'last_name': 'MISSING',
    'birth_date': '1889-01-01 00:00:00.00',
    'gender': 'U',
    'current_mrn': '99',
    'new_mrn': '99',
    'legacy_mrn': 'NA',
    'legacy_pat_last_name': 'NA',
    'legacy_pat_first_name': 'NA',
}
for column, default in test_defaults.items():
    test[column] = test[column].fillna(default)

In [None]:
test['birth_date'] = pd.to_datetime(test['birth_date'])

In [None]:
# Upper-case all four name fields
for name_column in ('first_name', 'last_name', 'legacy_pat_last_name', 'legacy_pat_first_name'):
    test[name_column] = test[name_column].str.upper()

In [None]:
test.info()

In [None]:
test.head()

In [None]:
test = epicpatientlist.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = 'current_mrn', right_on='Clarity_MRN', how='left')

In [None]:
epicpatientlist.shape

In [None]:
test.columns

In [None]:
test.shape

In [None]:
test[~test['Clarity_MRN'].isna()]

In [None]:
test = pd.concat([epicpatientlist,prosolvmrn1[['Clarity_MRN','Clarity_uid']]], ignore_index=True, sort=False, join='outer')

In [None]:
test['Clarity_uid'].isna().sum()

In [None]:
test['Clarity_uid'].isna().sum()

In [None]:
test = epicpatientlist.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = 'current_mrn', right_on='Clarity_MRN', how='left', sort=False)

In [None]:
test = test.merge(prosolvmrn2[['Clarity_MRN','Clarity_uid']], left_on = ['legacy_mrn'], right_on=['Clarity_MRN'], how='left')

In [None]:
test = test.merge(prosolvmrn2[['Clarity_MRN','Clarity_uid']], left_on = ['current_mrn','Clarity_uid'], right_on=['Clarity_MRN','Clarity_uid'], how='left')

In [None]:
test = epicpatientlist.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = 'current_mrn', right_on='Clarity_MRN', how='left')
test = test.merge(prosolvmrn1[['Clarity_MRN','Clarity_uid']], left_on = ['legacy_mrn','Clarity_uid'], right_on=['Clarity_MRN','Clarity_uid'], how='left')

In [None]:
test.shape

In [None]:
test.columns

In [None]:
test.head()

In [None]:
test[~test['Clarity_uid_y'].isna()]

In [None]:
test[test['Last_Name'] != test['last_name']]

In [None]:
test.tail()

In [None]:
musepatientlist.loc[musepatientlist['PatientID'].duplicated(),'PatientID']