In [1]:
import os
import re
import sys
import time
import nltk
import openpyxl
import collections
import numpy as np
import pandas as pd
import pandas.io.formats.excel
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import label_binarize
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_data(path):
    """Load one provider workbook into a DataFrame.

    path: location of the Excel file; handed unchanged to ``pd.read_excel``.
    Returns whatever ``pd.read_excel`` produces with its default options.
    """
    return pd.read_excel(path)

In [3]:
def get_service_name(data):
    """Return the 'SERVICE_NAME' column of ``data`` wrapped in its own DataFrame."""
    names_column = data['SERVICE_NAME']
    return pd.DataFrame(names_column)

In [4]:
def get_standards_path():
    """List the files in the "Standards" folder in a deterministic order.

    Returns a list of relative paths ("Standards/<file>") sorted
    case-insensitively by name. ``os.listdir`` makes no promise about
    ordering, while ``files_mapping`` unpacks the loaded workbooks
    positionally, so an explicit sort keeps each workbook bound to the
    variable it is meant for.

    Raises whatever ``os.listdir`` raises if the folder does not exist.
    """
    folder = "Standards"
    return sorted(
        (os.path.join(folder, name) for name in os.listdir(folder)),
        key=str.lower,
    )

In [5]:
def get_data_by_service_type(data, types):
    """Keep only the rows whose 'SERVICE_TYPE' is one of ``types``.

    data: DataFrame with a 'SERVICE_TYPE' column.
    types: collection of accepted service-type values.
    Returns a new DataFrame re-indexed from 0; the old index is discarded.
    """
    wanted = data['SERVICE_TYPE'].isin(types)
    return data[wanted].reset_index(drop=True)

In [6]:
def read_standards(paths):
    """Read each workbook in ``paths`` and drop its duplicated rows.

    Returns a list of DataFrames in the same order as ``paths``.
    """
    return [pd.read_excel(workbook).drop_duplicates() for workbook in paths]

In [7]:
def get_providers_path():
    """List every entry of the "Providers" folder as a relative path.

    Returns "Providers/<file>" strings in the order ``os.listdir`` yields them.
    """
    return [os.path.join("Providers", entry) for entry in os.listdir("Providers")]

In [8]:
def create_document(DF1,col_index):
    """Return a copy of ``DF1`` whose description column is normalised for TF-IDF.

    DF1: DataFrame to preprocess; it is copied, never modified.
    col_index: integer position of the description column.

    The chosen column is cast to ``str``, newlines become spaces, every token is
    lower-cased, and tokens found in NLTK's English stop-word list or in the
    hand-picked filler list below are dropped. Tokens are re-joined with single
    spaces.

    NOTE(review): because of the ``str`` cast, missing cells become the literal
    text 'nan' rather than an empty document - confirm this is intended.

    Earlier experiments (stripping non-alphanumerics, splitting camelCase,
    removing digits, expanding medical abbreviations, trimming very frequent or
    very rare words, Porter stemming) are not applied.
    """
    english_stop_words = set(stopwords.words('english'))
    filler_words = ['intended','for','use','helps','to','provide','all','detection','of']

    def _normalise(text):
        # Lower-case token by token, then filter both word lists in one pass.
        lowered = (token.lower() for token in text.replace('\n',' ').split())
        kept = [
            token for token in lowered
            if token not in english_stop_words and token not in filler_words
        ]
        return " ".join(kept)

    # Work on a copy: callers still need the untouched frame afterwards.
    cleaned = DF1.copy()
    target = cleaned.columns[col_index]
    cleaned[target] = cleaned[target].astype(str).apply(_normalise)
    return cleaned

In [9]:
def concat(service_name,standards):
    """Stack ``service_name`` on top of ``standards`` with ``pd.concat``.

    Index labels of both inputs are kept as they are (no re-indexing).
    """
    stacked = pd.concat([service_name, standards])
    return stacked

In [10]:
def create_vectorizer(df,service_name,standard):
    """Fit one TF-IDF vocabulary on ``df`` and project both text sets into it.

    df: iterable of documents used to fit the vocabulary (the callers pass the
        service names and the standard descriptions stacked together).
    service_name: documents to transform into the "data" matrix.
    standard: documents to transform into the "reference" matrix.

    Returns ``(tfref, tfdata)``: the transformed ``standard`` first, then the
    transformed ``service_name``.

    The vectorizer does no stop-word filtering of its own (``stop_words=None``),
    so the NLTK stop-word list that used to be loaded here was never used and
    has been dropped.
    """
    tfidf = TfidfVectorizer(
                analyzer = "word",
                tokenizer = None,
                preprocessor = None,
                stop_words = None,
                max_features = None,
                # \w+ instead of the default \w\w+ so one-character tokens are kept
                token_pattern = r"(?u)\b\w+\b")

    tfidf.fit(df)
    tfref = tfidf.transform(standard)
    tfdata = tfidf.transform(service_name)
    return tfref,tfdata

In [11]:
def find_similar(tfidf_data,tfidf_ref, index, rank = 0):
    """Find the reference row that is ``rank``-th most similar to one data row.

    tfidf_data: matrix whose row ``index`` is the query.
    tfidf_ref: matrix of candidate rows.
    index: row position of the query inside ``tfidf_data``.
    rank: 0 for the best match, 1 for the runner-up, and so on.

    Returns ``(row position in tfidf_ref, similarity score)``.

    ``linear_kernel`` is a plain dot product; it equals cosine similarity only
    for L2-normalised rows, which is TfidfVectorizer's default and is not
    overridden by ``create_vectorizer``.
    """
    query_row = tfidf_data[index:index+1]
    scores = linear_kernel(query_row, tfidf_ref).flatten()
    # argsort is ascending, so reverse it to walk from best to worst.
    best_first = scores.argsort()[::-1]
    match = best_first[rank]
    return match, scores[match]

In [31]:
# Scratch check: reference-row positions ordered from most to least similar to the first service row.
# NOTE(review): in the visible notebook, tfdata_ACHI / tfref_ACHI are only assigned in a later cell,
# so this cell would raise NameError on Restart & Run All - move it below that cell or delete it.
linear_kernel(tfdata_ACHI[0:0+1], tfref_ACHI[:]).flatten().argsort()[::-1]

array([3136, 3137, 7886, ..., 5264, 5265,    0])

In [12]:
def get_similarity(data,tfdata,tfref):
    """Find the best reference match for every row of ``data``.

    data: only its length is used; it sets how many rows of ``tfdata`` are queried.
    tfdata: TF-IDF matrix of the services (one row per row of ``data``).
    tfref: TF-IDF matrix of the reference descriptions.

    Returns a DataFrame with columns 'Serv' (row position of the service),
    'cpt' (row position of the matched reference), 'Sim' (similarity score) and
    'Num' (rank of the match, always 0 here). It is indexed by the service
    position and has object dtype.

    Rows are collected in a list and the frame is built once. The previous
    ``DataFrame.append`` inside the loop copied the whole result on every
    iteration and no longer exists in pandas 2.x.
    """
    columns = ['Serv','cpt','Sim','Num']
    labels = []
    rows = []
    for i in range(len(data)):
        # Only the top-ranked match (rank 0) is requested.
        for j in range(0,1):
            index, simlarity = find_similar(tfdata, tfref, i, j)
            labels.append(i)
            rows.append([i, index, simlarity, j])
    return pd.DataFrame(rows, index=labels, columns=columns, dtype=object)

In [13]:
def get_max_results(data,result_ACHI,result_LOINC,result_SFDA,result_DRS_ACHI,result_DRS_LOINC,result_DRS_SFDA,drs_threshold=0.2):
    """Pick, for every service, which match table supplies its final mapping.

    data: provider rows; the value at column position 5 is read as the service
        type (presumably 'SERVICE_TYPE' - confirm against the workbook layout).
    result_*: per-service match tables as produced by ``get_similarity``
        (columns 'Serv', 'cpt', 'Sim', 'Num'), one per standard.
    drs_threshold: a "DRS" match is preferred only when its similarity is
        strictly greater than this value (default 0.2, as before).

    Routing by service type:
      'Lab'                    -> Drs_LOINC, else LOINC
      'Other Medical Services' -> Drs_SFDA,  else SFDA
      anything else            -> Drs_ACHI,  else ACHI

    Returns a DataFrame with columns 'Serv', 'cpt', 'Sim', 'Num', 'MAPPED_CODE',
    one row per row of ``result_ACHI``, indexed 0..n-1, object dtype.

    Similarities are read with ``.iloc[i, 2]`` rather than ``.iloc[i][2]``,
    because integer keys on a label-indexed Series are deprecated positional
    access. Rows are accumulated in a list so the frame is built once.
    """
    # service type -> (DRS matches, DRS label, standard matches, standard label)
    by_service_type = {
        'Lab': (result_DRS_LOINC, 'Drs_LOINC', result_LOINC, 'LOINC'),
        'Other Medical Services': (result_DRS_SFDA, 'Drs_SFDA', result_SFDA, 'SFDA'),
    }
    default_route = (result_DRS_ACHI, 'Drs_ACHI', result_ACHI, 'ACHI')

    rows = []
    for i in range(len(result_ACHI)):
        drs, drs_label, standard, standard_label = by_service_type.get(data.iloc[i, 5], default_route)
        if float(drs.iloc[i, 2]) > drs_threshold:
            rows.append(drs.iloc[i].to_list() + [drs_label])
        else:
            rows.append(standard.iloc[i].to_list() + [standard_label])

    return pd.DataFrame(
        rows,
        index=range(len(rows)),
        columns=['Serv','cpt','Sim','Num','MAPPED_CODE'],
        dtype=object,
    )

In [14]:
def do_map(data,result,ACHI,LOINC,SFDA,DRS_ACHI,DRS_LOINC,DRS_SFDA):
    """Attach the chosen code, its description and the similarity to ``data``.

    data: provider rows, labelled 0..n-1; modified in place and also returned.
    result: one row per service; column position 1 holds the matched row
        position and column position 4 the source label. Columns
        'MAPPED_CODE' and 'Sim' are also read by name.
    ACHI, LOINC, SFDA, DRS_ACHI, DRS_LOINC, DRS_SFDA: the lookup tables.

    Adds the columns 'MAPPED_CODE', 'UNIFIED_CODE', 'UNIFIED_CODE_DESCRIPTION'
    and 'Similarity'. Any source label not listed below falls back to SFDA.
    """
    # source label -> (table, position of the code column, position of the description column)
    sources = {
        'ACHI': (ACHI, 0, 3),
        'LOINC': (LOINC, 0, 7),
        'Drs_LOINC': (DRS_LOINC, 1, 0),
        'Drs_ACHI': (DRS_ACHI, 1, 0),
        'Drs_SFDA': (DRS_SFDA, 1, 0),
    }
    sfda_fallback = (SFDA, 0, 3)

    data['MAPPED_CODE'] = result['MAPPED_CODE']
    for i in range(len(data)):
        table, code_pos, desc_pos = sources.get(result.iloc[i,4], sfda_fallback)
        matched_row = int(result.iloc[i,1])
        data.loc[i, 'UNIFIED_CODE'] = table.iloc[matched_row, code_pos]
        data.loc[i, 'UNIFIED_CODE_DESCRIPTION'] = table.iloc[matched_row, desc_pos]
    data['Similarity'] = result['Sim']
    return data

In [15]:
def filter_by_similarity(data, thresh):
    """Blank out mappings whose similarity does not exceed ``thresh``.

    Rows with 'Similarity' <= ``thresh`` get the text "Couldn't Be Mapped" in
    'UNIFIED_CODE', 'UNIFIED_CODE_DESCRIPTION' and 'MAPPED_CODE'.
    ``data`` is modified in place and returned.
    """
    too_weak = data['Similarity'] <= thresh
    overwritten_columns = ['UNIFIED_CODE','UNIFIED_CODE_DESCRIPTION','MAPPED_CODE']
    data.loc[too_weak, overwritten_columns] = "Couldn't Be Mapped"
    return data

In [16]:
def swap_columns(data):
    """Return ``data`` reduced to, and ordered as, the export column layout.

    The first five existing columns are kept in place, followed by
    'UNIFIED_CODE_DESCRIPTION', 'UNIFIED_CODE', 'SERVICE_TYPE', 'MAPPED_CODE'
    and 'Similarity'. Any other column is dropped by the reindex.
    """
    leading = data.columns.to_list()[:5]
    trailing = ['UNIFIED_CODE_DESCRIPTION', 'UNIFIED_CODE', 'SERVICE_TYPE', 'MAPPED_CODE', 'Similarity']
    return data.reindex(columns=leading + trailing)

In [17]:
def files_mapping(data_path):
    """Map every in-scope service of one provider workbook to a standard code.

    data_path: path of the provider Excel file.

    Steps: read the workbook, keep the supported service types, score every
    service name against each of the six standards with TF-IDF similarity,
    pick one mapping per service, attach code/description/similarity, mark
    zero-similarity rows as unmapped, and reorder the columns for export.

    Returns the resulting DataFrame, or an empty list when the workbook has no
    rows of a supported service type (callers only test ``len(result)``).

    The empty check now happens before the standards are loaded, so providers
    with nothing to map no longer pay for reading and preprocessing six
    workbooks.
    """
    supported_types = ['Lab','Diagnosis Procedure','Dental','Diagnostic Procedures','Other Medical Services','Package Deal','Physiotherapy','Radiology']
    data = get_data_by_service_type(read_data(data_path), supported_types)
    if len(data) == 0:
        return []

    # NOTE(review): this positional unpack only binds the right workbook to each
    # name if the six paths arrive in alphabetical file-name order - confirm.
    ACHI,DRs_ACHI,DRs_LOINC,DRs_SFDA, LOINC, SFDA = read_standards(get_standards_path())

    services = create_document(get_service_name(data),0)['SERVICE_NAME']

    # key -> (standard table, position of its description column, name of that column)
    standards = {
        'ACHI': (ACHI, 3, 'ascii_desc'),
        'LOINC': (LOINC, 7, 'LONG_COMMON_NAME'),
        'SFDA': (SFDA, 3, 'intended_purpose'),
        'DRS_ACHI': (DRs_ACHI, 0, 'Code Describtion'),
        'DRS_LOINC': (DRs_LOINC, 0, 'Code Describtion'),
        'DRS_SFDA': (DRs_SFDA, 0, 'Code Describtion'),
    }

    matches = {}
    for key, (table, desc_position, desc_name) in standards.items():
        descriptions = create_document(table, desc_position)[desc_name]
        # The vocabulary is fitted on services and descriptions together.
        tfref, tfdata = create_vectorizer(concat(services, descriptions), services, descriptions)
        matches[key] = get_similarity(data, tfdata, tfref)

    AllResult = get_max_results(
        data,
        matches['ACHI'], matches['LOINC'], matches['SFDA'],
        matches['DRS_ACHI'], matches['DRS_LOINC'], matches['DRS_SFDA'],
    )
    AllResult = do_map(data,AllResult,ACHI,LOINC,SFDA,DRs_ACHI,DRs_LOINC,DRs_SFDA)
    # Threshold 0.0: only rows with no similarity at all are marked unmapped.
    AllResult = filter_by_similarity(AllResult, 0.0)
    AllResult = swap_columns(AllResult)
    return AllResult

In [115]:
def export_excel(path):
    """Write the mapping into the provider workbook at ``path`` and move the file.

    Reads the module-level ``result`` DataFrame (it is not a parameter) and adds
    three empty review columns to it in place. Its rows are written to three
    sheets split on ``SERVICE_TYPE``: 'LOINC' for 'Lab', 'SFDA' for
    'Other Medical Services', and 'ACHI' for every other type. Every sheet of
    the workbook is then wrapped in a table and its columns are widened.
    Finally the file is renamed into the "Mapped Providers Learned" folder with
    " Mapped" appended to its base name.

    path: path of the provider workbook; it must already exist because it is
        opened with ``openpyxl.load_workbook``.
    """
    writer = pd.ExcelWriter(path,engine='openpyxl')
    # Attach the existing workbook so the new sheets are added next to its own.
    # NOTE(review): assigning writer.book depends on the pandas version - confirm
    # it is still supported by the installed release.
    writer.book = openpyxl.load_workbook(path)

    # Class-level override: affects every later to_excel call in this session.
    pandas.io.formats.excel.ExcelFormatter.header_style = None

    # Empty columns added to the shared `result` frame itself.
    result["Y/N"] = np.nan
    result["Doctor's Recommended Code"] = np.nan
    result["Doctor's Recommended Code Description"] = np.nan
    result[(result.SERVICE_TYPE != 'Lab') & (result.SERVICE_TYPE != 'Other Medical Services')].to_excel(writer,sheet_name='ACHI',index=False)
    result[result.SERVICE_TYPE == 'Lab'].to_excel(writer,sheet_name='LOINC',index=False)
    result[result.SERVICE_TYPE == 'Other Medical Services'].to_excel(writer,sheet_name='SFDA',index=False)

    # Loops over all sheets in the workbook, not only the three written above.
    for i, sheetname in enumerate(writer.book.sheetnames):
        worksheet = writer.book[sheetname]
        mediumStyle = openpyxl.worksheet.table.TableStyleInfo(name='TableStyleMedium2',showRowStripes=True)
        # The table spans the sheet's used range; names are Table0, Table1, ...
        table = openpyxl.worksheet.table.Table(ref=worksheet.dimensions,displayName="Table" + str(i),tableStyleInfo=mediumStyle)
        worksheet.add_table(table)

        for col in worksheet.columns:
            # Width comes from the first cell of the column only (col[0]), not from the data below it.
            worksheet.column_dimensions[col[0].column_letter].width = (len(str(col[0].value)) + 2) * 1.3

    writer.book.save(path)
    writer.book.close()
    writer.close()
    # NOTE(review): the destination folder is hard-coded and must already exist;
    # os.rename does not create it.
    os.rename(path,os.path.join("Mapped Providers Learned",os.path.splitext(os.path.basename(path))[0]+" Mapped"+os.path.splitext(path)[-1]))

In [116]:
%%time
# Driver: map every workbook in the Providers folder, export it, and report the time taken.
n = 1  # NOTE(review): not read anywhere in this cell - confirm whether a later cell needs it
for provider_path in get_providers_path():
    provider_file = os.path.basename(provider_path)
    start_time = time.time()
    print("Started mapping " + provider_file)
    # `result` must stay a module-level name: export_excel reads it as a global.
    result = files_mapping(provider_path)
    if len(result) == 0:
        print("There is no mappings for this provider\n")
        continue
    print("Finished mapping " + provider_file)
    export_excel(provider_path)
    # The message names the folder export_excel actually moves the file into.
    print("Exported " + provider_file + " to the 'Mapped Providers Learned' folder")
    elapsed_time = time.time() - start_time
    print("Time: " + str(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) + "\n")

Started mapping Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx
Finished mapping Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx
Exported Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx to the 'Mapped Providers 6' folder
Time: 00:01:56

Started mapping Adwaa Al-alami Medical Complex ## 6212000751 @ 8071.xlsx
Finished mapping Adwaa Al-alami Medical Complex ## 6212000751 @ 8071.xlsx
Exported Adwaa Al-alami Medical Complex ## 6212000751 @ 8071.xlsx to the 'Mapped Providers 6' folder
Time: 00:01:28

Started mapping Ahad General Hospital ## 6212001747 @ 9106.xlsx
Finished mapping Ahad General Hospital ## 6212001747 @ 9106.xlsx
Exported Ahad General Hospital ## 6212001747 @ 9106.xlsx to the 'Mapped Providers 6' folder
Time: 00:08:00

Started mapping Al Dawaa Medical Services Company ## 6212001835 @ 38857.xlsx
There is no mappings for this provider

Started mapping Al Falah Hospital ## 6212000272 @ 2442.xlsx
Finished mapping Al Falah Hospital ## 6212000272 @ 2442.xl

Finished mapping Obeid Specialized Hospital - Riyadh ## 6212000073 @ 4726.xlsx
Exported Obeid Specialized Hospital - Riyadh ## 6212000073 @ 4726.xlsx to the 'Mapped Providers 6' folder
Time: 00:04:59

Started mapping Rabiah Hospital ## 6212000557 @ 2703.xlsx
Finished mapping Rabiah Hospital ## 6212000557 @ 2703.xlsx
Exported Rabiah Hospital ## 6212000557 @ 2703.xlsx to the 'Mapped Providers 6' folder
Time: 00:03:27

Started mapping Safa Makkah Polyclinic 1 ## 6212000106 @ 255.xlsx
Finished mapping Safa Makkah Polyclinic 1 ## 6212000106 @ 255.xlsx
Exported Safa Makkah Polyclinic 1 ## 6212000106 @ 255.xlsx to the 'Mapped Providers 6' folder
Time: 00:00:57

Started mapping Safa Makkah Polyclinic 2 ## 6212000086 @ 505.xlsx
Finished mapping Safa Makkah Polyclinic 2 ## 6212000086 @ 505.xlsx
Exported Safa Makkah Polyclinic 2 ## 6212000086 @ 505.xlsx to the 'Mapped Providers 6' folder
Time: 00:01:12

Started mapping Safa Medical Center - Dammam ## 6212000673 @ 543.xlsx
Finished mapping Safa Me

In [18]:
# Scratch setup: rebuild the ACHI TF-IDF matrices for a single provider workbook.
# Unlike read_standards, the ACHI sheet is read here without dropping duplicate rows.
data = pd.read_excel('Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx')
achi = pd.read_excel('Standards/ACHI.XLSX')
service_name = get_service_name(data)
data_processed = create_document(service_name,0)
ACHI_processed = create_document(achi,3)
service_docs = data_processed['SERVICE_NAME']
achi_docs = ACHI_processed['ascii_desc']
tfref_ACHI,tfdata_ACHI = create_vectorizer(concat(service_docs,achi_docs),service_docs,achi_docs)

In [21]:
# Reference matrix from create_vectorizer: one row per ACHI description, one column per vocabulary term.
tfref_ACHI.shape

(7887, 4437)

In [22]:
# Data matrix from create_vectorizer: one row per provider service name, in the same vocabulary as tfref_ACHI.
tfdata_ACHI.shape

(1285, 4437)