In [1]:
import sent2vec
import fasttext
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import gensim
import os
import sys
import re
import time
import nltk
import openpyxl
import collections
import numpy as np
import pandas as pd
import pandas.io.formats.excel
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import label_binarize
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
%%time
# Load the pretrained sentence-embedding model used by every later cell.
model_path = 'BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model = sent2vec.Sent2vecModel()
try:
    model.load_model(model_path)
except Exception as e:
    # Keep the notebook running, but do not claim success after a failed load.
    print(e)
else:
    print('model successfully loaded')

model successfully loaded
CPU times: user 4.87 s, sys: 21.3 s, total: 26.1 s
Wall time: 24min 27s


In [33]:
def read_data(path):
    """Load the Excel workbook at ``path`` with ``pd.read_excel`` and return the frame."""
    return pd.read_excel(path)

In [34]:
def get_service_name(data):
    """Return the ``SERVICE_NAME`` column of ``data`` wrapped in a one-column DataFrame."""
    names = data['SERVICE_NAME']
    return pd.DataFrame(names)

In [35]:
def get_standards_path():
    """Return ``Standards/<entry>`` for every entry of the ``Standards`` directory.

    Entries come back in whatever order ``os.listdir`` yields them.
    """
    folder = "Standards"
    return [os.path.join(folder, entry) for entry in os.listdir(folder)]

In [36]:
def get_data_by_service_type(data, types):
    """Keep the rows whose ``SERVICE_TYPE`` is one of ``types``, re-indexed from 0."""
    wanted = data['SERVICE_TYPE'].isin(types)
    return data[wanted].reset_index(drop=True)

In [37]:
def read_standards(paths):
    """Read each Excel file in ``paths`` and return the de-duplicated frames, in the same order."""
    return [pd.read_excel(path).drop_duplicates() for path in paths]

In [38]:
def get_providers_path():
    """Return ``Providers/<entry>`` for every entry of the ``Providers`` directory.

    Entries come back in whatever order ``os.listdir`` yields them.
    """
    folder = "Providers"
    return [os.path.join(folder, entry) for entry in os.listdir(folder)]

In [39]:
def create_document(DF1,col_index):
    """Return a copy of ``DF1`` whose ``col_index``-th column holds normalised text.

    DF1 -- frame to preprocess; it is copied first, so the caller's frame is not modified.
    col_index -- integer position of the description column.

    The column is cast to ``str``, newlines are replaced by spaces, each
    whitespace-separated token is lower-cased, and tokens that are NLTK English
    stop words or members of ``uselessWords`` are dropped. The surviving tokens
    are re-joined with single spaces.

    NOTE(review): missing values presumably become the text 'nan' after the
    ``str`` cast -- confirm that this is intended.
    """
    stopWords = set(stopwords.words('english'))
    uselessWords = ['intended','for','use','helps','to','provide','all','detection','of']

    def _normalise(text):
        # Lower-case first, then filter: membership is tested on the lower-cased token.
        lowered = (token.lower() for token in text.split())
        return " ".join(
            token for token in lowered
            if token not in stopWords and token not in uselessWords
        )

    # Steps that were tried and are intentionally left disabled: replacing
    # non-alphanumerics with spaces, splitting camelCase, stripping digits,
    # expanding medical abbreviations, pruning very frequent / very rare words,
    # and Porter stemming.
    DF = DF1.copy()
    column = DF.columns[col_index]
    DF[column] = DF[column].astype(str).str.replace('\n',' ').apply(_normalise)
    return DF

In [40]:
def get_embeddings(data):
    """Embed ``data`` with the notebook-level ``model`` and return the vectors as a DataFrame."""
    vectors = model.embed_sentences(data)
    return pd.DataFrame(vectors)
    

In [41]:
def get_cos(sent1,sent2):
    """Cosine similarity of two vectors, computed as one minus SciPy's cosine distance."""
    return 1 - distance.cosine(sent1, sent2)

In [42]:
# def find_similar(sent1,emb_ref):
#     similarities = []
#     for i in range(len(emb_ref)):
#         similarities.append(get_cos(sent1.values,emb_ref.iloc[i].values))
#     #similarities = np.array(similarities)
#     maximum = max(similarities)
#     return similarities.index(maximum), maximum

In [43]:
def find_similar(tfidf_data,tfidf_ref, index, rank = 1):
    """Find the reference row at position ``rank`` in the similarity ranking of one query row.

    tfidf_data -- query vectors; only the row at position ``index`` is used.
    tfidf_ref -- reference vectors compared against that row.
    rank -- zero-based position in the descending similarity order
            (0 is the best match; the default of 1 selects the second best).

    Returns ``(reference_position, similarity)``.
    """
    query = tfidf_data[index:index+1]
    scores = cosine_similarity(query, tfidf_ref[:]).flatten()
    # argsort is ascending, so reverse it to rank from most to least similar
    descending = scores.argsort()[::-1]
    match = descending[rank]
    return match, scores[match]

In [44]:
# data = pd.read_excel('Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx')
# achi = pd.read_excel('Standards/ACHI.XLSX')
# service_name = get_service_name(data)
# data_processed = create_document(service_name,0)
# ACHI_processed = create_document(achi,3)
# emb_achi = get_embeddings(ACHI_processed.iloc[:,3])
# emb_data = get_embeddings(data_processed.iloc[:,0])

In [45]:
# find_similar(emb_data,emb_achi,0)

In [46]:
def get_similarity(data,tfdata,tfref):
    """Find the best reference match for every row of ``data``.

    data -- only its length is used; it sets how many query rows are processed.
    tfdata -- query vectors, forwarded to ``find_similar``.
    tfref -- reference vectors, forwarded to ``find_similar``.

    Returns a DataFrame with columns ``Serv`` (query position), ``cpt``
    (matched reference position), ``Sim`` (similarity) and ``Num`` (rank
    requested from ``find_similar``), labelled by the query position.

    The rows are collected in a list and the frame is built once, instead of
    calling ``DataFrame.append`` per row (removed in pandas 2.0 and quadratic).
    """
    columns = ['Serv','cpt','Sim','Num']
    rows = []
    labels = []
    for i in range(len(data)):
        # Only rank 0 (the single best match) is requested for each query row.
        for j in range(0,1):
            index, simlarity = find_similar(tfdata, tfref, i, j)
            rows.append([i, index, simlarity, j])
            labels.append(i)
    # object dtype stores each cell as given, so integer positions are not
    # upcast to float when a whole row is later read with .iloc[i].
    return pd.DataFrame(rows, columns=columns, index=labels, dtype=object)

In [47]:
#result = get_similarity(data,emb_data,emb_achi)

In [48]:
#result.head()

In [49]:
#result.Sim.value_counts()

In [50]:
def get_max_results(data,result_ACHI,result_LOINC,result_SFDA,result_DRS_ACHI,result_DRS_LOINC,result_DRS_SFDA,threshold=0.9):
    """Pick, per service row, which standard's match to keep.

    data -- provider rows; the value at column position 5 is compared with
            'Lab' and 'Other Medical Services' to choose the standard family
            (presumably the SERVICE_TYPE column -- verify against the input layout).
    result_* -- one similarity frame per standard, as produced by
            ``get_similarity``; column position 2 holds the similarity.
    threshold -- the ``DRS`` match is kept only when its similarity is strictly
            greater than this value (default 0.9, the previously hard-coded cut-off).

    Routing: 'Lab' -> LOINC, 'Other Medical Services' -> SFDA, anything else -> ACHI.
    Within the family, the ``DRS`` frame wins when it clears ``threshold``;
    otherwise the plain standard's match is used.

    Returns a DataFrame with the four result columns plus ``MAPPED_CODE``
    naming the chosen source, indexed 0..n-1.
    """
    rows = []
    for i in range(len(result_ACHI)):
        service_type = data.iloc[i,5]
        if service_type == 'Lab':
            preferred, fallback, tag = result_DRS_LOINC, result_LOINC, 'LOINC'
        elif service_type == 'Other Medical Services':
            preferred, fallback, tag = result_DRS_SFDA, result_SFDA, 'SFDA'
        else:
            preferred, fallback, tag = result_DRS_ACHI, result_ACHI, 'ACHI'
        # Purely positional read: indexing a labelled row with a bare integer
        # (``.iloc[i][2]``) depends on a deprecated pandas fallback.
        if float(preferred.iloc[i, 2]) > threshold:
            rows.append(preferred.iloc[i].to_list() + ['Drs_' + tag])
        else:
            rows.append(fallback.iloc[i].to_list() + [tag])
    # Build the frame once rather than enlarging it row by row.
    return pd.DataFrame(rows, columns=['Serv','cpt','Sim','Num','MAPPED_CODE'], index=range(len(rows)))

In [51]:
def do_map(data,result,ACHI,LOINC,SFDA,DRS_ACHI,DRS_LOINC,DRS_SFDA):
    """Attach the chosen code and description to every row of ``data`` (in place).

    data -- provider rows; gains ``MAPPED_CODE``, ``UNIFIED_CODE``,
            ``UNIFIED_CODE_DESCRIPTION`` and ``Similarity`` and is returned.
    result -- per-row selection: column position 1 is the row position inside
            the reference frame, column position 4 is the source label.
    ACHI, LOINC, SFDA, DRS_* -- reference frames the code and description are read from.

    Any source label that is not listed in the table below falls back to SFDA.
    """
    # source label -> (reference frame, code column position, description column position)
    lookup = {
        'ACHI': (ACHI, 0, 3),
        'LOINC': (LOINC, 0, 7),
        'Drs_LOINC': (DRS_LOINC, 1, 0),
        'Drs_ACHI': (DRS_ACHI, 1, 0),
        'Drs_SFDA': (DRS_SFDA, 1, 0),
    }
    sfda_default = (SFDA, 0, 3)

    data['MAPPED_CODE'] = result['MAPPED_CODE']
    for row in range(len(data)):
        reference, code_col, desc_col = lookup.get(result.iloc[row,4], sfda_default)
        ref_row = int(result.iloc[row,1])
        data.loc[row, 'UNIFIED_CODE'] = reference.iloc[ref_row, code_col]
        data.loc[row, 'UNIFIED_CODE_DESCRIPTION'] = reference.iloc[ref_row, desc_col]
    data['Similarity'] = result['Sim']
    return data

In [52]:
def filter_by_similarity(data, thresh):
    """Return ``data`` unchanged; ``thresh`` is currently ignored.

    The threshold rule is switched off. The disabled statement is kept below
    for reference: it would overwrite the mapping columns of rows whose
    ``Similarity`` is at or below ``thresh``.
    """
    #data.loc[data['Similarity'] <= thresh, ['UNIFIED_CODE','UNIFIED_CODE_DESCRIPTION','MAPPED_CODE']] = "Couldn't Be Mapped"
    return data

In [53]:
def swap_columns(data):
    """Return ``data`` re-indexed to its first five columns followed by the mapping columns.

    Columns of ``data`` that are in neither group are left out of the result.
    """
    leading = data.columns.to_list()[:5]
    trailing = ['UNIFIED_CODE_DESCRIPTION', 'UNIFIED_CODE', 'SERVICE_TYPE', 'MAPPED_CODE', 'Similarity']
    return data.reindex(columns=leading + trailing)

In [54]:
def files_mapping(data_path):
    """Map every service row of one provider workbook onto the reference standards.

    data_path -- path of the provider Excel file.

    Pipeline: read the workbook, keep the listed service types, read the six
    standards, preprocess the text columns, embed them, find the best match per
    standard, choose one match per row and attach the code and description.

    Returns the re-ordered DataFrame from ``swap_columns``, or an empty list
    when no row survives the service-type filter (callers test ``len(...) > 0``).
    """
    data = read_data(data_path)
    print("raeding data finished")
    data = get_data_by_service_type(data, ['Lab','Diagnosis Procedure','Dental','Diagnostic Procedures','Other Medical Services','Package Deal','Physiotherapy','Radiology'])
    print("filtering by service type finished")
    # NOTE(review): this positional unpack relies on the order in which
    # get_standards_path() (os.listdir) returns the six files; os.listdir does
    # not guarantee an order -- confirm, or select the files by name.
    DRs_ACHI,DRs_LOINC,ACHI,DRs_SFDA,SFDA,LOINC = read_standards(get_standards_path())
    print("reading standards finshed")
    service_name = get_service_name(data)
    #print(service_name)
    # The second argument is the position of the text column to preprocess in each frame.
    ACHI_processed = create_document(ACHI,3)
    print("achi preprocessed")
    LOINC_processed = create_document(LOINC,7)
    print("loinc preprocessed")
    SFDA_processed = create_document(SFDA,3)
    print("sfda preprocessed")
    DRs_ACHI_processed = create_document(DRs_ACHI,0)
    print("drs achi preprocessed")
    DRs_LOINC_processed = create_document(DRs_LOINC,0)
    print("drs loinc preprocessed")
    DRs_SFDA_processed = create_document(DRs_SFDA,0)
    print("drs sfda preprocessed")
    data_processed = create_document(service_name,0)
    #print(data_processed)
    print("data preprocessed")
    if len(data_processed) > 0:
        # Embed the provider text and each standard's text column (same positions as above).
        emb_data = get_embeddings(data_processed.iloc[:,0])
        print("data vectorized")
        emb_achi = get_embeddings(ACHI_processed.iloc[:,3])
        print("achi vectorized")
        emb_loinc = get_embeddings(LOINC_processed.iloc[:,7])
        print("loinc vectorized")
        emb_sfda = get_embeddings(SFDA_processed.iloc[:,3])
        print("sfda vectorized")
        emb_drs_achi = get_embeddings(DRs_ACHI_processed.iloc[:,0])
        print("drs achi vectorized")
        emb_drs_loinc = get_embeddings(DRs_LOINC_processed.iloc[:,0])
        print("drs loinc vectorized")
        emb_drs_sfda = get_embeddings(DRs_SFDA_processed.iloc[:,0])
        print("drs sfda vectorized")
        # Best match of every provider row against each standard.
        resultAll_ACHI = get_similarity(data,emb_data,emb_achi)
        print("similarity for achi done")
        resultAll_LOINC = get_similarity(data,emb_data,emb_loinc)
        print("similarity for loinc done")
        resultAll_SFDA = get_similarity(data,emb_data,emb_sfda)
        print("similarity for sfda done")
        resultAll_DRS_ACHI = get_similarity(data,emb_data,emb_drs_achi)
        print("similarity for drs achi done")
        resultAll_DRS_LOINC = get_similarity(data,emb_data,emb_drs_loinc)
        print("similarity for drs loinc done")
        resultAll_DRS_SFDA = get_similarity(data,emb_data,emb_drs_sfda)
        print("similarity for drs sfda done")
        AllResult = get_max_results(data,resultAll_ACHI,resultAll_LOINC,resultAll_SFDA,resultAll_DRS_ACHI,resultAll_DRS_LOINC,resultAll_DRS_SFDA)
        print("getting max results done")
        # do_map writes the new columns into `data` itself and returns that same frame.
        AllResult = do_map(data,AllResult,ACHI,LOINC,SFDA,DRs_ACHI,DRs_LOINC,DRs_SFDA)
        print("mapping done")
        #AllResult = filter_by_similarity(AllResult, 0.0)
        AllResult = swap_columns(AllResult)
    else:
        # No rows to map: return an empty list so len() is 0 for the caller.
        AllResult = []
        
    return AllResult

In [55]:
def export_excel(path, mapped=None):
    """Add the mapped results to the provider workbook and move it to ``mapped_embedding``.

    path -- provider workbook; opened in append mode so its existing sheets are kept.
    mapped -- frame to export. When omitted, the notebook-level ``result`` is
              used, which is what the original one-argument call relied on.

    Writes three sheets split by ``SERVICE_TYPE``: 'ACHI' (everything that is
    neither 'Lab' nor 'Other Medical Services'), 'LOINC' ('Lab') and 'SFDA'
    ('Other Medical Services'). Every sheet in the workbook is then turned
    into a styled table and its columns are sized from the header text.

    Side effects: adds three empty review columns to ``mapped`` in place,
    rewrites the workbook at ``path``, creates ``mapped_embedding`` if needed
    and renames the workbook into it with a " Mapped_embedding" suffix.
    """
    if mapped is None:
        mapped = result  # backward-compatible fallback to the notebook global

    # Class-level setting, so it also applies to every later export;
    # presumably it disables pandas' default header cell styling.
    pandas.io.formats.excel.ExcelFormatter.header_style = None

    # Empty columns; the names suggest they are filled in later during review.
    mapped["Y/N"] = np.nan
    mapped["Doctor's Recommended Code"] = np.nan
    mapped["Doctor's Recommended Code Description"] = np.nan

    # mode='a' makes pandas load the existing workbook itself; assigning
    # ``writer.book`` on a write-mode writer is not supported by newer pandas.
    # Leaving the ``with`` block saves and closes the workbook.
    with pd.ExcelWriter(path, engine='openpyxl', mode='a') as writer:
        mapped[(mapped.SERVICE_TYPE != 'Lab') & (mapped.SERVICE_TYPE != 'Other Medical Services')].to_excel(writer,sheet_name='ACHI',index=False)
        mapped[mapped.SERVICE_TYPE == 'Lab'].to_excel(writer,sheet_name='LOINC',index=False)
        mapped[mapped.SERVICE_TYPE == 'Other Medical Services'].to_excel(writer,sheet_name='SFDA',index=False)

        for i, sheetname in enumerate(writer.book.sheetnames):
            worksheet = writer.book[sheetname]
            mediumStyle = openpyxl.worksheet.table.TableStyleInfo(name='TableStyleMedium2',showRowStripes=True)
            table = openpyxl.worksheet.table.Table(ref=worksheet.dimensions,displayName="Table" + str(i),tableStyleInfo=mediumStyle)
            worksheet.add_table(table)

            # Width is derived from the header cell only (first cell of each column).
            for col in worksheet.columns:
                worksheet.column_dimensions[col[0].column_letter].width = (len(str(col[0].value)) + 2) * 1.3

    # The rename fails if the destination folder is missing, so make sure it exists.
    os.makedirs("mapped_embedding", exist_ok=True)
    stem, extension = os.path.splitext(os.path.basename(path))
    os.rename(path,os.path.join("mapped_embedding",stem+" Mapped_embedding"+extension))

In [56]:
%%time
# Map every workbook in the Providers folder and export each one that produced rows.
n = 1
for provider_path in get_providers_path():
    start_time = time.time()
    provider_name = os.path.basename(provider_path)
    print("Started mapping " + provider_name)
    # Must stay bound to the global name `result`: export_excel reads it from there.
    result = files_mapping(provider_path)
    if len(result) == 0:
        print("There is no mappings for this provider\n")
        continue
    print("Finished mapping " + provider_name)
    export_excel(provider_path)
    print("Exported " + provider_name + " to the 'mapped_embedding' folder")
    elapsed_time = time.time() - start_time
    print("Time: " + str(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) + "\n")

Started mapping Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx
raeding data finished
filtering by service type finished
reading standards finshed
achi preprocessed
loinc preprocessed
sfda preprocessed
drs achi preprocessed
drs loinc preprocessed
drs sfda preprocessed
data preprocessed
data vectorized
achi vectorized
loinc vectorized
sfda vectorized
drs achi vectorized
drs loinc vectorized
drs sfda vectorized
similarity for achi done
similarity for loinc done
similarity for sfda done
similarity for drs achi done
similarity for drs loinc done
similarity for drs sfda done
getting max results done
mapping done
Finished mapping Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx
Exported Abeer Supreme Medical Center 2 ## 6212001674 @ 1285.xlsx to the 'mapped_embedding' folder
Time: 00:12:14

CPU times: user 15min 59s, sys: 2min 55s, total: 18min 54s
Wall time: 12min 14s


In [28]:
#data = pd.read_excel('Providers/Shifa Jeddah Polyclinic ## 6212000304 @ 5903.xlsx')

In [29]:
#model.embed_sentences(pd.DataFrame(data.iloc[:4,4]))

In [30]:
#pd.DataFrame(data.iloc[:4,4])

In [31]:
#sentence_vector = model.embed_sentence(sentence)
#print(sentence_vector)
 

In [None]:
# sentence_vector1 = model.embed_sentence(preprocess_sentence('CKMB'))
# sentence_vector2 = model.embed_sentence(preprocess_sentence('creatine kinase.MB\creatine kinase.MB.total in serum or plasma by electrophoresis'))

#cosine_sim = 1 - distance.cosine(sentence_vector1, sentence_vector2)
# print('cosine similarity:', cosine_sim)

In [None]:
#find_similar(achi_emp.iloc[4],data_emp)

In [None]:
#model.embed_sentences(create_document(data,4)['SERVICE_NAME'])[0].max()

In [None]:
#model.embed_sentences(create_document(data,4)['SERVICE_NAME'])[0].reshape(-1,1).shape