# LASA (look-alike, sound-alike) drug-name recognition

## Sound-alike

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import distance
import nltk
from nltk.metrics.distance import edit_distance
from tqdm.notebook import tqdm

#### FDA DATASET

In [2]:
# Load the Drugs@FDA product table. The separator is a regular expression
# (one or more tabs), which is why the slower Python parsing engine is requested.
# NOTE(review): a run of tabs collapses into a single delimiter, so an empty
# field in the middle of a row would shift the remaining columns left --
# confirm against the raw file that this is intended.
df = pd.read_csv(
    "./drugsatfda20211116/Products.txt",
    sep='\t+',
    engine='python',
)
df

Unnamed: 0,ApplNo,ProductNo,Form,Strength,ReferenceDrug,DrugName,ActiveIngredient,ReferenceStandard
0,4,4,SOLUTION/DROPS;OPHTHALMIC,1%,0,PAREDRINE,HYDROXYAMPHETAMINE HYDROBROMIDE,0.0
1,159,1,TABLET;ORAL,500MG,0,SULFAPYRIDINE,SULFAPYRIDINE,0.0
2,552,1,INJECTABLE;INJECTION,"20,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
3,552,2,INJECTABLE;INJECTION,"40,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
4,552,3,INJECTABLE;INJECTION,"5,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
...,...,...,...,...,...,...,...,...
43206,761201,2,INJECTABLE;INJECTION,3ML(100UNITS/ML),0,SEMGLEE,INSULIN GLARGINE-YFGN,0.0
43207,761202,1,INJECTABLE;INJECTION,0.5MG(10MG/ML),0,BYOOVIZ,RANIBIZUMAB-NUNA,0.0
43208,761208,1,INJECTABLE;INJECTION,40MG,0,TIVDAK,TISOTUMAB VEDOTIN-TFTV,0.0
43209,761210,1,INJECTABLE;INJECTION,350MG/7ML(50MG/ML),0,RYBREVANT,AMIVANTAMAB-VMJW,0.0


In [3]:
# Pull out the brand/product name column; everything below works on this Series.
drugNames = df.loc[:, 'DrugName']
drugNames

0               PAREDRINE
1           SULFAPYRIDINE
2        LIQUAEMIN SODIUM
3        LIQUAEMIN SODIUM
4        LIQUAEMIN SODIUM
               ...       
43206             SEMGLEE
43207             BYOOVIZ
43208              TIVDAK
43209           RYBREVANT
43210            JEMPERLI
Name: DrugName, Length: 43211, dtype: object

In [12]:
# Keep every distinct, non-missing drug name exactly once.
drugNames = drugNames.drop_duplicates().dropna()

# Preview ten names drawn at random (with replacement, as positions into the
# de-duplicated Series). A seeded generator is used so the displayed sample is
# the same after every "Restart & Run All"; change the seed to see other names.
random_incides = np.random.default_rng(0).integers(0, len(drugNames), size=10).tolist()
drugNames.iloc[random_incides]

279                                              PHENURONE
2061                     DEXTROSE 40% IN PLASTIC CONTAINER
36780                                             CRESEMBA
7416                                              VALTURNA
7544                                               OZURDEX
15730                    DESOGESTREL AND ETHINYL ESTRADIOL
2586                                               MYCELEX
2638                        ISOLYTE S IN PLASTIC CONTAINER
3217     POTASSIUM CHLORIDE 30MEQ IN DEXTROSE 5% AND SO...
831                                              MEPROSPAN
Name: DrugName, dtype: object

In [13]:
# Materialise the names as a standalone NumPy array so they can be addressed by
# plain integer position (the Series index has gaps after de-duplication).
names = drugNames.to_numpy(copy=True)
names.shape[0]

7572

In [6]:
# Pairwise Levenshtein (edit) distance between drug names.
#
# The double loop is quadratic in the number of names, so only the first 3000
# names are compared; replace the cap with len(names) for a full run.
# The cap is clamped to len(names) so a shorter name list no longer raises
# IndexError.
n = min(3000, len(names))

# Symmetric n x n matrix with a zero diagonal (a name is at distance 0 from itself).
lev_dist = np.zeros((n, n))
for i in tqdm(range(n)):
    # Only the upper triangle is computed; each value is mirrored below the diagonal.
    for j in range(i + 1, n):
        dist = edit_distance(names[i], names[j])
        lev_dist[i, j] = lev_dist[j, i] = dist

  0%|          | 0/3000 [00:00<?, ?it/s]

In [6]:
import pickle

In [7]:
# Cached Levenshtein matrix for the FDA names, so the slow pairwise loop does
# not have to be repeated on every run.
file_path = 'lev_dist3000.pickle'

# To refresh the cache after recomputing the matrix, run once:
#     with open(file_path, "wb") as fh:
#         pickle.dump(lev_dist, fh)

# NOTE: unpickling can execute arbitrary code -- only load cache files that
# were produced locally by this notebook.
# The context manager closes the file handle, which the bare open() call leaked.
with open(file_path, "rb") as fh:
    lev_dist = pickle.load(fh)

In [8]:
# Turn edit distances into similarities: identical names (distance 0) map to 1,
# and the score decays towards 0 as the distance grows.
# TODO: try out other ways to translate distance to similarity.
lev_sim = np.divide(1.0, lev_dist + 1.0)

In [9]:
# Cluster the names with affinity propagation on the precomputed similarity
# matrix (affinity="precomputed" means fit() receives similarities, not features).
# random_state is pinned so that repeated runs yield the same clustering;
# without it the estimator's starting state is not fixed.
aff_prop = AffinityPropagation(
    affinity="precomputed",
    damping=0.96,
    random_state=0,
    verbose=True,
)
aff_prop.fit(lev_sim)
print(f'Found {len(aff_prop.cluster_centers_indices_)} clusters.')

Converged after 15 iterations.
Found 61 clusters.


In [10]:
# One line per cluster: the exemplar name in bold (ANSI escape codes), the
# member count, and the comma-separated member names.
for cluster_id, exemplar_idx in enumerate(aff_prop.cluster_centers_indices_):
    exemplar = names[exemplar_idx]
    # Positions whose assigned label equals this cluster id.
    member_positions = np.flatnonzero(aff_prop.labels_ == cluster_id)
    members = names[member_positions]

    print(f'{cluster_id + 1}. \033[1m{exemplar}\033[0m ({len(members)} members): {", ".join(members)}')


1. [1mCALCIUM GLUCEPTATE[0m (11 members): HEAVY SOLUTION NUPERCAINE, CALCIUM GLUCEPTATE, XYLOCAINE W/ EPINEPHRINE, QUINIDINE GLUCONATE, CALCIUM DISODIUM VERSENATE, XYLOCAINE 5% W/ GLUCOSE 7.5%, GALLIUM CITRATE GA 67, MPI STANNOUS DIPHOSPHONATE, TECHNESCAN GLUCEPTATE, CHLORHEXIDINE GLUCONATE, HUMALOG MIX 75/25 PEN
2. [1mWYAMINE SULFATE[0m (51 members): LIQUAEMIN SODIUM, LIQUAEMIN LOCK FLUSH, HISTAMINE PHOSPHATE, SULFADIAZINE SODIUM, AMINOHIPPURATE SODIUM, DIASONE SODIUM, SULFONAMIDES DUPLEX, PROTAMINE SULFATE, TRIPLE SULFAS, AZULFIDINE EN-TABS, SUS-PHRINE SULFITE FREE, WYAMINE SULFATE, PAMINE FORTE, CYSTOGRAFIN DILUTE, KENALOG IN ORABASE, URACIL MUSTARD, ORAGRAFIN CALCIUM, CHROMITOPE SODIUM, CITANEST FORTE, FREAMINE II 8.5%, FREAMINE III 8.5%, FREAMINE III 10%, FREAMINE HBC 6.9%, FLAVORED COLESTID, AMINOSYN 7% (PH6), MPI DMSA KIDNEY REAGENT, NORMINEST FE, NOVAMINE 11.4%, DRAXIMAGE MDP-10, PHENDIMETRAZINE TARTRATE, BETAMETHASONE VALERATE, BRETYLIUM TOSYLATE, MANGANESE SULFATE, ZINC S

#### NL Dataset

In [17]:
# Load the pipe-delimited metadata export of the NL dataset.
# Note that this rebinds `df`, replacing the FDA table loaded earlier.
df = pd.read_csv(
    './metadata.csv',
    sep='|',
    engine='python',
)
df

Unnamed: 0,REGISTRATIENUMMER,SOORT,PRODUCTNAAM,INSCHRIJVINGSDATUM,HANDELSVERGUNNINGHOUDER,AFLEVERSTATUS,FARMACEUTISCHEVORM,POTENTIE,PROCEDURENUMMER,TOEDIENINGSWEG,...,BIJSLUITER_WIJZIG_DATUM,ATC,WERKZAMESTOFFEN,HULPSTOFFEN,PRODUCTDETAIL_LINK,NIEUWS_LINKS,NIEUWS_LINK_DATUMS,REFERENTIE,SMPC_VORIGE_VERSIE,SMPC_VORIGE_VORIGE_VERSIE
0,29437//00027,RVG,"Sustanon ""250"", injectievloeistof",2003/05/06,Euro Registratie Collectief B.V.,Uitsluitend recept,Oplossing voor injectie,,,Parenteraal,...,22-FEB-21,G03BA03 - Testosterone,TESTOSTERON#TESTOSTERONDECANOAAT#TESTOSTERONFE...,ARACHISOLIE#BENZYLALCOHOL (E 1519)#STIKSTOF (H...,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Euro Registratie Collectief B.V., Sustanon ""25...",,
1,123693//00859,RVG,Puri-Nethol 50 mg tabletten,2018/10/01,Medcor Pharmaceuticals B.V.,Uitsluitend recept,Tablet,,,Oraal gebruik,...,27-MAY-21,L01BB02 - Mercaptopurine,MERCAPTOPURINE 1-WATER,LACTOSE 0-WATER#MAGNESIUMSTEARAAT (E 470b)#MAÏ...,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Medcor Pharmaceuticals B.V., Puri-Nethol 50 mg...",,
2,128716//01032,RVG,"Antabus dispergetten 400 mg, bruistabletten",2021/08/25,Eureco-Pharma B.V.,Uitsluitend recept,Bruistablet,,,Oraal gebruik,...,26-AUG-21,N07BB01 - Disulfiram,DISULFIRAM,"CELLULOSE, MICROKRISTALLIJN (E 460)#MAGNESIUMS...",https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Eureco-Pharma B.V., Antabus dispergetten 400 m...",,
3,124568//01363,RVG,"Lanoxin 250, tabletten 0,25 mg",2019/03/27,BModesto B.V.,Uitsluitend recept,Tablet,,,Oraal gebruik,...,27-MAR-20,C01AA05 - Digoxin,DIGOXINE,LACTOSE 1-WATER#MAGNESIUMSTEARAAT (E 470b)#MAÏ...,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"BModesto B.V., Lanoxin 250, tabletten 0,25 mg ...",,
4,114919//02011,RVG,"GlucaGen HypoKít 1 mg (1 IE), met poeder en op...",2014/02/11,Eureco-Pharma B.V.,Uitsluitend recept,Poeder en oplosmiddel voor oplossing voor inje...,,,Parenteraal,...,09-AUG-21,H04AA01 - Glucagon,GLUCAGONHYDROCHLORIDE,LACTOSE 1-WATER#NATRIUMHYDROXIDE (E 524)#WATER...,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Eureco-Pharma B.V., GlucaGen HypoKít 1 mg (1 I...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18972,119145,RVG,"Venlafaxine STADA 37,5 mg, harde capsules met ...",2017/10/26,Stada Arzneimittel AG,Uitsluitend recept,"Capsule met verlengde afgifte, hard",,NL/H/3669/001,Oraal gebruik,...,19-APR-21,N06AX16 - Venlafaxine,VENLAFAXINEHYDROCHLORIDE,"AMMONIA (E 527)#CELLULOSE, MICROKRISTALLIJN (E...",https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Stada Arzneimittel AG, Venlafaxine STADA 37,5 ...",https://www.geneesmiddeleninformatiebank.nl/do...,https://www.geneesmiddeleninformatiebank.nl/do...
18973,92122,RVH,Jaborandi,1999/12/10,VSM Geneesmiddelen B.V.,Uitsluitend apotheek of drogist,Globuli,,,Oraal gebruik,...,,,JABORANDI#JABORANDI#JABORANDI,,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"VSM Geneesmiddelen B.V., Jaborandi (92122),",,
18974,91285,RVH,Drosera rotundifolia,1998/11/27,VSM Geneesmiddelen B.V.,Uitsluitend apotheek of drogist,Tablet,"D4, D6, D12",,Oraal gebruik,...,,,DROSERA ROTUNDIFOLIA,,https://www.geneesmiddeleninformatiebank.nl/nl...,,,"VSM Geneesmiddelen B.V., Drosera rotundifolia ...",,
18975,118846,RVG,"Betahistine Sandoz 8 mg, tabletten",2018/05/28,Sandoz B.V.,Uitsluitend recept,Tablet,,NL/H/3705/001,Oraal gebruik,...,06-JUN-18,N07CA01 - Betahistine,BETAHISTINEDIHYDROCHLORIDE,"CELLULOSE, MICROKRISTALLIJN (E 460)#CITROENZUU...",https://www.geneesmiddeleninformatiebank.nl/nl...,,,"Sandoz B.V., Betahistine Sandoz 8 mg, tablette...",,


In [22]:
# Keep every distinct, non-missing product name exactly once.
drugNames = df['PRODUCTNAAM'].drop_duplicates().dropna()

# Preview ten names drawn at random (with replacement, as positions into the
# de-duplicated Series). A seeded generator is used so the displayed sample is
# the same after every "Restart & Run All"; change the seed to see other names.
random_incides = np.random.default_rng(0).integers(0, len(drugNames), size=10).tolist()
drugNames.iloc[random_incides]

16521                 Lacoala 200 mg filmomhulde tabletten
17610    Fentanyl Sandoz 800 microgram, tabletten voor ...
8561     Mirtazapine Aurobindo 15 mg orodispergeerbare ...
6027     CAPD/DPCA 17 met 1,5% glucose, oplossing voor ...
13472    Mesavancol 1200 mg maagsapresistent, tabletten...
5692                Aprepitant Accord 40 mg harde capsules
9719     Levocetirizine dihydrochloride Medcor 0,5 mg/m...
6124     Xylocaine 1% - Adrenaline, injectievloeistof 1...
8665                  Codeinefosfaat Teva 20 mg, tabletten
9966         Valsartan Mylan 320 mg, filmomhulde tabletten
Name: PRODUCTNAAM, dtype: object

In [23]:
# Standalone NumPy copy of the product names, addressable by integer position.
names = drugNames.to_numpy(copy=True)

In [29]:
# Pairwise Levenshtein (edit) distance between product names.
#
# The double loop is quadratic in the number of names, so only the first 3000
# names are compared; replace the cap with len(names) for a full run.
# The cap is clamped to len(names) so a shorter name list no longer raises
# IndexError.
n = min(3000, len(names))

# Symmetric n x n matrix with a zero diagonal (a name is at distance 0 from itself).
lev_dist = np.zeros((n, n))
for i in tqdm(range(n)):
    # Only the upper triangle is computed; each value is mirrored below the diagonal.
    for j in range(i + 1, n):
        dist = edit_distance(names[i], names[j])
        lev_dist[i, j] = lev_dist[j, i] = dist

  0%|          | 0/3000 [00:00<?, ?it/s]

In [25]:
import pickle

In [26]:
# Cached Levenshtein matrix for the NL product names, so the slow pairwise loop
# does not have to be repeated on every run.
file_path = 'lev_NLdist3000.pickle'

# To refresh the cache after recomputing the matrix, run once:
#     with open(file_path, "wb") as fh:
#         pickle.dump(lev_dist, fh)

# NOTE: unpickling can execute arbitrary code -- only load cache files that
# were produced locally by this notebook.
# The context manager closes the file handle, which the bare open() call leaked.
with open(file_path, "rb") as fh:
    lev_dist = pickle.load(fh)

In [27]:
# Turn edit distances into similarities: identical names (distance 0) map to 1,
# and the score decays towards 0 as the distance grows.
# TODO: try out other ways to translate distance to similarity.
lev_sim = np.divide(1.0, lev_dist + 1.0)

In [28]:
# Cluster the product names with affinity propagation on the precomputed
# similarity matrix (affinity="precomputed" means fit() receives similarities,
# not features).
# random_state is pinned so that repeated runs yield the same clustering;
# without it the estimator's starting state is not fixed.
aff_prop = AffinityPropagation(
    affinity="precomputed",
    damping=0.99,
    random_state=0,
    verbose=True,
)
aff_prop.fit(lev_sim)
print(f'Found {len(aff_prop.cluster_centers_indices_)} clusters.')

Converged after 15 iterations.
Found 12 clusters.


In [29]:
# One line per cluster: the exemplar name in bold (ANSI escape codes), the
# member count, and the comma-separated member names.
for cluster_id, exemplar_idx in enumerate(aff_prop.cluster_centers_indices_):
    exemplar = names[exemplar_idx]
    # Positions whose assigned label equals this cluster id.
    member_positions = np.flatnonzero(aff_prop.labels_ == cluster_id)
    members = names[member_positions]

    print(f'{cluster_id + 1}. \033[1m{exemplar}\033[0m ({len(members)} members): {", ".join(members)}')

1. [1mCitanest 3% - Octapressine DENTAL, injectievloeistof voor perineurale toediening 30 mg/ml + 0,54 mcg/ml[0m (2 members): Citanest 3% - Octapressine DENTAL, injectievloeistof voor perineurale toediening 30 mg/ml + 0,54 mcg/ml, Citanest 3% - Octapressine DENTAL, injectievloeistof voor perineurale toediening 30 mg/ml + 0,54 microgram/ml
2. [1mBeclometasondipropionaat/formoterolfumaraatdihydraat ERC 100/6 microgram/dosis, aërosol, oplossing[0m (4 members): Beclometasondipropionaat/formoterolfumaraatdihydraat ERC 100/6 microgram/dosis, aërosol, oplossing, Beclometasondipropionaat/ formoterolfumaraatdihydraat 100/6 microgram/dosis Medcor, aërosol, oplossing, Salmeterol/Fluticasonpropionaat 25 microgram/250 microgram/dosis Vincion, aërosol, suspensie, Otrivin Duo Xylometazolinehydrochloride & Ipratropiumbromide, 0,5/0,6 mg/ml, neusspray, oplossing
3. [1mArtane 2 mg, tabletten[0m (1224 members): Sustanon "250", injectievloeistof, Puri-Nethol 50 mg tabletten, Antabus dispergetten 400