# LASA (look-alike, sound-alike) drug-name recognition

## Sound-alike

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import distance
import nltk
from nltk.metrics.distance import edit_distance
from tqdm.notebook import tqdm

In [2]:
# Load the product table (presumably the Drugs@FDA export, judging by the folder
# name). Fields are split on runs of one or more tab characters; a regular
# expression separator like this requires the python parsing engine.
# NOTE(review): '\t+' merges consecutive tabs, so an empty field in the middle of
# a row would shift the following columns left -- confirm the file has none.
products_path = "./drugsatfda20211116/Products.txt"
df = pd.read_csv(products_path, sep='\t+', engine='python')
df

Unnamed: 0,ApplNo,ProductNo,Form,Strength,ReferenceDrug,DrugName,ActiveIngredient,ReferenceStandard
0,4,4,SOLUTION/DROPS;OPHTHALMIC,1%,0,PAREDRINE,HYDROXYAMPHETAMINE HYDROBROMIDE,0.0
1,159,1,TABLET;ORAL,500MG,0,SULFAPYRIDINE,SULFAPYRIDINE,0.0
2,552,1,INJECTABLE;INJECTION,"20,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
3,552,2,INJECTABLE;INJECTION,"40,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
4,552,3,INJECTABLE;INJECTION,"5,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
...,...,...,...,...,...,...,...,...
43206,761201,2,INJECTABLE;INJECTION,3ML(100UNITS/ML),0,SEMGLEE,INSULIN GLARGINE-YFGN,0.0
43207,761202,1,INJECTABLE;INJECTION,0.5MG(10MG/ML),0,BYOOVIZ,RANIBIZUMAB-NUNA,0.0
43208,761208,1,INJECTABLE;INJECTION,40MG,0,TIVDAK,TISOTUMAB VEDOTIN-TFTV,0.0
43209,761210,1,INJECTABLE;INJECTION,350MG/7ML(50MG/ML),0,RYBREVANT,AMIVANTAMAB-VMJW,0.0


In [3]:
# Keep only the drug-name column as a Series; the next cells work on it.
drugNames = df.loc[:, 'DrugName']
drugNames

0               PAREDRINE
1           SULFAPYRIDINE
2        LIQUAEMIN SODIUM
3        LIQUAEMIN SODIUM
4        LIQUAEMIN SODIUM
               ...       
43206             SEMGLEE
43207             BYOOVIZ
43208              TIVDAK
43209           RYBREVANT
43210            JEMPERLI
Name: DrugName, Length: 43211, dtype: object

In [4]:
# Reduce the column to one entry per distinct name, then drop the missing-value
# entry if one is present.
drugNames = drugNames.drop_duplicates().dropna()

# Preview ten names at random positions (drawn with replacement, as before).
# The generator is seeded so the preview is identical after Restart & Run All.
# The variable keeps its original (misspelt) name in case other cells refer to it.
random_incides = np.random.default_rng(0).integers(0, len(drugNames), size=10).tolist()
drugNames.iloc[random_incides]

5454                                                SONORX
2042     DIANEAL 137 W/ DEXTROSE 4.25% IN PLASTIC CONTA...
9018                                            OPHTHOCORT
29375                                            HERCEPTIN
42108                                              GEMMILY
39767                                              MEKTOVI
7466                                               VELETRI
4501     TRAVASOL 2.75% SULFITE FREE W/ ELECTROLYTES IN...
3570     POTASSIUM CHLORIDE 30MEQ IN DEXTROSE 5% AND SO...
10111                                     FULVICIN P/G 165
Name: DrugName, dtype: object

In [5]:
# Copy the de-duplicated names into a NumPy array so that later cells can index
# them by integer position and by arrays of positions.
names = drugNames.to_numpy(copy=True)
names.shape[0]

7572

In [6]:
# Pairwise Levenshtein (edit) distance between drug names.
# The double loop makes n * (n - 1) / 2 calls to edit_distance, so only the first
# 3000 names are compared here; set n = len(names) for the full matrix.
# The cap is clamped to len(names) so a shorter name list cannot raise IndexError.
n = min(3000, len(names))
lev_dist = np.zeros((n, n))
for i in tqdm(range(n)):
    # Only the upper triangle is computed; each value is mirrored into the lower
    # triangle, and the diagonal keeps its initial 0.
    for j in range(i + 1, n):
        dist = edit_distance(names[i], names[j])
        lev_dist[i, j] = lev_dist[j, i] = dist

  0%|          | 0/3000 [00:00<?, ?it/s]

In [6]:
import pickle

In [8]:
# Reuse the distance matrix cached on disk; if no cache exists yet, write the
# matrix computed in the Levenshtein cell so the next run can load it.
# SECURITY: pickle.load can execute arbitrary code -- only load a cache file that
# this notebook produced itself.
file_path = 'lev_dist3000.pickle'
try:
    # Context managers close the file handle even if (un)pickling fails.
    with open(file_path, "rb") as cache_file:
        lev_dist = pickle.load(cache_file)
except FileNotFoundError:
    with open(file_path, "wb") as cache_file:
        pickle.dump(lev_dist, cache_file)

In [9]:
# Turn distances into similarities with 1 / (1 + d): a distance of 0 maps to 1 and
# larger distances map to smaller positive values.
# TODO: try out other ways to translate distance to similarity.
lev_sim = np.divide(1, lev_dist + 1)

In [10]:
# Cluster the names with affinity propagation on the precomputed similarity matrix.
# random_state is fixed so the clustering is reproducible from run to run
# (the parameter requires scikit-learn >= 0.23).
aff_prop = AffinityPropagation(
    affinity="precomputed",
    damping=0.96,
    verbose=True,
    random_state=0,
)
aff_prop.fit(lev_sim)
print(f'Found {len(aff_prop.cluster_centers_indices_)} clusters.')

Converged after 15 iterations.
Found 61 clusters.


In [11]:
# Print one line per cluster: 1-based number, exemplar name in bold (ANSI escape
# codes), member count, and the comma-separated member names.
for cluster_id, center_idx in enumerate(aff_prop.cluster_centers_indices_):
    exemplar = names[center_idx]
    # Look members up by integer position rather than by boolean mask: labels_ has
    # one entry per clustered name, which can be fewer than len(names).
    member_positions = np.flatnonzero(aff_prop.labels_ == cluster_id)
    members = names[member_positions]

    print(f'{cluster_id + 1}. \033[1m{exemplar}\033[0m ({len(members)} members): {", ".join(members)}')


1. [1mCALCIUM GLUCEPTATE[0m (11 members): HEAVY SOLUTION NUPERCAINE, CALCIUM GLUCEPTATE, XYLOCAINE W/ EPINEPHRINE, QUINIDINE GLUCONATE, CALCIUM DISODIUM VERSENATE, XYLOCAINE 5% W/ GLUCOSE 7.5%, GALLIUM CITRATE GA 67, MPI STANNOUS DIPHOSPHONATE, TECHNESCAN GLUCEPTATE, CHLORHEXIDINE GLUCONATE, HUMALOG MIX 75/25 PEN
2. [1mWYAMINE SULFATE[0m (51 members): LIQUAEMIN SODIUM, LIQUAEMIN LOCK FLUSH, HISTAMINE PHOSPHATE, SULFADIAZINE SODIUM, AMINOHIPPURATE SODIUM, DIASONE SODIUM, SULFONAMIDES DUPLEX, PROTAMINE SULFATE, TRIPLE SULFAS, AZULFIDINE EN-TABS, SUS-PHRINE SULFITE FREE, WYAMINE SULFATE, PAMINE FORTE, CYSTOGRAFIN DILUTE, KENALOG IN ORABASE, URACIL MUSTARD, ORAGRAFIN CALCIUM, CHROMITOPE SODIUM, CITANEST FORTE, FREAMINE II 8.5%, FREAMINE III 8.5%, FREAMINE III 10%, FREAMINE HBC 6.9%, FLAVORED COLESTID, AMINOSYN 7% (PH6), MPI DMSA KIDNEY REAGENT, NORMINEST FE, NOVAMINE 11.4%, DRAXIMAGE MDP-10, PHENDIMETRAZINE TARTRATE, BETAMETHASONE VALERATE, BRETYLIUM TOSYLATE, MANGANESE SULFATE, ZINC S