# LASA recognition

## Sound-alike

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import nltk
from nltk.metrics.distance import edit_distance
from tqdm.notebook import tqdm
import pickle
import string
import sys, math, random, copy

In [3]:
df = pd.read_csv("./drugsatfda20211116/Products.txt", sep='\t+', engine='python')
drugNames = df['DrugName']

In [4]:
drugNames = drugNames.drop_duplicates() \
                     .dropna()
random_incides = [np.random.randint(0, len(drugNames)) for _ in range(10)]
drugNames.iloc[random_incides]

6555                          ENABLEX
4309     NITROGLYCERIN IN DEXTROSE 5%
8543                      BENZONATATE
43123                          AVSOLA
201                         GANTRISIN
5628                      METROLOTION
21053             EPOPROSTENOL SODIUM
10491                      GENTACIDIN
5343                          EMADINE
4133                           ADAGEN
Name: DrugName, dtype: object

In [6]:
names = np.array(drugNames)
lasa_names_ISMP_FDA = np.unique(np.loadtxt("lists_LASA/sa_ISMP+FDA.txt", dtype=str))
# lasa_names_ISMP = np.unique(np.loadtxt("lists_LASA/sa_ISMP.txt", dtype=str, delimiter="\n"))
# lasa_names = np.append(lasa_names_ISMP_FDA, lasa_names_ISMP)
# names = np.append(lasa_names, names)
# names = [each_string.lower() for each_string in names]
names = np.append(lasa_names_ISMP_FDA, names)
names = np.unique(names)
print(len(lasa_names_ISMP_FDA))

164


In [7]:
# Calculate similarity matrix between letters
# Adapted from Samuelsson, made for Spotify
# Neighbors are currently chosen by phonetic experimentation
neighbors_of = {}
neighbors_of['q'] = ['w', 'c', 'k']
neighbors_of['w'] = ['v', 'u']
neighbors_of['e'] = ['i', 'y', 'a']
neighbors_of['r'] = ['t', 'f', 'd', 'e']
neighbors_of['t'] = ['d', 'f', 'r', 'v', 'p']
neighbors_of['y'] = ['i', 'e', 'a', 'u']
neighbors_of['u'] = ['i', 'y', 'o', 'a', 'e', 'w']
neighbors_of['i'] = ['e', 'y', 'u']
neighbors_of['o'] = ['e', 'u']
neighbors_of['p'] = ['l', 'o', 't']
neighbors_of['a'] = ['e', 'i', 'u', 'y']
neighbors_of['s'] = ['x', 'z', 'c']
neighbors_of['d'] = ['b', 'f', 't', 'p']
neighbors_of['f'] = ['v', 'd', 't']
neighbors_of['g'] = ['j', 'h', 'q']
neighbors_of['h'] = ['f', 'g']
neighbors_of['j'] = ['g', 'c']
neighbors_of['k'] = ['c', 'q']
neighbors_of['l'] = ['m', 'n']
neighbors_of['z'] = ['s', 'x', 'c']
neighbors_of['x'] = ['s', 'c', 'z', 'k']
neighbors_of['c'] = ['k', 's']
neighbors_of['v'] = ['f', 'b', 'c', 'w']
neighbors_of['b'] = ['g', 'n', 'v', 'd']
neighbors_of['n'] = ['m', 'b']
neighbors_of['m'] = ['b', 'n']

keys = sorted(neighbors_of.keys())
dists = {el:{} for el in keys}

# Distance between letters and their neighbours
def distance(start, end, raw):
    if start == end:
        if raw:
            return 0
        else:
            return 1
        
    visited = [start]
    queue = []
    
    for key in neighbors_of[start]:
        queue.append({'char': key, 'dist': 1})
        
    while True:
        key = queue.pop(0)
        visited.append(key['char'])
        if key['char'] == end:
            return key['dist']
        
        for neighbor in neighbors_of[key['char']]:
            if neighbor not in visited:
                queue.append({'char': neighbor, 'dist': key['dist']+1})

In [10]:
# Computes a similarity matrix for letters of the English alphabet
# Inspired by the keyboard distances research of Samuelsson for Spotify
def alldists(option, verbose):          
    if option == "raw":
        longest_dist = 0
        avgdist = 0
        for i in range(len(keys)):
            for j in range(len(keys)):
                dists[keys[i]][keys[j]] = distance(keys[i], keys[j], True)
                avgdist += dists[keys[i]][keys[j]]
                if dists[keys[i]][keys[j]] > longest_dist:
                    longest_dist = dists[keys[i]][keys[j]]
        key_dist = longest_dist
        avgdist /= len(keys) ** 2 + 0.0
        
        buckets = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        
        for i in range(len(keys)):
            for j in range(len(keys)):
                buckets[dists[keys[i]][keys[j]]] += 1
        if verbose:
            print("Average distance: " + str(avgdist))
            print("Longest distance: " + str(key_dist))
            print("Buckets: " + str(buckets))
            print(str(dists).replace("'", '"'))
    return copy.deepcopy(dists)

In [11]:
# Take all ascii characters
all_ascii = string.printable

# Add the manually computed Edit Distance for letters to the full similarity matrix
# Add hardcoded similarity for the other characters (0 if same character, 12 otherwise)
similarity_dict = alldists("raw", False)
similarity_dict_all = {}

# Construct full similarity matrix by iterating through all ascii characters
for a in all_ascii:
    similarity_dict_all[a] = {}
    for b in all_ascii:
        # If characters are the same, assign 0
        # Otherwise if similarity has alredy been computed, assign that value
        # Otherwise assign 12      
        similarity_dict_all[a][b] = (0 if a == b else similarity_dict[a][b] if a in similarity_dict and b in similarity_dict[a] else 12)
similarity_array = np.zeros((len(similarity_dict), len(similarity_dict)))

for character_index, (character, other_characters) in enumerate(similarity_dict.items()):
    for c_index, c in enumerate(other_characters.values()):
        similarity_array[character_index][c_index] = c

In [14]:
ins_cost = 3
del_cost = 4

def edit_distance_dp(seq1, seq2):
    # There is no difference between upper and lower case for this application    
    seq1 = seq1.lower()
    seq2 = seq2.lower()
    
    # Create an empty 2D matrix to store cost
    cost = np.zeros((len(seq1)+1, len(seq2)+1))
    
    # Fill the first row
    cost[0] = [i for i in range(len(seq2)+1)]
    
    # Fill the first column
    cost[:, 0] = [i for i in range(len(seq1)+1)]
    
    # Iterate over earch row and column
    for row in range(1, len(seq1)+1):
        
        for col in range(1, len(seq2)+1):
            
            # If both the characters are same then the cost will be same as 
            # the cost of the previous sub-sequence
            if seq1[row-1] == seq2[col-1]:
                cost[row][col] = cost[row-1][col-1]
            else:
                
                insertion_cost = cost[row][col-1] + ins_cost
                deletion_cost = cost[row-1][col] + del_cost
                substitution_cost = cost[row-1][col-1] + similarity_dict_all[seq1[row-1]][seq2[col-1]]
#                 print(f"sim for {seq1[row-1]} and {seq2[col-1]}: {similarity_dict_all[seq1[row-1]][seq2[col-1]]}")
                
                # Calculate the minimum cost
                cost[row][col] = min(insertion_cost, deletion_cost, substitution_cost)
                
    return cost[len(seq1), len(seq2)]

edit_distance_dp("novolin", "novolog")

4.0

In [15]:
# Levenshtein distance
# n = len(names)
n = 100
lev_dist = np.zeros((n, n))
lev_sim = np.zeros((n, n))

for i in tqdm(range(n)):
    for j in range(i+1, n):
        ni = names[i]
        nj = names[j]
        dist = edit_distance_dp(ni, nj)
        lev_dist[i, j] = dist
        lev_dist[j, i] = dist
        
file_path = 'lev_dist.pickle'
pickle.dump(lev_dist, open(file_path, "wb"))
# lev_dist = pickle.load(open(file_path, "rb"))

  0%|          | 0/100 [00:00<?, ?it/s]

In [16]:
# Apply thresholding so not all medications get clustered
# Only used in the first stages of the research
def is_row_similar(row, threshold=26):
    sorted_row = sorted(row)[:len(row)//4]
    return np.average(sorted_row) < threshold

filter_lev_dist = []
columns_to_remove = []
for i, row in enumerate(lev_dist):
    if is_row_similar(row):
        filter_lev_dist.append(row)  
    else:
        columns_to_remove.append(i) 

for i, row in enumerate(filter_lev_dist):
    filter_lev_dist[i] = [entry for c, entry in enumerate(row) if c not in columns_to_remove]
      
filter_lev_dist = np.array(filter_lev_dist)

In [17]:
# Distance to similarity
lev_sim = 1 / (1 + lev_dist)

In [18]:
# Cluster on computed similarities
aff_prop = AffinityPropagation(affinity="precomputed", damping=0.96,max_iter = 10000, verbose=True)
aff_prop.fit(lev_sim)
print(f'Found {len(aff_prop.cluster_centers_indices_)} clusters.')

Converged after 15 iterations.
Found 14 clusters.




In [19]:
for cluster_id in range(len(aff_prop.cluster_centers_indices_)):
    exemplar = names[aff_prop.cluster_centers_indices_[cluster_id]]
    members = names[np.nonzero(aff_prop.labels_ == cluster_id)]

    print(f'{cluster_id + 1}. \033[1m{exemplar}\033[0m ({len(members)} members): {", ".join(members)}')

1. [1mABACAVIR SULFATE AND LAMIVUDINE[0m (4 members): ABACAVIR AND LAMIVUDINE, ABACAVIR SULFATE AND LAMIVUDINE, ABACAVIR SULFATE, LAMIVUDINE AND ZIDOVUDINE, ABACAVIR SULFATE; LAMIVUDINE
2. [1mACETAMINOPHEN AND CODEINE PHOSPHATE[0m (1 members): ACETAMINOPHEN AND CODEINE PHOSPHATE
3. [1mACETAMINOPHEN AND PENTAZOCINE HYDROCHLORIDE[0m (2 members): ACETAMINOPHEN AND HYDROCODONE BITARTRATE, ACETAMINOPHEN AND PENTAZOCINE HYDROCHLORIDE
4. [1mACETAMINOPHEN, ASPIRIN, AND CODEINE PHOSPHATE[0m (2 members): ACETAMINOPHEN, ASPIRIN AND CAFFEINE, ACETAMINOPHEN, ASPIRIN, AND CODEINE PHOSPHATE
5. [1mACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE[0m (2 members): ACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE, ACETAMINOPHEN, CAFFEINE, AND DIHYDROCODEINE BITARTRATE
6. [1mACETAMINOPHEN; OXYCODONE HYDROCHLORIDE[0m (1 members): ACETAMINOPHEN; OXYCODONE HYDROCHLORIDE
7. [1mACETASOL[0m (20 members): 8-MOP, A.P.L., ABELCET, ABILIFY MYCITE KIT, ABIRATERONE ACETATE, ABSORICA, ABSORICA L

In [25]:
clusters = dict()
all_LASA = []
t=7
for cluster_id in range(len(aff_prop.cluster_centers_indices_)):
    exemplar = names[aff_prop.cluster_centers_indices_[cluster_id]]
    member_ind = np.nonzero(aff_prop.labels_ == cluster_id)
    members = names[member_ind]
    most_similar_members = set()
    # For each member (member index) of the cluster, check if it is similar enough to the rest     
    for member in member_ind[0]:
        for datapoint in range(len(lev_dist)):
            # Omit the distance to the point itself and if it does not have distances below threshold,
            # remove it, probably not LASA
            if (member != datapoint and lev_dist[member][datapoint] < t):
                most_similar_members.add(names[member])
        most_similar_members.add(exemplar)
                
    if len(most_similar_members) > 1:
        clusters[cluster_id] = most_similar_members
        all_LASA.append(list(most_similar_members))
        print(f'\033[1m{exemplar}\033[0m ({len(most_similar_members)} most similar from {len(members)} total): {", ".join(most_similar_members)}')

[1mACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE[0m (2 most similar from 2 total): ACETAMINOPHEN, CAFFEINE, AND DIHYDROCODEINE BITARTRATE, ACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE
[1mACETASOL[0m (2 most similar from 20 total): ACHROMYCIN, ACETASOL
[1mACTHAR[0m (7 most similar from 19 total): ACEPHEN, ACTH, ACTOS, ACTHAR, ACTIQ, ACIPHEX, ACTICLATE
[1mACTIFED[0m (6 most similar from 44 total): ACTONEL, ACTIDIL, ACHROMYCIN V, ACTIFED, ACTISITE, ACTICORT


In [29]:
# Flatten the list for easy LASA check
LASA = [item for sublist in all_LASA for item in sublist]
print(len(LASA))
it = list(clusters.values())
print(it)

17
[{'ACETAMINOPHEN, CAFFEINE, AND DIHYDROCODEINE BITARTRATE', 'ACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE'}, {'ACHROMYCIN', 'ACETASOL'}, {'ACEPHEN', 'ACTH', 'ACTOS', 'ACTHAR', 'ACTIQ', 'ACIPHEX', 'ACTICLATE'}, {'ACTONEL', 'ACTIDIL', 'ACHROMYCIN V', 'ACTIFED', 'ACTISITE', 'ACTICORT'}]


In [40]:
# Good results:
# 19. cloZAPine (4 members): QUEtiapine, azaTHIOprine, cloZAPine, clomiPHENE
# OxyCONTIN (3 members): FLUoxetine, OxyCONTIN, oxyMORphone
# 10. oxyCODONE (11 members): QUEtiapine, chlorproMAZINE, clonazePAM, hydrOXYzine, oxyCODONE, AMINOPHYLLIN, FOLVRON, HYPROTIGEN 5%, VI-TWEL, BENEMID, PAMINE
# 53. TENSILON (12 members): DACTINomycin, busPIRone, sulfADIAZINE, STILBESTROL, SUS-PHRINE SULFITE FREE, TENSILON, PROMETHAZINE HYDROCHLORIDE PLAIN, PRO-BANTHINE, SERPANRAY, SERPALAN, RITALIN, METICORTELONE
# 54. TENSILON PRESERVATIVE FREE (6 members): buPROPion, cefTAZidime, VASOXYL, POTASSIUM CHLORIDE, TENSILON PRESERVATIVE FREE, STERANE
# 1. DEPO-Medrol (5 members): DEPO-Medrol, NexAVAR, NovoLOG, PARoxetine, PROzac
# 45. GANTRISIN PEDIATRIC (10 members): HumuLIN, OxyCONTIN, hydrOXYzine, GANTRISIN PEDIATRIC, DIUPRES-250, PLEGINE, PARNATE, TENUATE DOSPAN, VOSOL HC, NOVRAD
# 2. DAPTOmycin (7 members): DAPTOmycin, KlonoPIN, methIMAzole, raNITIdine, riMANTAdine, rifAMPin, ALCOHOL 10% AND DEXTROSE 5%