In [None]:
import json
import numpy as np

# Load the raw drug records from disk. JSON text is UTF-8 by specification, so
# the encoding is pinned instead of being left to the platform default.
with open('output.json', 'r', encoding='utf-8') as file:
    drugs_data = json.load(file)

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values the stock encoder rejects.

    Handles NumPy integer, floating-point and boolean scalars as well as
    ``ndarray`` objects; anything else is passed to the base class, which
    raises ``TypeError`` for unsupported types.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Called by ``json`` only for objects it cannot serialize natively.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            # tolist() also converts the contained scalars to Python types
            return obj.tolist()
        return super().default(obj)

def save_data(filename, data):
    """Write ``data`` to ``filename`` as a JSON array with one entry per line.

    Args:
        filename: Path of the file to create or overwrite.
        data: Any iterable of JSON-serializable entries. It does not need to
            support ``len()``, so generators and dict views work too.

    Each entry is encoded with ``NumpyEncoder``. Entries are separated by
    ``,\\n``; an empty iterable produces ``[\\n]``.
    """
    with open(filename, 'w') as file:
        file.write('[\n')  # Opening bracket of the JSON list
        wrote_entry = False
        for entry in data:
            # The separator goes *before* every entry except the first, so no
            # look-ahead (and therefore no len()) is needed.
            if wrote_entry:
                file.write(',\n')
            file.write(json.dumps(entry, cls=NumpyEncoder))
            wrote_entry = True
        if wrote_entry:
            file.write('\n')  # Newline after the last entry, never a comma
        file.write(']')  # Closing bracket of the JSON list

In [None]:
import json
# De-duplicate on 'name': a dict keyed by name keeps one record per name (a
# later duplicate replaces an earlier one). The view is materialised as a list
# so the result is a plain, indexable sequence rather than a dict_values view
# over a temporary dict.
unique_data = list({each_dict['name']: each_dict for each_dict in drugs_data}.values())
# Give every surviving record a sequential integer id (mutates the dicts in place)
for idx, drug in enumerate(unique_data):
    drug['id'] = idx

In [None]:
# Persist the de-duplicated, id-tagged records as a JSON array in output2.json
save_data('output2.json', unique_data)

In [None]:
import re
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus (runs at cell execution time).
# NOTE(review): the cells visible here filter only against `custom_stopwords`;
# nothing shown reads the NLTK corpus — confirm whether this download is still needed.
import nltk
nltk.download('stopwords')

# Words dropped from drug names during preprocessing (extend as required)
custom_stopwords = [
    "and",
    "the",
    "in",
    "with",
]

def preprocess_text(text, stop_words=None):
    """Normalise a drug name for feature extraction.

    Steps, in order: remove every parenthesised span (with the whitespace
    before it), remove commas, split on whitespace, drop stop words
    (case-insensitively), and re-join the remaining words with single spaces.

    Args:
        text: The raw name to clean.
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``custom_stopwords`` list is used.

    Returns:
        The cleaned string. It may be empty; in that case a diagnostic line
        is printed so the caller can spot names that were wiped out.
    """
    if stop_words is None:
        stop_words = custom_stopwords
    # Build the lookup set once instead of once per word
    stop_set = {word.lower() for word in stop_words}

    # Remove text in parentheses
    text_no_parentheses = re.sub(r'\s*\([^)]*\)', '', text)

    # Remove commas
    text_no_commas = text_no_parentheses.replace(',', '')

    # Tokenize on whitespace and drop the stop words
    cleaned_words = [word for word in text_no_commas.split() if word.lower() not in stop_set]

    # Join the cleaned words back into a string
    cleaned_text = ' '.join(cleaned_words)
    if cleaned_text == '':
        print(f'{text} was converted to {cleaned_text}')
    return cleaned_text


In [None]:
# Read back the JSON array stored in output2.json; it replaces `drugs_data`
with open('output2.json', mode='r') as handle:
    drugs_data = json.load(handle)

In [None]:
# Pair each cleaned drug name with that drug's classes ...
name_class_pairs = (
    (preprocess_text(drug['name']), drug['drug_classes'])
    for drug in drugs_data
)
# ... and keep the string form of every (name, classes) tuple as the raw feature
combined_features = [str(pair) for pair in name_class_pairs]
# Number of feature strings produced
len(combined_features)


In [None]:
# Delete the tuple/list punctuation left over from str(): ( ) ' , [ ]
punctuation_table = str.maketrans("", "", "()',[]")
combined_features = [entry.translate(punctuation_table) for entry in combined_features]
combined_features

In [None]:
def find_drug_id_by_name(drug_name, drugs_data):
    """Look up the 'id' of the first record whose 'name' equals ``drug_name``.

    Args:
        drug_name: Exact name to search for.
        drugs_data: Iterable of dicts carrying 'name' and 'id' keys.

    Returns:
        The matching record's 'id'. If nothing matches, prints 'not found'
        and returns None.
    """
    # A private sentinel distinguishes "no match" from a stored id of None
    missing = object()
    found_id = next(
        (record['id'] for record in drugs_data if record['name'] == drug_name),
        missing,
    )
    if found_id is missing:
        print('not found')
        return None
    return found_id

# Ad-hoc check: look up the empty name; prints 'not found' unless some record
# has an empty 'name'.
find_drug_id_by_name('', drugs_data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

# Step 1: turn every feature string into a TF-IDF vector
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_features)

# Step 2: build a k-distance curve to help choose DBSCAN's 'eps'.
# With n_neighbors=2 each row of `distances` has two columns; column 1 is the
# one plotted (column 0 is presumably the point itself, since the query set is
# the training set — confirm if duplicates matter).
nearest_neighbors = NearestNeighbors(n_neighbors=2)
neighbors = nearest_neighbors.fit(X)
distances, indices = neighbors.kneighbors(X)

# Ascending order of the column-1 distances
sorted_distances = np.sort(distances[:, 1])

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(sorted_distances)
ax.set_xlabel("Points")
ax.set_ylabel("Distance")
ax.set_title("Nearest Neighbors Distance")
plt.show()




In [None]:
# Step 3: cluster the TF-IDF vectors with DBSCAN.
# 'eps' is meant to be read off the k-distance plot; tune it (and min_samples)
# to the data at hand.
eps_value = 0.55
dbscan = DBSCAN(min_samples=2, eps=eps_value)
clusters = dbscan.fit(X).labels_

# Show the label assigned to every feature string
for feature_text, label in zip(combined_features, clusters):
    print(f"Name: {feature_text}, Cluster: {label}")

In [None]:
# Collect the feature strings under their cluster label
clusters_dict = {}
for feature_text, label in zip(combined_features, clusters):
    clusters_dict.setdefault(label, []).append(feature_text)

# One line per cluster, members separated by ', '
for label, members in clusters_dict.items():
    print(f"Cluster {label}: {', '.join(members)}")

In [None]:
# Original drug name -> its cleaned feature string
name_mapping = dict(zip((entry['name'] for entry in drugs_data), combined_features))
# Store each record's cluster label on the record itself (in place)
for record, label in zip(drugs_data, clusters):
    record.update(Cluster=label)



In [None]:
# Write the cluster-tagged records to o3.json
save_data('o3.json', drugs_data)

In [None]:
from collections import defaultdict
import json
import numpy as np

def consolidate_clusters(drugs_data):
    """Merge the records of each cluster into a single entry.

    Args:
        drugs_data: Iterable of dicts. Each must have 'Cluster' and 'name'
            keys; 'uses' and 'drug_classes' are optional iterables.

    Returns:
        A list with one consolidated entry per cluster (in order of first
        appearance), followed by every record whose 'Cluster' is -1,
        unchanged. A consolidated entry is a shallow copy of the cluster's
        first record in which:
          * 'name' is the most frequent name in the cluster (ties go to the
            name seen first),
          * 'uses' and 'drug_classes' are the de-duplicated unions across
            the cluster, in first-seen order.
        The input records themselves are not modified.
    """
    # Split the records into real clusters and unclustered (-1) entries
    cluster_groups = defaultdict(list)
    unclustered = []
    for drug in drugs_data:
        if drug['Cluster'] == -1:
            unclustered.append(drug)
        else:
            cluster_groups[drug['Cluster']].append(drug)

    consolidated_data = []
    for drugs in cluster_groups.values():
        # Use the first drug's data as the template
        consolidated_entry = drugs[0].copy()

        # Most common name; dict.fromkeys keeps first-seen order so that ties
        # are resolved deterministically by max().
        names = [drug['name'] for drug in drugs]
        consolidated_entry['name'] = max(dict.fromkeys(names), key=names.count)

        # Ordered, de-duplicated unions; `or []` tolerates missing/None values
        consolidated_entry['uses'] = list(dict.fromkeys(
            use for drug in drugs for use in (drug.get('uses') or [])
        ))
        consolidated_entry['drug_classes'] = list(dict.fromkeys(
            cls for drug in drugs for cls in (drug.get('drug_classes') or [])
        ))

        consolidated_data.append(consolidated_entry)

    # Include the entries with Cluster -1 as they are
    consolidated_data.extend(unclustered)

    return consolidated_data




In [None]:
# `drugs_data` is the list of record dicts that already carry the 'Cluster'
# label (plus 'name', and optionally 'uses' / 'drug_classes').
consolidated_drugs_data = consolidate_clusters(drugs_data)
# Write the full consolidated list; this same file is written again later
# from the filtered `cleaned_drug_data`.
save_data('consolidated_data.json', consolidated_drugs_data)

In [None]:
# Keep only the records that list at least one use ...
cleaned_drug_data = list(
    filter(lambda record: len(record['uses']) > 0, consolidated_drugs_data)
)
# ... then remove every parenthesised span (and the whitespace before it)
# from each remaining name, updating the record in place.
for drug in cleaned_drug_data:
    drug.update(name=re.sub(r"\s*\(.*?\)", "", drug['name']))


In [None]:
# Overwrite consolidated_data.json with the filtered, name-cleaned records
save_data('consolidated_data.json', cleaned_drug_data)

In [None]:
import spacy
import scispacy 

# Load the scispaCy model
nlp = spacy.load("en_core_sci_sm")

# Process a sample text
text = "Androgel Testosterone testosterone testosterone alcohol isopropyl myristate water sodium hydroxide CARBOMER HOMOPOLYMER TYPE C (ALLYL PENTAERYTHRITOL CROSSLINKED)"
doc = nlp(text)

# Collect every entity the pipeline found (no filtering by entity type is applied)
drugs = list(doc.ents)
for drug in drugs:
    print(drug.text)
    # `label_` is the readable label string; `label` is only its integer hash ID
    print(drug.label_)




In [1]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_core_sci_md")

# Register the abbreviation detector as a stage of this pipeline
nlp.add_pipe("abbreviation_detector")

# The backslash continuations keep the leading spaces of each continued line
# inside the string, so the text is reproduced exactly as written.
sample_text = "Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily. Attention Deficit Hyperactivity Disorder (ADHD) is also a problem"
doc = nlp(sample_text)

# One row per detected abbreviation: short form, token span, long form
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    token_span = f"({abrv.start}, {abrv.end})"
    print(f"{abrv} \t {token_span} {abrv._.long_form}")


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor
ADHD 	 (46, 47) Attention deficit Hyperactivity Disorder


In [4]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

# Importing AbbreviationDetector registers the "abbreviation_detector" factory
# with spaCy, so this cell does not depend on another cell having imported it.
from scispacy.abbreviation import AbbreviationDetector  # noqa: F401

nlp = spacy.load("en_core_sci_sm")

# resolve_abbreviations=True only takes effect when the abbreviation detector
# runs earlier in the SAME pipeline. This pipeline is freshly loaded, so the
# detector has to be added here, before the linker.
nlp.add_pipe("abbreviation_detector")

# This line takes a while, because we have to download ~1GB of data
# and load a large JSON file (the knowledge base). Be patient!
# Thankfully it should be faster after the first time you use it, because
# the downloads are cached.
# With resolve_abbreviations=True, linking is performed on the long form of
# any detected abbreviation.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily. Attention deficit Hyperactivity Disorder (ADHD) is also a problem")

# Look at one entity (index 1 of the detected entities)
entity = doc.ents[1]

print("Name: ", entity)

# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    # umls_ent[0] is the concept id used to look up the knowledge-base record
    print(linker.kb.cui_to_entity[umls_ent[0]])


Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Name:  bulbar
CUI: C1947952, Name: anatomical bulb
Definition: A rounded dilation or expansion in a canal, vessel, or organ.
TUI(s): T017
Aliases: (total: 2): 
	 Bulbar, Bulb
CUI: C0032372, Name: Poliomyelitis, Bulbar
Definition: A form of paralytic poliomyelitis affecting neurons of the MEDULLA OBLONGATA of the brain stem. Clinical features include impaired respiration, HYPERTENSION, alterations of vasomotor control, and dysphagia. Weakness and atrophy of the limbs and trunk due to spinal cord involvement is usually associated. (From Adams et al., Principles of Neurology, 6th ed, p765)
TUI(s): T047
Aliases (abbreviated, total: 23): 
	 Acute anterior poliomyelitis, Acute infantile paralysis, Bulbar Polio, Medullary Involvement Poliomyelitis, Acute paralytic bulbar poliomyelitis, POLIOMYELITIS, ANTERIOR, ACUTE, Anterior acute poliomyelitis, Polio, Bulbar, acute anterior poliomyelitis, Bulbar Poliomyelitis
CUI: C2586323, Name: Structure of fascial sheath of eyeball
Definition: Sheath of 

In [9]:
# Inspect the entity at index 12 of the linked document
entity2 = doc.ents[12]
print("Name: ", entity2)

# Print the knowledge-base record behind every linked candidate;
# the first item of each candidate is the concept id used for the lookup.
for candidate in entity2._.kb_ents:
    concept_id = candidate[0]
    print(linker.kb.cui_to_entity[concept_id])

Name:  ADHD
CUI: C1263846, Name: Attention deficit hyperactivity disorder
Definition: A behavior disorder originating in childhood in which the essential features are signs of developmentally inappropriate inattention, impulsivity, and hyperactivity. Although most individuals have symptoms of both inattention and hyperactivity-impulsivity, one or the other pattern may be predominant. The disorder is more frequent in males than females. Onset is in childhood. Symptoms often attenuate during late adolescence although a minority experience the full complement of symptoms into mid-adulthood. (From DSM-V)
TUI(s): T048
Aliases (abbreviated, total: 67): 
	 Attention deficit hyperactivity disorder (ADHD), ADHD, ADHD (attention deficit hyperactivity disorder), Deficit-Hyperactivity Disorders, Attention, Attention deficit-hyperactivity disorder (ADHD), Attention deficit hyperactivity disorder, predominantly hyperactive impulsive type (disorder), Hyperkinetic syndrome NOS, Hyperkinetic syndrome o

In [12]:
# Inspect the entity at index 11 of the linked document
entity2 = doc.ents[11]
print("Name: ", entity2)

# Print the knowledge-base record behind every linked candidate;
# the first item of each candidate is the concept id used for the lookup.
for candidate in entity2._.kb_ents:
    concept_id = candidate[0]
    print(linker.kb.cui_to_entity[concept_id])

Name:  Attention deficit Hyperactivity Disorder
CUI: C1263846, Name: Attention deficit hyperactivity disorder
Definition: A behavior disorder originating in childhood in which the essential features are signs of developmentally inappropriate inattention, impulsivity, and hyperactivity. Although most individuals have symptoms of both inattention and hyperactivity-impulsivity, one or the other pattern may be predominant. The disorder is more frequent in males than females. Onset is in childhood. Symptoms often attenuate during late adolescence although a minority experience the full complement of symptoms into mid-adulthood. (From DSM-V)
TUI(s): T048
Aliases (abbreviated, total: 67): 
	 Attention deficit hyperactivity disorder (ADHD), ADHD, ADHD (attention deficit hyperactivity disorder), Deficit-Hyperactivity Disorders, Attention, Attention deficit-hyperactivity disorder (ADHD), Attention deficit hyperactivity disorder, predominantly hyperactive impulsive type (disorder), Hyperkinetic s

In [11]:
# Display every entity detected in `doc`; positions in this tuple are the
# indices used with doc.ents[...] in the neighbouring cells.
doc.ents

(Spinal,
 bulbar,
 muscular atrophy,
 SBMA,
 inherited,
 motor neuron,
 expansion,
 polyglutamine tract,
 androgen receptor,
 AR,
 SBMA,
 Attention deficit Hyperactivity Disorder,
 ADHD,
 problem)

In [15]:
doc = nlp("I have ADHD. It's a huge problem")

# Walk over every detected entity and print its linked knowledge-base records
for entity in doc.ents:
    print("Name: ", entity)

    # The first item of each candidate is the concept id used for the lookup
    for candidate in entity._.kb_ents:
        print(linker.kb.cui_to_entity[candidate[0]])

Name:  ADHD
CUI: C1263846, Name: Attention deficit hyperactivity disorder
Definition: A behavior disorder originating in childhood in which the essential features are signs of developmentally inappropriate inattention, impulsivity, and hyperactivity. Although most individuals have symptoms of both inattention and hyperactivity-impulsivity, one or the other pattern may be predominant. The disorder is more frequent in males than females. Onset is in childhood. Symptoms often attenuate during late adolescence although a minority experience the full complement of symptoms into mid-adulthood. (From DSM-V)
TUI(s): T048
Aliases (abbreviated, total: 67): 
	 Attention deficit hyperactivity disorder (ADHD), ADHD, ADHD (attention deficit hyperactivity disorder), Deficit-Hyperactivity Disorders, Attention, Attention deficit-hyperactivity disorder (ADHD), Attention deficit hyperactivity disorder, predominantly hyperactive impulsive type (disorder), Hyperkinetic syndrome NOS, Hyperkinetic syndrome o