In [None]:
import json
import numpy as np

# Parse the whole of 'output.json' into Python objects (json.load reads the
# complete document at once, not line by line).
with open('output.json', 'r') as file:
    drugs_data = json.load(file)

In [None]:
# Work on the records loaded from the JSON file (expected: a list of dicts).
data = drugs_data

# Keep only the drugs whose 'uses' value is truthy, i.e. a non-empty list,
# so entries without any uses do not take part in the average.
filtered_data = list(filter(lambda drug: drug['uses'], data))

# Mean number of uses per remaining drug; 0 when no drug is left.
total_uses = sum(map(len, (drug['uses'] for drug in filtered_data)))
if filtered_data:
    average_uses_non_empty = total_uses / len(filtered_data)
else:
    average_uses_non_empty = 0

average_uses_non_empty


In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy scalars and arrays.

    The standard encoder rejects NumPy values such as ``np.int64`` or
    ``np.float32``. This subclass converts them to their plain Python
    equivalents before they are written.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the base encoder could not serialize on its own.

        Returns:
            ``int`` for NumPy integers, ``float`` for NumPy floats, ``bool``
            for ``np.bool_`` and a (nested) ``list`` for ``np.ndarray``.

        Raises:
            TypeError: For any other unsupported type (raised by the base class).
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            # tolist() yields nested lists of native Python scalars.
            return obj.tolist()
        return super().default(obj)

def save_data(filename, data):
    """Write ``data`` to ``filename`` as a JSON list with one entry per line.

    The file starts with ``[`` on its own line, then holds each entry encoded
    with ``NumpyEncoder`` on a separate line (comma-terminated except for the
    last one), and ends with ``]``. An empty ``data`` yields ``[`` followed by
    ``]`` on the next line.

    Args:
        filename: Path of the file to create or overwrite.
        data: Any iterable of JSON-serializable entries. Its length is not
            needed, so generators and dict views work as well as lists.
    """
    with open(filename, 'w') as file:
        file.write('[\n')  # opening bracket of the JSON list
        wrote_any = False
        for entry in data:
            if wrote_any:
                # Terminate the previous entry; the last one never gets a comma.
                file.write(',\n')
            file.write(json.dumps(entry, cls=NumpyEncoder))
            wrote_any = True
        if wrote_any:
            file.write('\n')
        file.write(']')  # closing bracket of the JSON list

In [None]:
import json
# drugs_data is expected to be a list of dicts that each carry a 'name' key.
# Keying a dict by name collapses duplicates: for a repeated name the record
# seen last replaces the earlier one.
unique_data = dict((record['name'], record) for record in drugs_data).values()

# Stamp every surviving record with a sequential id (the dicts are modified in place).
for idx, drug in enumerate(unique_data):
    drug.update(id=idx)

In [None]:
# Write the contents of unique_data to 'output2.json'.
save_data('output2.json', unique_data)

In [None]:
import re
from nltk.corpus import stopwords

# Download NLTK stopwords if you haven't already
import nltk
nltk.download('stopwords')

# Default words dropped by preprocess_text (add more if needed).
custom_stopwords = ["and", "the", "in", "with"]

def preprocess_text(text, stop_words=None):
    """Clean a drug name: drop parenthesised text, commas and stop words.

    Only the supplied stop-word list is applied; the NLTK stop-word corpus is
    not consulted by this function.

    Args:
        text: The raw string to clean.
        stop_words: Optional iterable of words to remove. Each token is
            lower-cased before the lookup, so entries should be lower-case.
            Defaults to the module-level ``custom_stopwords``.

    Returns:
        The remaining words joined by single spaces. May be an empty string,
        in which case a diagnostic line is printed.
    """
    if stop_words is None:
        stop_words = custom_stopwords
    # Build the lookup set once per call rather than once per token.
    stop_set = set(stop_words)

    # Remove text in parentheses together with any whitespace right before it.
    text_no_parentheses = re.sub(r'\s*\([^)]*\)', '', text)

    # Remove commas
    text_no_commas = text_no_parentheses.replace(',', '')

    # Tokenize on whitespace
    words = text_no_commas.split()

    # Drop the stop words (case-insensitive on the token side)
    cleaned_words = [word for word in words if word.lower() not in stop_set]

    # Join the cleaned words back into a string
    cleaned_text = ' '.join(cleaned_words)
    if cleaned_text == '':
        # Surface inputs that were wiped out completely by the cleaning.
        print(f'{text} was converted to {cleaned_text}')
    return cleaned_text


In [None]:
# Rebind drugs_data to the contents of 'output2.json'.
with open('output2.json', 'r') as file:
    drugs_data = json.load(file)

In [None]:
# One feature string per drug: the str() of a (cleaned name, drug_classes) tuple.
combined_features = [
    str((preprocess_text(record['name']), record['drug_classes']))
    for record in drugs_data
]
# Show how many feature strings were produced.
len(combined_features)


In [None]:
# Delete parentheses, square brackets, single quotes and commas from every
# feature string.
combined_features = list(
    map(lambda entry: re.sub(r"[()',\[\]]", "", entry), combined_features)
)
combined_features

In [None]:
def find_drug_id_by_name(drug_name, drugs_data):
    """Return the 'id' of the first record whose 'name' equals ``drug_name``.

    Prints 'not found' and returns None when no record matches.
    """
    missing = object()  # sentinel, so a stored id of None still counts as a hit
    matching_ids = (record['id'] for record in drugs_data if record['name'] == drug_name)
    found = next(matching_ids, missing)
    if found is missing:
        print('not found')
        return None
    return found

# Look up the empty name; prints 'not found' unless some record has name ''.
find_drug_id_by_name('', drugs_data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

# Step 1: turn every feature string into a TF-IDF vector
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_features)

# Step 2: gather the data for choosing DBSCAN's 'eps'
# With n_neighbors=2 each row holds the two smallest distances for a point
# (presumably column 0 is the point itself and column 1 its closest other
# point; verify if the data contains duplicates).
nearest_neighbors = NearestNeighbors(n_neighbors=2)
neighbors = nearest_neighbors.fit(X)
distances, indices = neighbors.kneighbors(X)

# Second-column distances in ascending order
sorted_distances = np.sort(distances[:, 1])

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(sorted_distances)
ax.set_xlabel("Points")
ax.set_ylabel("Distance")
ax.set_title("Nearest Neighbors Distance")
plt.show()




In [None]:
# Step 3: DBSCAN clustering
# 'eps' has to be picked by eye from the nearest-neighbour distance plot;
# 0.55 is the value currently in use. Adjust it and min_samples as needed.
eps_value = 0.55
dbscan = DBSCAN(eps=eps_value, min_samples=2)
clusters = dbscan.fit_predict(X)

# Print the cluster label assigned to each feature string
for name, cluster in zip(combined_features, clusters):
    print(f"Name: {name}, Cluster: {cluster}")

In [None]:
# Bucket the feature strings by their cluster label
clusters_dict = {}
for name, cluster in zip(combined_features, clusters):
    clusters_dict.setdefault(cluster, []).append(name)

# One output line per cluster, listing its members
for cluster, names in clusters_dict.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

In [None]:
# Original drug name -> its feature string (pairs are matched by position)
name_mapping = dict(zip((entry['name'] for entry in drugs_data), combined_features))
# Store each record's cluster label under the 'Cluster' key (in place).
# Only the label is added; name_mapping is not written into the records.
for entry, cluster in zip(drugs_data, clusters):
    entry.update(Cluster=cluster)



In [None]:
# Write the current drugs_data records to 'o3.json'.
save_data('o3.json', drugs_data)

In [None]:
from collections import defaultdict
import json
import numpy as np

def consolidate_clusters(drugs_data):
    """Merge the drugs of every cluster into one entry; keep the rest as-is.

    Records whose 'Cluster' is -1 are treated as unclustered and passed
    through unchanged. All other records are grouped by their 'Cluster'
    value and each group is reduced to a single entry.

    Args:
        drugs_data: Iterable of dicts. Each needs the keys 'Cluster' and
            'name'; 'uses' and 'drug_classes' are optional iterables of
            hashable values.

    Returns:
        A list holding one consolidated dict per cluster (in order of first
        appearance), followed by the unclustered records themselves. A
        consolidated dict is a shallow copy of the group's first record with:

        * 'name' set to the most frequent name in the group (ties go to the
          name encountered first),
        * 'uses' and 'drug_classes' set to the de-duplicated union over the
          group, in first-seen order.

    Raises:
        KeyError: If a record lacks 'Cluster', or a clustered record lacks 'name'.
    """
    # Group records by cluster label, excluding -1.
    cluster_groups = defaultdict(list)
    for drug in drugs_data:
        if drug['Cluster'] != -1:
            cluster_groups[drug['Cluster']].append(drug)

    consolidated_data = []
    for drugs in cluster_groups.values():
        # Shallow copy: replacing keys below leaves the source record untouched.
        consolidated_entry = drugs[0].copy()

        # Count the names in one pass. max() returns the first maximal key in
        # insertion order, which makes the choice deterministic.
        name_counts = defaultdict(int)
        for drug in drugs:
            name_counts[drug['name']] += 1
        consolidated_entry['name'] = max(name_counts, key=name_counts.get)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        consolidated_entry['uses'] = list(dict.fromkeys(
            use for drug in drugs for use in drug.get('uses', [])
        ))
        consolidated_entry['drug_classes'] = list(dict.fromkeys(
            drug_class for drug in drugs for drug_class in drug.get('drug_classes', [])
        ))

        consolidated_data.append(consolidated_entry)

    # Include the entries with Cluster -1 exactly as they are.
    consolidated_data.extend(drug for drug in drugs_data if drug['Cluster'] == -1)

    return consolidated_data




In [None]:
# Collapse every cluster into a single entry (each record must carry a 'Cluster' key).
consolidated_drugs_data = consolidate_clusters(drugs_data)
# Entries that list at least one use.
cleaned_drug_data = list(
    filter(lambda entry: len(entry['uses']) > 0, consolidated_drugs_data)
)
# NOTE(review): the unfiltered list is written here, not cleaned_drug_data -
# confirm this is intended.
save_data('consolidated_data.json', consolidated_drugs_data)

In [None]:
# Keep the consolidated entries that list at least one use.
cleaned_drug_data = list(
    filter(lambda entry: len(entry['uses']) > 0, consolidated_drugs_data)
)
# Remove every "(...)" group (and the whitespace in front of it) from the
# names; the dicts are updated in place.
for drug in cleaned_drug_data:
    drug.update(name=re.sub(r"\s*\(.*?\)", "", drug['name']))


In [None]:
# Write cleaned_drug_data to 'consolidated_data.json' (overwriting any existing file).
save_data('consolidated_data.json', cleaned_drug_data)

In [None]:
!pip3 install spacy

In [None]:
# Environment check: list the packages visible to the shell's pip3.
!pip3 list

In [1]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

In [None]:
import spacy
import scispacy 
import nmslib
# Load the scispaCy model
nlp = spacy.load("en_core_sci_sm")

# Process a sample text
text = "Androgel Testosterone testosterone testosterone alcohol isopropyl myristate water sodium hydroxide CARBOMER HOMOPOLYMER TYPE C (ALLYL PENTAERYTHRITOL CROSSLINKED)"
doc = nlp(text)

# Collect every entity span the model found (no filtering by label is done here)
drugs = list(doc.ents)
for drug in drugs:
    print(drug.text)
    # label_ is the readable label string; label (no underscore) is its integer hash.
    print(drug.label_)
    print(drug.label)




In [None]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

# Medium scispaCy model for the abbreviation demo
nlp = spacy.load("en_core_sci_md")

# Register the abbreviation detector as a pipeline component
nlp.add_pipe("abbreviation_detector")

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily. Attention Deficit Hyperactivity Disorder (ADHD) is also a problem")

# One row per detected abbreviation: short form, token span, long form
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")


In [None]:


nlp = spacy.load("en_core_sci_sm")

# Adding the linker is slow the first time: roughly 1GB of data is downloaded
# and a large JSON knowledge base is loaded. Later runs are faster because the
# downloads are cached.
# resolve_abbreviations is optional. It relies on the AbbreviationDetector
# pipe having been added to the pipeline; with both in place, linking is done
# on the long form of abbreviations only.
# NOTE(review): this freshly loaded pipeline does not add
# "abbreviation_detector" in this cell - confirm the option has the intended effect.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily. Attention deficit Hyperactivity Disorder (ADHD) is also a problem")

# Inspect the second entity of the document
entity = doc.ents[1]

print("Name: ", entity)

# Every entity carries scored UMLS candidates (currently just char-3gram
# matching); print the knowledge-base record of each candidate.
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])


In [None]:
# Inspect the entity at index 12 of the current doc
entity2 = doc.ents[12]
print("Name: ", entity2)

# Print the knowledge-base record of each UMLS candidate linked to it
# (scores currently come from char-3gram matching).
for umls_ent in entity2._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

In [None]:
# Inspect the entity at index 11 of the current doc
entity2 = doc.ents[11]
print("Name: ", entity2)

# Print the knowledge-base record of each UMLS candidate linked to it
# (scores currently come from char-3gram matching).
for umls_ent in entity2._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

In [None]:
# Display all entities found in the current doc.
doc.ents

In [None]:
doc = nlp("I have ADHD. It's a huge problem. I also take adderall.")

# Walk over every detected entity and dump its UMLS candidates.
for i, entity in enumerate(doc.ents):
    print("Name: ", entity)

    # Element [0] of each candidate is the key into the linker's
    # cui_to_entity table (scores currently come from char-3gram matching).
    for umls_ent in entity._.kb_ents:
        print(linker.kb.cui_to_entity[umls_ent[0]])

In [None]:
# Sample side-effect strings, including one compound entry with '/' and ','.
side_effects = ["sweating", "nausea/vomiting, chemotherapy induced", "agitation"]

In [2]:
# Load the model
import spacy
import scispacy

from scispacy.linking import EntityLinker
# Load the small scispaCy model and attach the UMLS entity linker to it.
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# Obtain the linker from the pipeline so its knowledge base can be queried directly
linker = nlp.get_pipe("scispacy_linker")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [3]:
import spacy
from scispacy.linking import EntityLinker

def normalize_side_effects(side_effects_list, nlp, linker,
                           relevant_types=("T047", "T184", "T033"), verbose=True):
    """Map free-text side effects to canonical UMLS names.

    The strings are joined into one sentence (", " separated, terminated by
    "."), run through ``nlp``, and each detected entity is resolved through
    its top-ranked knowledge-base candidate.

    Args:
        side_effects_list: Iterable of side-effect strings.
        nlp: Callable pipeline; its result must expose ``.ents``, each with
            ``._.kb_ents`` (a sequence whose items start with a CUI).
        linker: Object exposing ``kb.cui_to_entity``, a mapping from CUI to an
            entity with ``types`` and ``canonical_name``.
        relevant_types: Semantic type codes to keep. The default keeps T047
            (Disease), T184 (Sign or Symptom) and T033 (Finding).
        verbose: When True (default), print the semantic types of every
            linked entity, as the function has always done.

    Returns:
        Canonical names, in document order, of the entities whose top
        candidate has at least one type in ``relevant_types``. Duplicates are
        not removed. Entities without any candidate are skipped.
    """
    side_effects_text = ", ".join(side_effects_list) + "."
    doc = nlp(side_effects_text)

    wanted_types = set(relevant_types)
    normalized_effects = []

    for ent in doc.ents:
        kb_ents = ent._.kb_ents
        if not kb_ents:
            continue  # nothing was linked for this entity

        cui = kb_ents[0][0]  # only the first candidate is considered
        umls_entity = linker.kb.cui_to_entity[cui]
        semantic_types = umls_entity.types
        if verbose:
            print(semantic_types)

        # Keep the entity when it shares at least one type with the filter.
        if not wanted_types.isdisjoint(semantic_types):
            normalized_effects.append(umls_entity.canonical_name)

    return normalized_effects


# Demo: normalize a handful of side-effect terms and show the result
side_effects = ["sweating", "nausea", "vomiting", "chemotherapy induced", "agitation"]
normalized_side_effects = normalize_side_effects(
    side_effects_list=side_effects,
    nlp=nlp,
    linker=linker,
)
print(normalized_side_effects)



['T033']
['T184']
['T184']
['T061']
['T169']
['T184']
['Sweating', 'Nausea', 'Vomiting', 'Agitation']


In [12]:
text = "Lidocaine topical jelly or ointment is used on different parts of the body to cause numbness or loss of feeling for patients having certain medical procedures. It is also used to relieve pain and itching caused by conditions such as sunburn or other minor burns, insect bites or stings, poison ivy, poison oak, poison sumac, minor cuts, or scratches. Xylocaine® jelly is used to treat painful urethritis (inflammation of the urethra). It is also used to prevent and control pain in procedures involving the male and female urethra. Xylocaine® jelly is also used to lubricate the nose, mouth, and throat for intubation."
# Run the pipeline over the text
doc = nlp(text)
# Canonical names of the entities that pass the type filter (may repeat)
conditions_and_diseases = []
# Semantic type codes that count as a condition or disease here
relevant_types = {"T047", "T184", "T033", "T037", "T046", "T195"}


# Resolve every entity through its first UMLS candidate
for ent in doc.ents:
    kb_ents = ent._.kb_ents
    if not kb_ents:
        continue  # entity has no linked candidate
    cui, score = kb_ents[0][0], kb_ents[0][1]  # CUI and similarity score
    umls_entity = linker.kb.cui_to_entity[cui]
    semantic_types = umls_entity.types
    print(f"{umls_entity.canonical_name}: {semantic_types}")
    # Keep the entity when any of its types is in relevant_types
    if not relevant_types.isdisjoint(umls_entity.types):
        conditions_and_diseases.append(umls_entity.canonical_name)

# De-duplicate via a set (order is not preserved) and show the result
unique_conditions_and_diseases = list(set(conditions_and_diseases))
print(unique_conditions_and_diseases)


lidocaine: ['T109', 'T121']
Human body: ['T016']
Hypesthesia: ['T033']
Loss: ['T081']
Patients: ['T101']
Medical procedure: ['T058']
Pain: ['T184']
Pruritus: ['T184']
Condition: ['T080']
Sunburn: ['T037']
Burn injury: ['T037']
Insecta: ['T204']
bite injury: ['T037']
Sting Injury: ['T037']
Poison Sumac: ['T002']
Dermatitis verrucosa: ['T047']
Xylocaine Jelly: ['T109', 'T121']
Treatment intent: ['T169']
Pain: ['T184']
Inflammation: ['T046']
Urethra: ['T023']
PREVENT (product): ['T121']
Pain management: ['T061']
Methods aspects: ['T169']
Males: ['T032']
Female urethral structure: ['T023']
Xylocaine Jelly: ['T109', 'T121']
Nose: ['T023']
Oral cavity: ['T030']
Pharyngeal structure: ['T023']
Intubation (procedure): ['T061']
['Dermatitis verrucosa', 'Hypesthesia', 'Burn injury', 'Inflammation', 'Sunburn', 'Pain', 'bite injury', 'Sting Injury', 'Pruritus']
