In [22]:
import json

# Load the drug records from 'output.json' into `drugs_data`.
# NOTE(review): the parsing below assumes one JSON object per line, optionally
# wrapped in lines holding only '[' and ']' and optionally ending in a comma —
# confirm against whatever writes the file.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding.
with open('output.json', 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace and drop blank lines up front: a blank line
    # would otherwise become an empty element (",,") in the rebuilt array, and
    # a blank line after the closing ']' would hide it from the check below.
    lines = [line.strip() for line in file if line.strip()]

# Remove the first and last lines if they only contain the '[' and ']'.
# The `lines and ...` guards keep an empty file from raising IndexError.
if lines and lines[0] == '[':
    lines = lines[1:]
if lines and lines[-1] == ']':
    lines = lines[:-1]

# Now join the lines back together and try to parse the JSON
try:
    # Drop each line's trailing comma (and any whitespace in front of it),
    # then re-join with exactly one comma between records.
    json_str = '[' + ','.join(line.rstrip(',').rstrip() for line in lines) + ']'
    drugs_data = json.loads(json_str)
except json.JSONDecodeError as e:
    print("JSON decode error:", e)
    # Re-raise: without this, `drugs_data` stays undefined (or stale from an
    # earlier run) and later cells fail in a much more confusing way.
    raise


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cluster drugs by the text of their names alone.
# Reads `drugs_data` (list of dicts with a 'name' key) and produces
# `clustered_drugs`: {cluster id -> list of drug names in that cluster}.

# Collect the name of every drug record
drug_names = [drug['name'] for drug in drugs_data]

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the drug names
X = vectorizer.fit_transform(drug_names)

# Cluster the drugs using K-Means.
# K-Means starts from randomly initialised centroids, so without a fixed seed
# the cluster ids (and possibly the grouping) change on every run; pinning
# random_state keeps the output reproducible across kernel restarts.
n_clusters = 400  # Example number of clusters, this may need tuning
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Assign each drug to a cluster. Every cluster id gets an entry up front so
# that clusters which end up empty are still present in the mapping.
clustered_drugs = {i: [] for i in range(n_clusters)}
for label, drug_name in zip(kmeans.labels_, drug_names):
    clustered_drugs[label].append(drug_name)

clustered_drugs


{0: ['Apadaz'],
 1: ['Akynzeo for (injection)',
  'Akynzeo (Oral)',
  'Akynzeo (Intravenous)',
  'Akynzeo'],
 2: ['Aleve-D Sinus and Cold',
  'Aleve-D Cold and Sinus',
  'Aleve Sinus and Headache',
  'Aleve Cold and Sinus',
  'Advil Cold and Sinus Liqui-Gel',
  'Advil Cold & Sinus'],
 3: ['Altachlore (ophthalmic)',
  'Altacaine (Ophthalmic)',
  'Alamast (Ophthalmic)',
  'AKTob (Ophthalmic)',
  'Besifloxacin ophthalmic',
  'Besifloxacin (Ophthalmic)'],
 4: ['Bempedoic acid and ezetimibe', 'Bempedoic acid'],
 5: ['Betamethacot (Topical application)'],
 6: ['Baxdela (Oral)', 'Baxdela (Intravenous)', 'Baxdela'],
 7: ['Allegra-D',
  'Allegra-D (Oral)',
  'Allegra OTC',
  'Allegra ODT (Oral)',
  'Allegra Allergy',
  'Allegra'],
 8: ['Aluminum chloride hexahydrate (topical)',
  'Adrenalin Chloride (Injection)',
  'Adrenalin (Injection)',
  'Adrenalin (Inhalation)',
  'Adrenalin',
  'Benzethonium chloride topical'],
 9: ['Aquamephyton (Injection)',
  'Antiflex (Injection)',
  'Ancef (Injection

In [17]:
# Display the cluster id -> drug names mapping held in `clustered_drugs`.
# NOTE(review): this cell's execution count (17) is lower than that of the cell
# that builds `clustered_drugs` (24), so the output shown below presumably
# comes from an earlier run — re-run the notebook top to bottom to confirm.
clustered_drugs

{0: ['Albuterol and budesonide (inhalation)',
  'Albuterol and budesonide (Inhalation)'],
 1: ['Amcinonide topical', 'Amcinonide (Topical application)'],
 2: ['Anjeso (Intravenous)'],
 3: ['Acetaminophen, magnesium salicylate, and pamabrom',
  'Acetaminophen, caffeine, and magnesium salicylate'],
 4: ['Aleve Arthritis (Oral)'],
 5: ['Acetaminophen (rectal)', 'Acetaminophen (Oral, Rectal)'],
 6: ['Anzemet (Oral)', 'Anzemet (Intravenous)', 'Anzemet'],
 7: ['All Day Allergy-D'],
 8: ['Acid Gone Extra Strength', 'Acid Gone'],
 9: ['Afrin NoDrip Extra Moisturizing',
  'Afrin Extra Moisturizing',
  'Afrin 4 Hour Extra Moisturizing'],
 10: ['Afluria Quadrivalent 2021-2022 (injection)',
  'Afluria PF Quadrivalent 2021-2022 (injection)',
  'Afluria PF Pediatric Quadrivalent 2021-2022 (injection)'],
 11: ['Bepotastine ophthalmic', 'Bepotastine besilate (Ophthalmic)'],
 12: ['Apomorphine (Subcutaneous)', 'Apomorphine'],
 13: ['Betamethasone and clotrimazole topical',
  'Betamethasone and clotrima

In [20]:
# Small hand-written sample of drug records. Each record uses the keys
# 'name', 'drug_classes' (list of str) and 'uses' (list of str).
drug_list = [
    {
        "name": "Benadryl Children's Dye Free",
        "drug_classes": [
            "Anticholinergic antiemetics",
            "Anticholinergic antiparkinson agents",
            "Antihistamines",
            "Miscellaneous anxiolytics, sedatives and hypnotics",
        ],
        "uses": [
            "drowsiness",
            "hangover",
            "enlarged prostate",
            "agitation",
            "seizures",
            "cough",
            "allergic reactions",
            "allergies",
        ],
    },
    {
        "name": "Benadryl Children's Allergy Fastmelt",
        "drug_classes": [
            "Anticholinergic antiemetics",
            "Anticholinergic antiparkinson agents",
            "Antihistamines",
            "Miscellaneous anxiolytics, sedatives and hypnotics",
        ],
        "uses": [
            "drowsiness",
            "hangover",
            "enlarged prostate",
            "agitation",
            "seizures",
            "allergic reactions",
            "cold symptoms",
            "cough",
        ],
    },
    {
        "name": "Benadryl Children's Allergy",
        "drug_classes": [
            "Anticholinergic antiemetics",
            "Anticholinergic antiparkinson agents",
            "Antihistamines",
            "Miscellaneous anxiolytics, sedatives and hypnotics",
        ],
        "uses": [
            "drowsiness",
            "hangover",
            "enlarged prostate",
            "agitation",
            "seizures",
            "allergic reactions",
            "cold symptoms",
            "cough",
        ],
    },
    {
        "name": "Benadryl Allergy Sinus Headache",
        "drug_classes": [
            "Upper respiratory combinations",
        ],
        "uses": [
            "nasal congestion",
            "liver disease",
            "heart disease",
            "pheochromocytoma",
            "enlarged prostate",
            "sinus symptoms",
        ],
    },
]

In [21]:
# Pull the 'name' field out of every record in `drug_list`, keeping order.
names = list(map(lambda record: record['name'], drug_list))
names

["Benadryl Children's Dye Free",
 "Benadryl Children's Allergy Fastmelt",
 "Benadryl Children's Allergy",
 'Benadryl Allergy Sinus Headache']

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cluster drugs on their combined name / drug-class / use text.
# Reads `drugs_data` (a list of dictionaries with 'name', 'drug_classes' and
# 'uses' keys) and produces `drugs_data_clustered`: one dict per drug with an
# extra 'cluster' key.

# Step 1: Feature Extraction
# Combine the text of drug names, classes, and uses
combined_features = []
for drug in drugs_data:
    combined_text = ' '.join([drug['name'], ' '.join(drug['drug_classes']), ' '.join(drug['uses'])])
    combined_features.append(combined_text)

# Step 2: Text Vectorization
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the combined features
X = vectorizer.fit_transform(combined_features)

# Step 3: Clustering
# Choose the number of clusters (you might need to tune this)
n_clusters = 500
# K-Means starts from randomly initialised centroids; a fixed random_state
# keeps cluster ids stable between runs so the outputs stay reproducible.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Assign each drug to a cluster.
# `kmeans.labels_` holds NumPy integers, which `json.dump` cannot serialise;
# cast to a plain int so the optional save below works.
# Note: a 'cluster' key already present in a drug record would override the
# label, because `**drug` is unpacked after it.
drugs_data_clustered = [
    {'cluster': int(cluster_label), **drug}
    for cluster_label, drug in zip(kmeans.labels_, drugs_data)
]

# Step 4: Post-Processing
# Inspect the clusters and refine if necessary
# ...

drugs_data_clustered
# Optionally save the clustered data to a JSON file
#with open('clustered_drugs.json', 'w') as file:
    #json.dump(drugs_data_clustered, file, indent=4)


[{'cluster': 212,
  'name': 'Azulfidine',
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['sore throat',
   'porphyria',
   'drowsiness',
   'inflammatory bowel disease',
   'juvenile rheumatoid arthritis',
   'lymphocytic colitis']},
 {'cluster': 89,
  'name': 'Abstral',
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['breakthrough pain', 'chronic pain']},
 {'cluster': 411,
  'name': 'Aczone',
  'drug_classes': ['Topical acne agents'],
  'uses': []},
 {'cluster': 287, 'name': 'Adzynma', 'drug_classes': [], 'uses': []},
 {'cluster': 50,
  'name': 'Adzenys XR-ODT',
  'drug_classes': ['CNS stimulants'],
  'uses': ['heart disease', "tourette's syndrome", 'adhd']},
 {'cluster': 478,
  'name': 'AfterPill',
  'drug_classes': ['Contraceptives'],
  'uses': ['birth control', 'seizures']},
 {'cluster': 30,
  'name': 'Aftate (Topical)',
  'drug_classes': ['Topical antifungals'],
  'uses': ['tinea cruris', 'tinea pedis']},
 {'cluster': 14,
  'name': 'Afstyla 

In [31]:
from collections import defaultdict

# Summarise the clustering: for every cluster label, gather the names of the
# drugs assigned to it together with a running count, then report the clusters
# from largest to smallest.
# Expects every item of `drugs_data_clustered` to carry 'cluster' and 'name' keys.

# Labels seen for the first time start with a zero count and no drugs
cluster_info = defaultdict(lambda: {'count': 0, 'drugs': []})

for drug in drugs_data_clustered:
    cluster_label = drug['cluster']
    entry = cluster_info[cluster_label]
    entry['drugs'].append(drug['name'])
    entry['count'] += 1

# Flatten the mapping into one record per cluster
clusters_summary = [
    {'cluster': key, 'count': value['count'], 'drugs': value['drugs']}
    for key, value in cluster_info.items()
]

# Largest clusters first; the sort is stable, so ties keep insertion order
clusters_summary.sort(key=lambda item: item['count'], reverse=True)

# Plain-text report: three lines per cluster followed by blank lines
for cluster in clusters_summary:
    print(
        f"Cluster {cluster['cluster']}:",
        f"Count: {cluster['count']}",
        f"Drugs: {', '.join(cluster['drugs'])}",
        "\n",  # Newline for better readability
        sep="\n",
    )

# Show the structured summary as the cell's output
clusters_summary

Cluster 125:
Count: 13
Drugs: Afrin Severe Congestion NoDrip, Afrin Severe Congestion, Afrin Pump Mist, Afrin NoDrip Extra Moisturizing, Afrin No Drip Sinus, Afrin No Drip Severe Congestion, Afrin No Drip, Afrin Extra Moisturizing, Afrin Allergy Sinus, Afrin All Night NoDrip, Afrin 4 Hour Extra Moisturizing, Afrin, Benzedrex


Cluster 22:
Count: 12
Drugs: Allermax, Allergy Relief (Diphenhydramine HCl), Allergy (Diphenhydramine HCl), Benadryl Ultratab, Benadryl Fastmelt, Benadryl Dye Free Allergy, Benadryl DF, Benadryl Children's Dye Free, Benadryl Children's Allergy Fastmelt, Benadryl Children's Allergy, Benadryl Allergy, Banophen


Cluster 483:
Count: 10
Drugs: Akne-Mycin (Topical), Acne-Clear, Acne Wash (Topical), Acne Foaming Cream Face Cleanser, Acne 10 Gel (Topical), Acne (Topical), Acanya, BenzEFoam Ultra (Topical), Benzashave (Topical), Benzagel (Topical)


Cluster 26:
Count: 9
Drugs: Allergy Relief (Fexofenadine HCl), Aller-Ease, Allegra OTC, Allegra ODT (Oral), Allegra Allergy

[{'cluster': 125,
  'count': 13,
  'drugs': ['Afrin Severe Congestion NoDrip',
   'Afrin Severe Congestion',
   'Afrin Pump Mist',
   'Afrin NoDrip Extra Moisturizing',
   'Afrin No Drip Sinus',
   'Afrin No Drip Severe Congestion',
   'Afrin No Drip',
   'Afrin Extra Moisturizing',
   'Afrin Allergy Sinus',
   'Afrin All Night NoDrip',
   'Afrin 4 Hour Extra Moisturizing',
   'Afrin',
   'Benzedrex']},
 {'cluster': 22,
  'count': 12,
  'drugs': ['Allermax',
   'Allergy Relief (Diphenhydramine HCl)',
   'Allergy (Diphenhydramine HCl)',
   'Benadryl Ultratab',
   'Benadryl Fastmelt',
   'Benadryl Dye Free Allergy',
   'Benadryl DF',
   "Benadryl Children's Dye Free",
   "Benadryl Children's Allergy Fastmelt",
   "Benadryl Children's Allergy",
   'Benadryl Allergy',
   'Banophen']},
 {'cluster': 483,
  'count': 10,
  'drugs': ['Akne-Mycin (Topical)',
   'Acne-Clear',
   'Acne Wash (Topical)',
   'Acne Foaming Cream Face Cleanser',
   'Acne 10 Gel (Topical)',
   'Acne (Topical)',
   'Acan