In [6]:
import json

# Load the drug records from 'output.json' into `drugs_data`.
#
# NOTE(review): the repair below assumes the file holds one JSON value per
# line, each optionally followed by a comma, and optionally wrapped in lines
# that contain only '[' and ']' -- confirm against whatever wrote the file.
with open('output.json', 'r') as file:
    # Strip surrounding whitespace (this also removes '\r' from CRLF files)
    # and drop blank lines so they cannot become empty array elements below.
    lines = [stripped for stripped in (raw.strip() for raw in file) if stripped]

# Remove the first and last lines if they only contain the '[' and ']'.
# The truthiness checks keep an empty file from raising IndexError.
if lines and lines[0] == '[':
    lines = lines[1:]
if lines and lines[-1] == ']':
    lines = lines[:-1]

# Now join the lines back together and try to parse the JSON
try:
    # Remove each line's trailing commas, skip anything that is empty after
    # that, and re-wrap the remaining pieces as a single JSON array.
    records = [line.rstrip(',').rstrip() for line in lines]
    json_str = '[' + ','.join(record for record in records if record) + ']'
    drugs_data = json.loads(json_str)
except json.JSONDecodeError as e:
    print("JSON decode error:", e)
    # Re-raise: continuing would leave `drugs_data` undefined (or stale from
    # an earlier run of this cell), and later cells would fail or silently
    # cluster outdated data.
    raise


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cluster the drugs by the tokens in their names (TF-IDF + K-Means).

# Names of all loaded drug records, in the same order as `drugs_data`.
drug_names = [drug['name'] for drug in drugs_data]

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the drug names
X = vectorizer.fit_transform(drug_names)

# Cluster the drugs using K-Means.
# 400 is an example number of clusters and may need tuning. K-Means rejects a
# cluster count larger than the number of samples, so cap it for small inputs.
n_clusters = min(400, len(drug_names))
# A fixed random_state makes the cluster assignment reproducible across runs;
# without it every execution of this cell can yield different clusters.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Assign each drug to a cluster
clustered_drugs = {i: [] for i in range(n_clusters)}
for name, label in zip(drug_names, kmeans.labels_):
    # labels_ holds numpy integers; cast so lookups use the plain-int keys.
    clustered_drugs[int(label)].append(name)

clustered_drugs




{0: ['Aloe vera topical', 'Aloe Grande (topical)', 'Aloe'],
 1: ['Adzenys XR-ODT',
  'Aptensio XR',
  'Adhansia XR',
  'Belviq XR (Oral)',
  'Belviq'],
 2: ['Allegra-D 24 Hour Allergy and Congestion',
  'Allegra-D 24 Hour',
  'Allegra-D 12 Hour Allergy and Congestion',
  'Allegra 24 Hour Allergy'],
 3: ['Anectine (Intramuscular, Intravenous)',
  'BayRho-D (Injection, Intramuscular, Intravenous)',
  'Bayhep B (Intramuscular, Intravenous)',
  'Baygam (Intramuscular, Intravenous, Subcutaneous)'],
 4: ['Aloe polysaccharides, hydrocortisone, and iodoquinol (topical)',
  'Acyclovir and hydrocortisone topical',
  'Acyclovir and hydrocortisone (Topical application)'],
 5: ['Altachlore (ophthalmic)',
  'Altacaine (Ophthalmic)',
  'Alamast (Ophthalmic)',
  'AKTob (Ophthalmic)',
  'Besifloxacin ophthalmic',
  'Besifloxacin (Ophthalmic)'],
 6: ['Alka-Seltzer Morning Relief',
  'Alka-Seltzer Hangover Relief',
  'Alka-Seltzer Extra Strength Heartburn Relief',
  'Alka-Seltzer Cool Action Heartburn Re

In [17]:
# Display the current `clustered_drugs` mapping (cluster label -> drug names).
# NOTE(review): this cell's execution count (17) is out of sequence with the
# cells around it, so the output shown reflects whatever `clustered_drugs`
# held in the kernel at that moment -- re-run the notebook top to bottom
# before relying on it.
clustered_drugs

{0: ['Albuterol and budesonide (inhalation)',
  'Albuterol and budesonide (Inhalation)'],
 1: ['Amcinonide topical', 'Amcinonide (Topical application)'],
 2: ['Anjeso (Intravenous)'],
 3: ['Acetaminophen, magnesium salicylate, and pamabrom',
  'Acetaminophen, caffeine, and magnesium salicylate'],
 4: ['Aleve Arthritis (Oral)'],
 5: ['Acetaminophen (rectal)', 'Acetaminophen (Oral, Rectal)'],
 6: ['Anzemet (Oral)', 'Anzemet (Intravenous)', 'Anzemet'],
 7: ['All Day Allergy-D'],
 8: ['Acid Gone Extra Strength', 'Acid Gone'],
 9: ['Afrin NoDrip Extra Moisturizing',
  'Afrin Extra Moisturizing',
  'Afrin 4 Hour Extra Moisturizing'],
 10: ['Afluria Quadrivalent 2021-2022 (injection)',
  'Afluria PF Quadrivalent 2021-2022 (injection)',
  'Afluria PF Pediatric Quadrivalent 2021-2022 (injection)'],
 11: ['Bepotastine ophthalmic', 'Bepotastine besilate (Ophthalmic)'],
 12: ['Apomorphine (Subcutaneous)', 'Apomorphine'],
 13: ['Betamethasone and clotrimazole topical',
  'Betamethasone and clotrima

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cluster the drugs on name + drug classes + uses, then save the labelled
# records to 'clustered_drugs.json'.
# Requires `drugs_data` (a list of dictionaries) to be loaded already.

# Step 1: Feature Extraction
# Combine the text of drug names, classes, and uses into one string per drug.
# NOTE(review): assumes 'drug_classes' and 'uses' are lists of strings --
# confirm against the data. A missing key or a None value is treated as an
# empty list instead of raising KeyError/TypeError.
combined_features = []
for drug in drugs_data:
    combined_text = ' '.join([
        drug['name'],
        ' '.join(drug.get('drug_classes') or []),
        ' '.join(drug.get('uses') or []),
    ])
    combined_features.append(combined_text)

# Step 2: Text Vectorization
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the combined features
X = vectorizer.fit_transform(combined_features)

# Step 3: Clustering
# Choose the number of clusters (you might need to tune this). K-Means rejects
# a cluster count larger than the number of samples, so cap it for small inputs.
n_clusters = min(800, len(combined_features))
# A fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Attach each record's cluster label. The numpy integer is converted to a
# Python int because json.dump cannot serialize numpy integer types.
drugs_data_clustered = [
    {'cluster': int(cluster_label), **drug}
    for cluster_label, drug in zip(kmeans.labels_, drugs_data)
]
# Sort the list by the 'cluster' key
sorted_drugs_data_clustered = sorted(drugs_data_clustered, key=lambda x: x['cluster'])

# Save the clustered data to a JSON file
with open('clustered_drugs.json', 'w') as file:
    json.dump(sorted_drugs_data_clustered, file, indent=4)


In [7]:
from collections import defaultdict

# Summarise the clustering: for every cluster label, gather the member drug
# names and how many there are.
# Expects `drugs_data_clustered`, a list of dicts with 'cluster' and 'name' keys.

# Per-cluster accumulator; a label seen for the first time starts out empty.
cluster_info = defaultdict(lambda: {'count': 0, 'drugs': []})

# Walk the records once, filing each name under its cluster label.
for record in drugs_data_clustered:
    bucket = cluster_info[record['cluster']]
    bucket['drugs'].append(record['name'])
    bucket['count'] += 1

# Flatten the mapping into a list of {'cluster', 'count', 'drugs'} dicts.
clusters_summary = []
for label, bucket in cluster_info.items():
    clusters_summary.append(
        {'cluster': label, 'count': bucket['count'], 'drugs': bucket['drugs']}
    )

# Largest clusters first; clusters of equal size keep their first-seen order.
clusters_summary.sort(key=lambda entry: entry['count'], reverse=True)

# Plain-text report: label, size, and comma-separated member names.
for entry in clusters_summary:
    label = entry['cluster']
    size = entry['count']
    print(f"Cluster {label}:")
    print(f"Count: {size}")
    print(f"Drugs: {', '.join(entry['drugs'])}")
    print("\n")  # blank lines between clusters

# Leave the summary as the cell's last expression so it is also displayed.
clusters_summary

Cluster 8:
Count: 9
Drugs: Adalimumab-fkjp (Subcutaneous), Adalimumab-bwwd (Subcutaneous), Adalimumab-atto (Subcutaneous), Adalimumab-aqvh (Subcutaneous), Adalimumab-afzb (Subcutaneous), Adalimumab-adbm (Subcutaneous), Adalimumab-adaz (Subcutaneous), Adalimumab-aaty (Subcutaneous), Adalimumab-aacf (Subcutaneous)


Cluster 21:
Count: 8
Drugs: Albutein (human), Alburx (human), Albuminex (human), Albuminar-25 (human), Albumin (human), Albuked (human), Albuked 5 (human), Albuked 25 (human)


Cluster 9:
Count: 7
Drugs: Allermax, Benadryl Ultratab, Benadryl Fastmelt, Benadryl DF, Benadryl Children's Allergy Fastmelt, Benadryl Allergy, Banophen


Cluster 75:
Count: 6
Drugs: Acne-Clear, Acne Wash (Topical), Acne 10 Gel (Topical), Acne (Topical), Benzashave (Topical), Benzagel (Topical)


Cluster 22:
Count: 5
Drugs: Aleve-D Sinus and Cold, Aleve-D Cold and Sinus, Aleve Sinus and Headache, Aleve Cold and Sinus, Advil Sinus Congestion & Pain


Cluster 372:
Count: 5
Drugs: Afluria Quadrivalent 202

[{'cluster': 8,
  'count': 9,
  'drugs': ['Adalimumab-fkjp (Subcutaneous)',
   'Adalimumab-bwwd (Subcutaneous)',
   'Adalimumab-atto (Subcutaneous)',
   'Adalimumab-aqvh (Subcutaneous)',
   'Adalimumab-afzb (Subcutaneous)',
   'Adalimumab-adbm (Subcutaneous)',
   'Adalimumab-adaz (Subcutaneous)',
   'Adalimumab-aaty (Subcutaneous)',
   'Adalimumab-aacf (Subcutaneous)']},
 {'cluster': 21,
  'count': 8,
  'drugs': ['Albutein (human)',
   'Alburx (human)',
   'Albuminex (human)',
   'Albuminar-25 (human)',
   'Albumin (human)',
   'Albuked (human)',
   'Albuked 5 (human)',
   'Albuked 25 (human)']},
 {'cluster': 9,
  'count': 7,
  'drugs': ['Allermax',
   'Benadryl Ultratab',
   'Benadryl Fastmelt',
   'Benadryl DF',
   "Benadryl Children's Allergy Fastmelt",
   'Benadryl Allergy',
   'Banophen']},
 {'cluster': 75,
  'count': 6,
  'drugs': ['Acne-Clear',
   'Acne Wash (Topical)',
   'Acne 10 Gel (Topical)',
   'Acne (Topical)',
   'Benzashave (Topical)',
   'Benzagel (Topical)']},
 {'clus