In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, col

# Initialize Spark Session (This line might be optional if your notebook already has a 'spark' variable)
spark = SparkSession.builder.appName("DrugDataProcessing").getOrCreate()

StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 16, Finished, Available, Finished)

In [15]:
from pyspark.sql.functions import collect_list, col

# --- 1. Load DataFrames ---

# Load the ATC data table
drug_atc_df = spark.table("SilverLakeHouse.dim_drug_atc")

# Load the Drug Name data table
drug_name_df = spark.table("SilverLakeHouse.dim_drugs")


# --- 2. Aggregate ATC data ---

# Group by kegg_id and collect all associated ATC codes into a list
atc_aggregated_df = drug_atc_df.groupBy("kegg_id").agg(
    collect_list("atc").alias("atc_name")
)
print("--- ATC Aggregation Complete ---")


# --- 3. Aggregate Drug Name data ---

# Group by drug_id (which we treat as kegg_id) and collect all associated drug names
# We use 'name_list' as a temporary column name
drug_name_aggregated_df = drug_name_df.groupBy("drug_id").agg(
    collect_list("drug_name").alias("drug_name")
)

# Rename 'drug_id' to 'kegg_id' to prepare for the join
drug_name_aggregated_df = drug_name_aggregated_df.withColumnRenamed("drug_id", "kegg_id")
print("--- Drug Name Aggregation Complete ---")


# --- 4. Join the Aggregated DataFrames ---

# Join the two aggregated DataFrames on the common key 'kegg_id'.
# Using a full outer join ensures we keep all records from both sides,
# including drugs that only have names or only have ATC data.
merged_df = atc_aggregated_df.join(
    drug_name_aggregated_df,
    on="kegg_id",
    how="full" # CHANGED: Switched from 'inner' to 'full' for complete record inclusion
)




# --- 6. Convert to Pandas and Save to JSON ---

# Define the final output path for the merged file
MERGED_PANDAS_PATH = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/drugs.json"

# Convert the final Spark DataFrame to a Pandas DataFrame
final_pandas_df = merged_df.toPandas()

print("\n--- Final Merged Pandas DataFrame Head ---")
print(final_pandas_df.head())

# Write the JSON file in the requested format (records, lines=True)
final_pandas_df.to_json(MERGED_PANDAS_PATH, orient='records', lines=True)

print(f"\n--- Successfully saved merged JSON to: {MERGED_PANDAS_PATH} ---")


StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 17, Finished, Available, Finished)

--- ATC Aggregation Complete ---
--- Drug Name Aggregation Complete ---

--- Final Merged Pandas DataFrame Head ---
  kegg_id                              atc_name  \
0  D00002                             [A11HA01]   
1  D00003                             [V03AN01]   
2  D00004                             [V03AN02]   
3  D00006                             [A11HA06]   
4  D00008  [S02AA06, A01AB02, D11AX25, D08AX01]   

                                           drug_name  
0        [Nadide, Nicotinamide adenine dinucleotide]  
1                                           [Oxygen]  
2                                   [Carbon dioxide]  
3  [Pyridoxal phosphate monohydrate, Hipyridoxin,...  
4               [Hydrogen peroxide, Oxyfull, Oxydol]  

--- Successfully saved merged JSON to: abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/drugs.json ---


In [16]:
drug_df = spark.table("SilverLakeHouse.dim_targets")
aggregated_df = drug_df.groupBy("ko_number").agg(
    collect_list("name").alias("target_name")
).orderBy("ko_number")

print("\n--- Aggregated DataFrame ---")

pandas_df = aggregated_df.toPandas() 


pandas_df.head()
# Use Pandas to write the JSON with the exact name

PANDAS_PATH = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/targets.json"

pandas_df.to_json(PANDAS_PATH, orient='records', lines=True)

StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 18, Finished, Available, Finished)


--- Aggregated DataFrame ---


In [17]:
drug_df = spark.table("SilverLakeHouse.dim_diseases")
aggregated_df = drug_df.groupBy("disease_id").agg(
    collect_list("name").alias("disease_name")
).orderBy("disease_id")

print("\n--- Aggregated DataFrame ---")

pandas_df = aggregated_df.toPandas() 


pandas_df.head()
# Use Pandas to write the JSON with the exact name

PANDAS_PATH = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/diseases.json"

pandas_df.to_json(PANDAS_PATH, orient='records', lines=True)

StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 19, Finished, Available, Finished)


--- Aggregated DataFrame ---


In [18]:
# Read the table into a Spark DataFrame
disease_df = spark.table("SilverLakeHouse.fact_drug_disease")

# Select the required columns (assuming column names are drug_kegg_id and disease_id)
# Note: You can rename them here if needed, but we will use the existing column names
output_df = disease_df.select("drug_kegg_id", "disease_id")

# Define the output path for the CSV directory
CSV_PATH = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/drug_disease.csv"

# (Optional: Use .coalesce(1) before .write.csv if you MUST have a single output file)
output_df.toPandas().to_csv(CSV_PATH, index=False)

StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 20, Finished, Available, Finished)

In [19]:
# Read the table into a Spark DataFrame
disease_df = spark.table("SilverLakeHouse.fact_drug_target")

# Select the required columns (assuming column names are drug_kegg_id and disease_id)
# Note: You can rename them here if needed, but we will use the existing column names
output_df = disease_df.select("drug_kegg_id", "target_ko_number")

# Define the output path for the CSV directory
CSV_PATH = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/GoldenLakeHouse.Lakehouse/Files/drug_target.csv"

# (Optional: Use .coalesce(1) before .write.csv if you MUST have a single output file)
output_df.toPandas().to_csv(CSV_PATH, index=False)

StatementMeta(, f6e57c01-b84b-45f9-95d6-8193b4eca671, 21, Finished, Available, Finished)