In [1]:
import pandas as pd
import os


# Columns of every split that take part in the analysis.
_KEY_COLUMNS = ["smiles", "eligibility_criteria", "group_description"]


def _load_split(path, name):
    """Read ``<name>.csv`` from *path* and keep only the analysed columns.

    Missing cells are replaced by an empty string so that every key is a
    plain string; otherwise a single missing value would make the key
    comparisons below unreliable (NaN never compares equal to itself).
    """
    frame = pd.read_csv(os.path.join(path, f"{name}.csv"))
    return frame[list(_KEY_COLUMNS)].fillna("")


def _unique_keys(frame, columns):
    """Return the set of distinct values of *columns* in *frame*.

    A single column yields a set of its raw values.  Several columns yield a
    set of tuples, one per row.  Tuples are used instead of concatenated
    strings because concatenation cannot tell ("ab", "c") from ("a", "bc").
    """
    if len(columns) == 1:
        return set(frame[columns[0]])
    return set(zip(*(frame[column] for column in columns)))


def _percent(part, whole):
    """Return ``part / whole * 100``, or NaN when *whole* is zero."""
    if not whole:
        # An empty split has no meaningful percentage; report NaN rather
        # than aborting the whole summary with ZeroDivisionError.
        return float("nan")
    return part / whole * 100


def _report_inclusion(splits, column, noun):
    """Print unique counts of *column* per split and their inclusion in train."""
    keys = {name: _unique_keys(frame, [column]) for name, frame in splits.items()}
    print()
    for name in ("train", "val", "test"):
        print(f"There is {len(keys[name])} unique {noun} in the {name} set.")
    print()
    for long_name, name in (("validation", "val"), ("test", "test")):
        shared = keys["train"].intersection(keys[name])
        print(
            f"Among the {len(keys[name])} unique {noun} in the {long_name} set, {_percent(len(shared), len(keys[name])):.2f}% are included in the train set."
        )


def _report_overlap(splits, columns, count_label, overlap_label):
    """Print unique counts of *columns* per split and their overlap with train.

    Returns the keys shared by the train and test splits.
    """
    keys = {name: _unique_keys(frame, columns) for name, frame in splits.items()}
    print()
    for name in ("train", "val", "test"):
        print(f"There is {len(keys[name])} unique {count_label} in the {name} set.")
    print()
    shared = set()
    for long_name, name in (("validation", "val"), ("test", "test")):
        shared = keys["train"].intersection(keys[name])
        print(
            f"Among {overlap_label}, {_percent(len(shared), len(keys[name])):.2f}% overlap between the train and {long_name} sets."
        )
    # The loop ends on the test split, so this is the train/test overlap.
    return shared


def EDA(PATH):
    """Print an overlap summary of the train/val/test splits found in *PATH*.

    ``train.csv``, ``val.csv`` and ``test.csv`` are read from *PATH*; each
    must provide the columns ``smiles``, ``eligibility_criteria`` and
    ``group_description``.  For each of these columns, and for several
    combinations of them, the summary prints the number of distinct values
    per split and the percentage of the val/test values that also occur in
    the train split.

    Args:
        PATH: Directory containing the three CSV files.

    Returns:
        None.  The summary is written to standard output.

    Raises:
        FileNotFoundError: If one of the CSV files does not exist.
        KeyError: If a CSV file lacks one of the required columns.
    """
    splits = {name: _load_split(PATH, name) for name in ("train", "val", "test")}

    print("THIS SUMMARY IGNORES PLACEBO INSTANCE.")
    print()
    for name, frame in splits.items():
        print(f"The {name} set has {len(frame)} instances.")
    print("-" * 100)
    print()

    # Share of rows whose (smiles, criteria, group) triple is distinct.
    for long_name, name in (("train", "train"), ("validation", "val"), ("test", "test")):
        frame = splits[name]
        distinct = len(_unique_keys(frame, _KEY_COLUMNS))
        print(
            f"Among {long_name} set instances, {_percent(distinct, len(frame)):.2f}% are unique."
        )

    # Single columns: how many val/test values are already present in train.
    _report_inclusion(splits, "smiles", "drugs")
    _report_inclusion(splits, "eligibility_criteria", "criteria")

    # Group descriptions, followed by the actual train/test overlap.
    overlapping_group_desc_test = _report_overlap(
        splits, ["group_description"], "group descriptions", "group descriptions"
    )
    print("\nOverlapping Group Descriptions between Train and Test Sets:")
    print(overlapping_group_desc_test)

    # Column combinations.  The first entry deliberately keeps the two
    # different labels ("smiles" vs. "drugs") that the summary has always
    # printed for the same combination.
    combinations = (
        (
            ["group_description", "smiles"],
            "group_description + smiles combinations",
            "group_description + drugs combinations",
        ),
        (
            ["group_description", "eligibility_criteria"],
            "group_description + eligibility_criteria combinations",
            "group_description + eligibility_criteria combinations",
        ),
        (
            ["smiles", "eligibility_criteria"],
            "drugs + eligibility_criteria combinations",
            "drugs + eligibility_criteria combinations",
        ),
        (
            ["smiles", "eligibility_criteria", "group_description"],
            "drugs + eligibility_criteria + group_description combinations",
            "drugs + eligibility_criteria + group_description combinations",
        ),
    )
    for columns, count_label, overlap_label in combinations:
        _report_overlap(splits, columns, count_label, overlap_label)

In [2]:
# Print the EDA summary for the splits stored under the train_base directory.
path = '../data/classification/smiles/train_base'
EDA(path)

THIS SUMMARY IGNORES PLACEBO INSTANCE.

The train set has 5128 instances.
The val set has 603 instances.
The test set has 750 instances.
----------------------------------------------------------------------------------------------------

Among train set instances, 100.00% are unique.
Among validation set instances, 100.00% are unique.
Among test set instances, 100.00% are unique.

There is 900 unique drugs in the train set.
There is 113 unique drugs in the val set.
There is 113 unique drugs in the test set.

Among the 113 unique drugs in the validation set, 0.00% are included in the train set.
Among the 113 unique drugs in the test set, 0.00% are included in the train set.

There is 3712 unique criteria in the train set.
There is 481 unique criteria in the val set.
There is 506 unique criteria in the test set.

Among the 481 unique criteria in the validation set, 13.93% are included in the train set.
Among the 506 unique criteria in the test set, 12.65% are included in the train set.


In [3]:
# Print the EDA summary for the splits stored under the train_augmented directory.
# The train set reports 2 more unique drugs than the train_base run (902 vs. 900):
# the [PLACEBO] and [NOMSILES] entries.
# NOTE(review): "[NOMSILES]" is kept as originally written; it is presumably the
# "[NOSMILES]" placeholder -- confirm against the data.
path = '../data/classification/smiles/train_augmented'
EDA(path)

THIS SUMMARY IGNORES PLACEBO INSTANCE.

The train set has 11081 instances.
The val set has 603 instances.
The test set has 750 instances.
----------------------------------------------------------------------------------------------------

Among train set instances, 100.00% are unique.
Among validation set instances, 100.00% are unique.
Among test set instances, 100.00% are unique.

There is 902 unique drugs in the train set.
There is 113 unique drugs in the val set.
There is 113 unique drugs in the test set.

Among the 113 unique drugs in the validation set, 0.00% are included in the train set.
Among the 113 unique drugs in the test set, 0.00% are included in the train set.

There is 7537 unique criteria in the train set.
There is 481 unique criteria in the val set.
There is 506 unique criteria in the test set.

Among the 481 unique criteria in the validation set, 41.37% are included in the train set.
Among the 506 unique criteria in the test set, 41.70% are included in the train set.