In [1]:
import pandas as pd

# Read the DMP metadata table from disk
df = pd.read_csv("inputs/26DMP.csv")

# Step 1: Whole-dataset statistics
# Every non-missing 'tags' cell is split on commas into a list of
# whitespace-trimmed tag names; rows without tags are left out of this Series.
tag_series_all = df['tags'].dropna().apply(
    lambda cell: [piece.strip() for piece in cell.split(',')]
)
all_tags = {tag for tag_list in tag_series_all for tag in tag_list}
total_unique_tags = len(all_tags)
total_unique_institutes = df['institute'].nunique()

# Case-insensitive tallies of the 'yes' / 'no' values in 'isHumanStudy'
human_flags = df['isHumanStudy'].str.lower()
total_human = human_flags.eq('yes').sum()
total_non_human = human_flags.eq('no').sum()

# Step 2: Create a DMP-to-tags mapping
# Keys are the row labels of df that have non-missing tags; values are tag sets.
dmp_tag_map = {i: set(tags) for i, tags in tag_series_all.items()}

# Step 3: Greedy selection of 10 DMPs maximizing tag and institute diversity
# Each round picks the DMP that adds the most not-yet-covered tags. When a
# candidate adds the same number of new tags as the current best, it replaces
# the best only if its institute has not been used yet.
selected_dmps = []
covered_tags = set()
used_institutes = set()

for _ in range(10):
    best_dmp = None
    best_new_tags = set()
    fallback_dmp = None  # first still-unselected DMP seen in this round

    for dmp_id, tags in dmp_tag_map.items():
        if dmp_id in selected_dmps:
            continue

        if fallback_dmp is None:
            fallback_dmp = dmp_id

        new_tags = tags - covered_tags
        institute = df.loc[dmp_id, 'institute']

        if len(new_tags) > len(best_new_tags) or (
            len(new_tags) == len(best_new_tags) and institute not in used_institutes
        ):
            best_dmp = dmp_id
            best_new_tags = new_tags

    if best_dmp is None:
        # No remaining DMP adds a new tag or a new institute (everything is
        # already covered). Without this fallback the round would select
        # nothing and the result would silently contain fewer than 10 DMPs.
        best_dmp = fallback_dmp

    if best_dmp is None:
        # Every DMP that has tags is already selected; nothing left to add.
        break

    selected_dmps.append(best_dmp)
    covered_tags.update(dmp_tag_map[best_dmp])
    used_institutes.add(df.loc[best_dmp, 'institute'])

# Step 4: Build the table of chosen DMPs and summarize it
report_columns = ['description', 'tags', 'institute', 'isHumanStudy']
selected_df = df.loc[selected_dmps, report_columns].copy()

# Re-split the raw comma-separated tag strings of the chosen rows into a set
selected_tag_set = set()
for tag_string in selected_df['tags']:
    selected_tag_set.update(part.strip() for part in tag_string.split(','))
selected_tag_count = len(selected_tag_set)
selected_institute_count = selected_df['institute'].nunique()

# Case-insensitive tallies of 'yes' / 'no' within the chosen rows
selected_flags = selected_df['isHumanStudy'].str.lower()
human_count = selected_flags.eq('yes').sum()
non_human_count = selected_flags.eq('no').sum()

# Write the chosen rows to disk without the index column
selected_df.to_csv("top_10_dmps_by_tags.csv", index=False)

# The dataset size and the subset size are taken from the data instead of being
# hard-coded, so the report stays truthful if the CSV changes or if the
# selection ends up with fewer rows than requested.
total_dmp_count = len(df)
selected_dmp_count = len(selected_df)

# Print full dataset stats
print(f"=== Full Dataset Stats ({total_dmp_count} DMPs) ===")
print("Total unique tags:", total_unique_tags)
print("Total unique institutes:", total_unique_institutes)
print("Total human studies:", total_human)
print("Total non-human studies:", total_non_human)

# Print selected subset stats
print(f"\n=== Selected {selected_dmp_count} DMPs Stats ===")
print("Selected tag count:", selected_tag_count)
print("Selected institute count:", selected_institute_count)
print("Human studies count:", human_count)
print("Non-human studies count:", non_human_count)

# Print summary paragraph
print("\nSummary Paragraph:")
print(f"""
We analyzed all {total_dmp_count} NIH DMPs available on the website by extracting metadata fields including NIH institute and topic tags.
The full set includes {total_unique_institutes} unique NIH institutes and {total_unique_tags} unique tags, with {total_human} human and {total_non_human} non-human studies.

To identify a representative subset, we wrote a script that uses a greedy selection algorithm to choose 10 DMPs that maximize diversity in both tag coverage and institutional representation.
The final selection spans {selected_institute_count} different NIH institutes and covers {selected_tag_count} unique tags.
Among the selected {selected_dmp_count} DMPs, {human_count} involve human data and {non_human_count} involve non-human data.

The selected DMPs have been saved to 'top_10_dmps_by_tags.csv'.
""")


=== Full Dataset Stats (26 DMPs) ===
Total unique tags: 21
Total unique institutes: 8
Total human studies: 20
Total non-human studies: 6

=== Selected 10 DMPs Stats ===
Selected tag count: 21
Selected institute count: 8
Human studies count: 8
Non-human studies count: 2

Summary Paragraph:

We analyzed all 26 NIH DMPs available on the website by extracting metadata fields including NIH institute and topic tags.
The full set includes 8 unique NIH institutes and 21 unique tags, with 20 human and 6 non-human studies.

To identify a representative subset, we wrote a script that uses a greedy selection algorithm to choose 10 DMPs that maximize diversity in both tag coverage and institutional representation.
The final selection spans 8 different NIH institutes and covers 21 unique tags.
Among the selected 10 DMPs, 8 involve human data and 2 involve non-human data.

The selected DMPs have been saved to 'top_10_dmps_by_tags.csv'.

