In [3]:
%load_ext autoreload
%autoreload 2

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
from tqdm import tqdm
tqdm.pandas()
from pprint import pprint
import os
import mdcalc
import openai
import re
# Read the OpenAI API key from a local file. The context manager closes the
# file handle as soon as the key is read instead of leaving it open until
# garbage collection.
# NOTE(review): this is an absolute, user-specific path; consider sourcing the
# key from an environment variable so the notebook runs on other machines.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()
# Reset matplotlib to its default style, then apply dvu's styling on top.
plt.style.use('default')
dvu.set_style()
import paper_parsing, mdcalc


# Load the three input tables used by the export cell below.
# NOTE(review): read_pickle executes arbitrary code on load; only use it on
# trusted, locally produced files.

# Schema-bearing calculator table: add feature-name columns, index by 'id',
# discard the leftover 'index' column, then run category processing.
df_orig = (
    mdcalc.add_feature_names(pd.read_pickle('../data/cdis_with_schemas_cleaned.pkl'))
    .set_index('id')
    .drop(columns=['index'])
    .pipe(mdcalc.process_categories)
)

# Manually curated table, also indexed by 'id'.
df_manual = pd.read_csv('../data/main.csv').set_index('id')

# Author-affiliation table (left with whatever index the pickle carries).
df_affil = pd.read_pickle('../data/cdis_with_author_affil.pkl')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Left-join the manual table onto the scraped table by "id". Where both tables
# have a column with the same name, the manual one keeps its name (empty
# suffix) and the scraped one is tagged "_2"; the "_2" columns are then thrown
# away so the manual values win. Finally attach the author-affiliation table.
df = (
    df_manual.merge(df_orig, on="id", how="left", suffixes=("", "_2"))
    .pipe(
        lambda frame: frame[[col for col in frame.columns if not col.endswith("_2")]]
    )
    .merge(df_affil, on="id", how="left")
)
# Source column name -> export column name. Export names use "___" as a
# separator between a group prefix (categorization, participants, info, paper)
# and the field name.
column_renames = {
    "full_title_en": "title",
    "short_title_en": "title_short",
    "medium_description_en": "description",
    "short_description_en": "description_short",
    "disease_en": "categorization___disease",
    "specialty_en": "categorization___specialty",
    "system_en": "categorization___system",
    "purpose_en": "categorization___purpose",
    "calc_type": "categorization___type",
    "chief_complaint_en": "categorization___chief_complaint",
    "num_total_corrected": "participants___total",
    "num_male_corrected": "participants___male",
    "num_female_corrected": "participants___female",
    "num_white_corrected": "participants___white",
    "num_black_corrected": "participants___black",
    "num_asian_corrected": "participants___asian",
    "num_latino_corrected": "participants___latino",
    # info
    "instructions_en": "info___instructions",
    "search_abbreviation_en": "info___keywords",
    "before_use": "info___before_use",
    "url_full": "info___mdcalc_url",
    # paper
    "ref_href": "paper___url",
    "ref_text": "paper___citation",
    "ref_year": "paper___year",
    # paper authors
    "ref_authors_affils": "paper___author_affiliations",
    "ref_authors_countries": "paper___author_countries",
    "ref_authors_full": "paper___author_names",
    "ref_authors_genders": "paper___author_genders",
    # feature names
    "feature_names_unique": "features_names",
    "feature_names_unique_uncleaned": "features_names_raw",
}

# Columns that are not carried into the export. Every name listed here must be
# present after the rename, because drop() raises on a missing label.
columns_to_discard = [
    "found_paper (0=no, 1=yes)",
    "isVisibleInListView",
    "disabled_reason",
    "disabled",
    "logic_language",
    "dosing",
    "created_at",
    "updated_at",
    "notes",
    "md5",
    "favorite_id",
    "replacement_calc_id",
    "versions",
    "versionNumber",
    "vuid",
    "input_schema",
    "publishedAt",
    "paper_contains_race_keywords",
    "tags",
    "type",
    "ref_href_corrected",
    "slug",
    "feature_names",
    "seo",
    "feature_score_tuples_list",
    "ref_original",
]

# Rename first, then drop: the drop list refers to post-rename column names.
df = df.rename(columns=column_renames)
df = df.drop(columns=columns_to_discard)
# Discard every remaining column whose name begins with "num_", "cme" or
# "evidence_". Columns already renamed away from a "num_" name (e.g. to
# "participants___*") no longer match and are kept.
df = df.drop(
    columns=[
        name for name in df.columns if name.startswith(("num_", "cme", "evidence_"))
    ]
)

# get stuff out of related_calcs, refs, and content
def _related_calc_ids(related):
    """Return the "calcId" value of every entry in one row's related_calcs."""
    return [entry["calcId"] for entry in related]


df["info___related_calc_ids"] = df["related_calcs"].apply(_related_calc_ids)


def remove_primary_ref(d):
    """Return the references mapping without its primary-reference entry.

    Parameters
    ----------
    d : dict
        Mapping of reference-section name to its contents.

    Returns
    -------
    dict
        A new (shallow-copied) dict holding every entry of ``d`` except the
        one keyed "Original/Primary Reference". ``d`` itself is left
        untouched, so the caller's source data is not altered and re-running
        the cell sees the same input.
    """
    return {
        section: refs
        for section, refs in d.items()
        if section != "Original/Primary Reference"
    }


# Per calculator: the references with the primary reference removed.
# The result of apply() is assigned as a whole Series so each row keeps its
# own references; indexing it with .iloc[0] would instead take only the first
# row's dict and use that single value for the entire column.
df["info___additional_references"] = df["refs"].apply(remove_primary_ref)


def remove_p_tags(s):
    """Strip one leading "<p>" and one trailing "</p>" from the string ``s``.

    Only a tag at the very start and a tag at the very end are removed; any
    tags in the interior of the string are left in place.
    """
    for edge_tag in (r"^<p>", r"</p>$"):
        s = re.sub(edge_tag, "", s)
    return s


def remove_placeholders(s):
    """Blank out placeholder text, returning any other string unchanged.

    A string counts as a placeholder when it contains both "Do you use" and
    "want to contribute your expertise?"; in that case "" is returned.
    """
    is_placeholder = "Do you use" in s and "want to contribute your expertise?" in s
    return "" if is_placeholder else s


# content stuff
def _cleaned_content_field(section, field):
    """Build a function that reads content[section][field] and cleans it.

    Cleaning strips the outer <p> tags and then blanks placeholder text.
    """
    return lambda content: remove_placeholders(
        remove_p_tags(content[section][field])
    )


# (export column, section of the content dict, key within that section),
# listed in the order the columns are added to the frame.
_content_columns = [
    ("info___usage___use_case", "how_to_use", "use_case_en"),
    ("info___usage___why_use", "how_to_use", "why_use_en"),
    ("info___usage___notes", "how_to_use", "pearls_pitfalls_en"),
    ("info___next_steps___advice", "next_steps", "advice_en"),
    ("info___next_steps___management", "next_steps", "management_en"),
    ("info___next_steps___critical_actions", "next_steps", "critical_actions_en"),
    ("info___details___formula", "about", "formula_en"),
    ("info___details___more_info", "about", "more_info_en"),
    ("info___details___evidence_overview", "about", "evidence_based_medicine_en"),
]
for column, section, field in _content_columns:
    df[column] = df["content"].apply(_cleaned_content_field(section, field))
# Look up the paper text for each calculator id. The lookup is wrapped by
# mdcalc.try_or_none (presumably so a failed lookup yields None rather than
# raising; confirm against mdcalc).
fetch_paper_text = mdcalc.try_or_none(paper_parsing.get_paper_text)
df["paper___raw_text"] = df["id"].apply(fetch_paper_text)
# The nested source columns have been unpacked into flat columns above, so
# remove them before writing the export.
unpacked_source_columns = ["related_calcs", "refs", "content"]
df = df.drop(unpacked_source_columns, axis="columns")

df.to_pickle('../data/export.pkl')