# Purpose
Step 3: Filter the data down to the relevant subset (reports involving pancreatitis).

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
# Directory layout. ROOT_DIR is the parent of the current working directory.
ROOT_DIR = Path.cwd().parent

# Top-level data folder and its sub-folders:
#   raw      - parent of the dated text-file folder below
#   to_csv   - CSV tables this notebook reads from
#   filtered - CSV tables this notebook writes to
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CSV_DATA_DIR = DATA_DIR.joinpath("to_csv")
FILTERED_DATA_DIR = DATA_DIR.joinpath("filtered")

# Text files live under a folder named "20210901" (presumably the export date).
TXT_DATA_DIR = RAW_DATA_DIR.joinpath("20210901")
MAIN_TXT_DATA_DIR = TXT_DATA_DIR.joinpath("main")

# Look at particular MedDRA IDs related to pancreatitis
I originally planned to search for pancreatitis terms in ...
Never mind: the list of MedDRA IDs below was sent to me.

In [None]:
# MedDRA IDs of the pancreatitis-related terms to keep (the list was supplied
# to the author). They are written as strings here and converted to ints in a
# later cell.
meddra_id_pancreatitis = [
  "10076058",
  "10076059",
  "10033588",
  "10033626",
  "10033625",
  "10071853",
  "10055312",
  "10058096",
  "10055858",
  "10033672",
  "10019607",
  "10073794",
  "10033650",
  "10033651",
  "10056219",
  "10028891",
  "10033654",
  "10033655",
  "10054706",
  "10052400",
  "10033635",
  "10059155",
  "10076039",
  "10033645",
  "10048365",
  "10033656",
  "10033658",
  "110067190",  # NOTE(review): 9 digits while every other ID here has 8 - possible typo; confirm against the original list
  "10000971",
  "10033647",
  "10033648",
  "10066715",
  "10033657",
]
# Display how many IDs are in the list.
len(meddra_id_pancreatitis)

In [None]:
# import yaml

# Convert the IDs to ints, since that is the type inferred for them when the ADR table is read in

In [None]:
# Turn every ID string into an int so it can be compared with the values in
# the ADR table's "MedDRA_Id" column.
meddra_id_pancreatitis = list(map(int, meddra_id_pancreatitis))

In [None]:
# Load the full ADR table. The first CSV column is used as the index and "-"
# cells are read as missing values.
table_name = "ADR"
adr_csv_path = CSV_DATA_DIR / f"{table_name}.csv"
adr = pd.read_csv(adr_csv_path, na_values="-", index_col=0)

# Row count before any filtering, followed by a short preview.
print(adr.shape[0])
adr.head(3)

In [None]:
# Show the dtype pandas inferred for each ADR column.
adr.dtypes

# Filter ADR by MedDRA IDs we're interested in

In [None]:
# Keep only the ADR rows whose MedDRA ID is in the pancreatitis list.
adr = adr[adr["MedDRA_Id"].isin(meddra_id_pancreatitis)]
print(len(adr))

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
FILTERED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# NOTE: the per-table loop further down writes ADR.csv again (filtered by
# UMCReportId instead of MedDRA_Id), which overwrites this file.
adr.to_csv(FILTERED_DATA_DIR / "ADR.csv")

# The preview goes last: a notebook only renders the final expression of a
# cell, so a head() call in the middle of the cell is silently discarded.
adr.head(3)

# Find unique patients with some form of pancreatitis

In [None]:
# Number of entries in the UMCReportId column of the filtered ADR table
# (one per row, repeated IDs included).
len(adr["UMCReportId"])

In [None]:
# Number of distinct UMCReportId values in the filtered ADR table.
len(pd.unique(adr["UMCReportId"]))

In [None]:
# Distinct report IDs that have at least one pancreatitis ADR row; the array
# is displayed as the cell output.
umc_report_id_pancreatitis = pd.unique(adr["UMCReportId"])
umc_report_id_pancreatitis

# Also look at patients who followed up or whose UMC Report ID was changed/updated

In [None]:
# Load the FOLLOWUP table and keep only the rows whose UMCReportId belongs to
# a pancreatitis report, printing the row count before and after.
table_name = "FOLLOWUP"
followup = pd.read_csv(CSV_DATA_DIR / f"{table_name}.csv", na_values="-", index_col=0)
rows_before = len(followup)
print(table_name, "counts")
print("\tbefore:", f"{rows_before:,}")

in_pancreatitis_reports = followup["UMCReportId"].isin(umc_report_id_pancreatitis)
followup = followup[in_pancreatitis_reports]
rows_after = len(followup)
print("\tafter:", f"{rows_after:,}")

In [None]:
# Distinct values of the ReplacedUMCReportId column for the pancreatitis
# follow-up rows. "-" cells were read as NaN when the table was loaded, so
# missing values are dropped to keep NaN out of the ID list.
new_umc_report_id_pancreatitis = (
    followup["ReplacedUMCReportId"].dropna().unique().tolist()
)

# Display the filtered table. This has to be the last expression in the cell
# for the notebook to render it.
followup

In [None]:
# Extend the pancreatitis report IDs with the ReplacedUMCReportId values found
# in the filtered FOLLOWUP table.
#
# - Missing values are dropped: a NaN in the ID list would make `isin` match
#   rows whose UMCReportId is missing.
# - np.asarray(...).tolist() accepts both the ndarray produced by .unique() and
#   the list this cell leaves behind, so re-running the cell no longer fails
#   with "'list' object has no attribute 'tolist'".
# - dict.fromkeys removes duplicates while keeping first-seen order, so a
#   re-run does not keep growing the list. The length can therefore be smaller
#   than a plain concatenation.
replaced_ids = followup["ReplacedUMCReportId"].dropna().unique().tolist()
existing_ids = np.asarray(umc_report_id_pancreatitis).tolist()
umc_report_id_pancreatitis = list(dict.fromkeys(existing_ids + replaced_ids))

# Filter tables to only have these pancreatitis patients

In [None]:
# Tables that are filtered by UMCReportId. "IND" and "LINK" are left out on
# purpose: a later cell filters them by Drug_Id instead.
table_names = ["ADR", "DEMO", "DRUG", "FOLLOWUP", "OUT", "SRCE"]

In [None]:
# For every table keyed by UMCReportId: load the full CSV, keep the rows that
# belong to a pancreatitis report, report the row counts, and write the result
# to the filtered-data folder under the same file name.
for table in table_names:
    source_csv = CSV_DATA_DIR / f"{table}.csv"
    df = pd.read_csv(source_csv, na_values="-", index_col=0)
    print(table, "counts")

    rows_before = len(df)
    print("\tbefore:", f"{rows_before:,}")

    is_pancreatitis_report = df["UMCReportId"].isin(umc_report_id_pancreatitis)
    df = df[is_pancreatitis_report]
    rows_after = len(df)
    print("\tafter:", f"{rows_after:,}")

    df.to_csv(FILTERED_DATA_DIR / f"{table}.csv")

In [None]:
# Number of report IDs currently used for filtering.
len(umc_report_id_pancreatitis)

In [None]:
# NOTE(review): this cell performs the same read / filter / write pass as the
# earlier `for table in table_names` cell; it looks like a leftover re-run and
# can probably be deleted.
for table in table_names:
    df = pd.read_csv(
        CSV_DATA_DIR / f"{table}.csv",
        index_col=0,
        na_values="-",
    )
    print(table, "counts")
    print("\tbefore:", f"{df.shape[0]:,}")

    # Keep only rows whose report ID is in the pancreatitis ID list.
    df = df.loc[df["UMCReportId"].isin(umc_report_id_pancreatitis)]
    print("\tafter:", f"{df.shape[0]:,}")

    filtered_csv = FILTERED_DATA_DIR / f"{table}.csv"
    df.to_csv(filtered_csv)

# Filter tables where the identifier is the drug ID, using only drugs used for the pancreatitis patients

In [None]:
# Read back the DRUG table from the filtered-data folder and display it.
table_name = "DRUG"
filtered_drug_csv = FILTERED_DATA_DIR / f"{table_name}.csv"
drug = pd.read_csv(filtered_drug_csv, na_values="-", index_col=0)
drug

In [None]:
# Number of entries in the Drug_Id column of the filtered DRUG table
# (one per row, repeated IDs included).
len(drug["Drug_Id"])

In [None]:
# Distinct drug IDs that appear in the filtered DRUG table.
drug_id_pancreatitis = pd.unique(drug["Drug_Id"])

In [None]:
# IND and LINK are filtered by Drug_Id rather than UMCReportId: keep only the
# rows whose drug appears in the filtered DRUG table.
# Note that this rebinds `table_names`.
table_names = ["IND", "LINK"]
for table in table_names:
    source_csv = CSV_DATA_DIR / f"{table}.csv"
    df = pd.read_csv(source_csv, na_values="-", index_col=0)
    print(table, "counts")

    rows_before = len(df)
    print("\tbefore:", f"{rows_before:,}")

    uses_pancreatitis_drug = df["Drug_Id"].isin(drug_id_pancreatitis)
    df = df[uses_pancreatitis_drug]
    rows_after = len(df)
    print("\tafter:", f"{rows_after:,}")

    df.to_csv(FILTERED_DATA_DIR / f"{table}.csv")

In [None]:
# sns.barplot(data=adr, y="Outcome")

In [None]:
# Bar chart of how often each Outcome value occurs in the filtered ADR table.
#
# The counts are computed in this cell so it runs top to bottom; previously
# `res` was only defined in a later cell, which raised NameError on a fresh run.
# reset_index(name="counts") names the count column explicitly, so the plot
# does not depend on the pandas-version-specific default column name.
res = adr[["Outcome"]].value_counts().reset_index(name="counts")
sns.barplot(data=res, x="Outcome", y="counts")

In [None]:
# Count how often each Outcome value occurs in the filtered ADR table, as a
# two-column frame: "Outcome" and "counts".
#
# The count column is named directly through reset_index(name=...). The earlier
# rename({0: "counts"}) only worked while pandas called that column 0; on
# pandas >= 2.0 the column is called "count", so the rename was a silent no-op
# and no "counts" column existed.
res = adr[["Outcome"]].value_counts().reset_index(name="counts")

In [None]:
# Show the column dtypes of the Outcome counts frame.
res.dtypes

In [None]:
# Leftover inspection: shows the current value of the `table` loop variable,
# i.e. the last table name processed by the most recently run loop.
table

In [None]:
# Frequency of each distinct row (combination of all column values) in the
# filtered ADR table.
adr.value_counts()