In [None]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pandas as pd

# Project layout. Everything hangs off the parent of the current working
# directory, so the notebook must be started from a direct sub-folder of the
# project root.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")

# Manifest CSVs produced by this notebook are written here.
MANIFEST_DATA_DIR = DATA_DIR.joinpath("manifest")

# Remaining data folders derived from DATA_DIR.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CSV_DATA_DIR = DATA_DIR.joinpath("to_csv")
TXT_DATA_DIR = RAW_DATA_DIR.joinpath("20210901")
MAIN_TXT_DATA_DIR = TXT_DATA_DIR.joinpath("main")

# Purpose
(Step 0.) Transcribe the file description (see `/database/doc/VigiBase Extract Case Level - file description.pdf`) into manifest tables that specify how to read the text files line by line.

In [None]:
# Write the DEMO manifest into the manifest directory as DEMO.csv.
# NOTE(review): `manifest_demo` is not defined in any cell visible before this
# one — confirm the cell that builds it exists; otherwise this raises
# NameError on a fresh kernel (Restart & Run All).
manifest_demo.to_csv(MANIFEST_DATA_DIR / "DEMO.csv")

In [None]:
# Character layout of the DRUG table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_drug = pd.DataFrame(
    columns=[
        "column_name",
        "dtype",
        "number_of_chars",
        "char_position_start",
        "char_position_end",
        "notes",
    ],
    data=[
        ("UMCReportId", int, 11, 0, 11, "Unique number linking DRUG to DEMO"),
        ("Drug_Id", int, 11, 11, 22, "Unique number identifying each row in DRUG"),
        ("MedicinalProd_Id", int, 11, 22, 33, "A sequential number generated for each Medicinal product"),
        ("DrecNo", str, 6, 33, 39, "Drug record number"),
        ("Seq1", str, 2, 39, 41, "Sequence number 1"),
        ("Seq2", str, 3, 41, 44, "Sequence number 2"),
        ("Route", str, 2, 44, 46, "RouteOfAdm_Lx.Code. Route of administration of drug."),
        ("Basis", str, 1, 46, 47, "RepBasis_Lx.Code. Characterization of drug role."),
        ("Amount", str, 5, 47, 52, "Dosage regimen; Amount"),
        ("AmountU", str, 2, 52, 54, "SizeUnit_Lx.Code. Amount unit."),
        ("Frequency", str, 2, 54, 56, "Number of units in the interval"),
        ("FrequencyU", str, 3, 56, 59, "Frequency_Lx.Code. Definition of the interval."),
    ],
)
manifest_drug

In [None]:
# Save the DRUG manifest as DRUG.csv in the manifest directory. Default
# to_csv settings are used, so the row index is written as the first column.
manifest_drug.to_csv(path_or_buf=MANIFEST_DATA_DIR / "DRUG.csv")

In [None]:
# Character layout of the FOLLOWUP table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_followup = pd.DataFrame(
    columns=[
        "column_name",
        "dtype",
        "number_of_chars",
        "char_position_start",
        "char_position_end",
        "notes",
    ],
    data=[
        ("UMCReportId", int, 11, 0, 11, "Unique number linking FOLLOWUP to DEMO. This number represent the current active report and is used to link with previous versions of the same case."),
        ("ReplacedUMCReportId", int, 11, 11, 22, "Previous versions of the case, no longer available in DEMO"),
    ],
)
manifest_followup

In [None]:
# Save the FOLLOWUP manifest as FOLLOWUP.csv in the manifest directory. Default
# to_csv settings are used, so the row index is written as the first column.
manifest_followup.to_csv(path_or_buf=MANIFEST_DATA_DIR / "FOLLOWUP.csv")

In [None]:
# Character layout of the IND table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_ind = pd.DataFrame(
    columns=[
        "column_name",
        "dtype",
        "number_of_chars",
        "char_position_start",
        "char_position_end",
        "notes",
    ],
    data=[
        ("Drug_Id", int, 11, 0, 11, "Unique number linking IND to DRUG"),
        ("Indication", str, 255, 11, 266, "Reason for drug use. Indication can be decoded from MedDRA, ICD-8, ICD-9 or ICD-10."),
    ],
)
manifest_ind

In [None]:
# Save the IND manifest as IND.csv in the manifest directory. Default to_csv
# settings are used, so the row index is written as the first column.
manifest_ind.to_csv(path_or_buf=MANIFEST_DATA_DIR / "IND.csv")

In [None]:
# Character layout of the LINK table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_link = pd.DataFrame(
    [
        # Drug_Id is 11 characters wide and ends at position 11, so it must
        # start at 0 (a start of 1 would drop the first character of every id).
        ("Drug_Id", int, 11, 0, 11, "Unique number linking LINK to DRUG"),
        ("Adr_Id", int, 11, 11, 22, "Unique number linking LINK to ADR"),
        ("Dechallenge1", str, 1, 22, 23, "Dechallenge_Lx.Code. Dechallenge action."),
        ("Dechallenge2", str, 1, 23, 24, "Dechallenge2_Lx.Code. Dechallenge outcome."),
        ("Rechallenge1", str, 1, 24, 25, "Rechallenge_Lx.Code. Rechallenge action."),
        ("Rechallenge2", str, 1, 25, 26, "Rechallenge2_Lx.Code. Rechallenge outcome."),
        ("TimeToOnsetMin", float, 11, 26, 37, "Always in the base unit days (see Appendix 1)"),
        ("TimeToOnsetMax", float, 11, 37, 48, "Always in the base unit days (see Appendix 1)"),
    ],
    columns=["column_name", "dtype", "number_of_chars", "char_position_start", "char_position_end", "notes"],
)
manifest_link

In [None]:
# Save the LINK manifest as LINK.csv in the manifest directory. Default to_csv
# settings are used, so the row index is written as the first column.
manifest_link.to_csv(path_or_buf=MANIFEST_DATA_DIR / "LINK.csv")

In [None]:
# Character layout of the OUT table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_out = pd.DataFrame(
    columns=[
        "column_name",
        "dtype",
        "number_of_chars",
        "char_position_start",
        "char_position_end",
        "notes",
    ],
    data=[
        ("UMCReportId", int, 11, 0, 11, "Unique number linking OUT to DEMO"),
        ("Seriousness", str, 2, 11, 13, "Seriousness_Lx.Code. Seriousness criteria of case."),
        ("Serious", str, 1, 13, 14, "The value refers to if the case was serious or not. Could be either ’Y’ = yes ’N’ = no"),
    ],
)
manifest_out

In [None]:
# Save the OUT manifest as OUT.csv in the manifest directory. Default to_csv
# settings are used, so the row index is written as the first column.
manifest_out.to_csv(path_or_buf=MANIFEST_DATA_DIR / "OUT.csv")

In [None]:
# Character layout of the SRCE table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_srce = pd.DataFrame(
    [
        # The note names SRCE (not OUT) as the table being linked to DEMO,
        # matching the wording used for the other tables' UMCReportId field.
        ("UMCReportId", int, 11, 0, 11, "Unique number linking SRCE to DEMO"),
        ("Type", str, 2, 11, 13, "Notifier_Lx.Code. Notifier type."),
    ],
    columns=["column_name", "dtype", "number_of_chars", "char_position_start", "char_position_end", "notes"],
)
manifest_srce

In [None]:
# Save the SRCE manifest as SRCE.csv in the manifest directory. Default to_csv
# settings are used, so the row index is written as the first column.
manifest_srce.to_csv(path_or_buf=MANIFEST_DATA_DIR / "SRCE.csv")

In [None]:
# Character layout of the ADR table: one tuple per field giving its name,
# Python dtype, width in characters, start/end character positions
# (end - start == width) and a free-text description.
manifest_adr = pd.DataFrame(
    [
        # The note names ADR (not OUT) as the table being linked to DEMO,
        # matching the wording used for the other tables' UMCReportId field.
        ("UMCReportId", int, 11, 0, 11, "Unique number linking ADR to DEMO"),
        ("Adr_Id", int, 11, 11, 22, "Unique number identifying each row in ADR"),
        ("MedDRA_Id", int, 8, 22, 30, "MedDRA term code"),
        ("Outcome", str, 1, 30, 31, "Outcome_Lx.Code. Outcome of reaction."),
    ],
    columns=["column_name", "dtype", "number_of_chars", "char_position_start", "char_position_end", "notes"],
)
manifest_adr

In [None]:
# Show the first ADR field as a plain dict (one dict per manifest row).
# The orient must be spelled "records": the abbreviated "record" was
# deprecated in pandas 1.x and is rejected with a ValueError by pandas >= 2.0.
manifest_adr.to_dict(orient="records")[0]

In [None]:
# Save the ADR manifest as ADR.csv in the manifest directory. Default to_csv
# settings are used, so the row index is written as the first column.
manifest_adr.to_csv(path_or_buf=MANIFEST_DATA_DIR / "ADR.csv")

In [None]:
# Number of rows per table.
# NOTE(review): two DRUG counts were recorded in this cell — 71,006,811
# (marked OLD, the active value below) and 78,840,125 (marked NEW, not
# active). Confirm which one applies to the extract being processed.
count = pd.DataFrame(
    data=[
        ("ADR", 66802344),
        ("DEMO", 27213386),
        ("DRUG", 71006811),
        ("FOLLOWUP", 7759897),
        ("IND", 35474400),
        ("LINK", 99273742),
        ("OUT", 22677257),
        ("SRCE", 21105031),
    ],
    columns=["Table", "No of rows"],
).astype({"No of rows": float})

# Render floats with thousands separators and no decimals. This is a global
# pandas display option, so it applies to every float shown after this cell;
# the counts are stored as float so that the option takes effect on them.
pd.set_option("display.float_format", "{:,.0f}".format)

display(count)

In [None]:
# Inspect the float formatter currently registered with pandas' display options.
pd.get_option("display.float_format")

In [None]:
# Save the per-table row counts as count.csv in the manifest directory
# (written once; repeating the identical write adds nothing).
# NOTE(review): "No of rows" is cast to float before this point, so the CSV
# stores the counts as floats — confirm downstream readers expect that.
count.to_csv(MANIFEST_DATA_DIR / "count.csv")

In [None]:
# Row count for ADR as a one-element NumPy array (single .loc lookup).
count.loc[count["Table"].eq("ADR"), "No of rows"].to_numpy()


In [None]:
# Row count for ADR as a scalar: squeeze() collapses the one-element Series.
count.loc[count["Table"].eq("ADR"), "No of rows"].squeeze()