# Purpose
(Step 1.)
Parse the original text files and convert them into CSVs for pandas manipulation later

In [15]:
import os
import pydoc
import re
import time
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import pandera as pa
from tqdm.notebook import tqdm

In [16]:
# PARQUET_PARTITIONED_DATA_DIR.mkdir(exist_ok=True, parents=True)
# PARQUET_DATA_DIR.mkdir()
# PARQUET_DATA_DIR.exists()
# "hi"

# df
# fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
# manifest_drug = pd.read_csv(fpath, index_col=0)
# manifest_drug.to_records
# manifest_drug
# %pip install --force-reinstall -v "ipywidgets == 7.7.2"
# %pip install --force-reinstall -v "jupyterlab_widgets == 1.1.1"

In [17]:
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data" 
RAW_DATA_DIR = DATA_DIR / "raw"
MANIFEST_DATA_DIR = DATA_DIR / "manifest"

# batch = "20210901"  # OLD
batch = "20220601" # NEW
TXT_DATA_DIR = RAW_DATA_DIR / batch
MAIN_TXT_DATA_DIR = TXT_DATA_DIR / "main"

PARQUET_DATA_DIR = DATA_DIR / "to_parquet" / batch
PARQUET_PARTITIONED_DATA_DIR = PARQUET_DATA_DIR / "partitioned"


def time_it(func):
    start_time = time.time()
    def wrapped(*vargs, **kwargs):
        return func(*vargs, **kwargs)
    end_time = time.time()

    fin_time = end_time - start_time
    fin_time_m = fin_time / 60
    fin_time_h = fin_time_m / 60

    print(f"{fin_time:.3f} seconds")
    print(f"{fin_time_m:.3f} minutes")
    print(f"{fin_time_h:.3f} hours")
    
    return wrapped

def get_record_count(table_name: str) -> int:
    fpath = MANIFEST_DATA_DIR / "count.csv"
    count = pd.read_csv(fpath, index_col=0)
    count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)
    n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()

    return n_rows


def get_manifest(table_name: str) -> pd.DataFrame:
    fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
    manifest = pd.read_csv(fpath, index_col=0,)

    return manifest


def save_as_parquet_chunks(table_name: str, chunk_size: int = 1_000_000) -> None:
    manifest = get_manifest(table_name)

    widths = manifest["number_of_chars"]
    column_names = manifest["column_name"]
    
    # `dtype` column is inferred as string; need to convert back to `type`
    dtypes = [
    pydoc.locate(
        re.findall(r"'(.*)'", val).pop()
    ) for val in manifest["dtype"]
    ]
    
    fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
    n_rows = get_record_count(table_name)
    n_chunks = int(np.ceil(n_rows / chunk_size))
    for i_chunk, df in tqdm(
        enumerate(
        pd.read_fwf(
        fpath, 
        widths=widths,
        chunksize=chunk_size,
        header=None,
        names=column_names,
        converters={k: v for k, v in zip(column_names, dtypes)},
        encoding="ISO-8859-1",
        # na_values={"TimeToOnsetMin": "-", "TimeToOnsetMax": "-"},
        na_values="-",
        ), 
    ),
        total=n_chunks,
    ):
        fname = f"{fpath.stem}_{i_chunk + 1}_of_{n_chunks}.parquet"
        fpath_out = PARQUET_PARTITIONED_DATA_DIR / fname
        if fpath_out.exists():
          continue
        
        df.to_parquet(fpath_out)

In [19]:
# table_name = "DRUG"
# chunk_size = 1_000_000
table_names = [
    "IND",
    "DRUG",
    "ADR",
    "DEMO",
    "OUT",
    "SRCE",
    "FOLLOWUP",
    "LINK",
]
for table_name in tqdm(table_names):
    save_as_parquet_chunks(table_name=table_name)

  0%|          | 0/8 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/36 [00:00<?, ?it/s]

  return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/72 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/67 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/28 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/23 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/22 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/8 [00:00<?, ?it/s]

  count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)


  0%|          | 0/100 [00:00<?, ?it/s]

In [101]:
get_manifest("LINK")

Unnamed: 0,column_name,dtype,number_of_chars,char_position_start,char_position_end,notes
0,Drug_Id,<class 'int'>,11,1,11,Unique number linking LINK to DRUG
1,Adr_Id,<class 'int'>,11,11,22,Unique number linking LINK to ADR
2,Dechallenge1,<class 'str'>,1,22,23,Dechallenge_Lx.Code. Dechallenge action.
3,Dechallenge2,<class 'str'>,1,23,24,Dechallenge2_Lx.Code. Dechallenge outcome.
4,Rechallenge1,<class 'str'>,1,24,25,Rechallenge_Lx.Code. Rechallenge action.
5,Rechallenge2,<class 'str'>,1,25,26,Rechallenge2_Lx.Code. Rechallenge outcome.
6,TimeToOnsetMin,<class 'float'>,11,26,37,Always in the base unit days (see Appendix 1)
7,TimeToOnsetMax,<class 'float'>,11,37,48,Always in the base unit days (see Appendix 1)


In [None]:
results = [None for _ in range(n_rows)]
result = [None for _ in range(len(manifest_adr))]


# drugs = Counter()
# for i_chunk in range(n_chunks):
#   row_start = i_chunk * chunk_size
#   row_end = row_start + chunk_size
#   with open(fpath) as file:
#     for idx, line in tqdm(enumerate(file), total=n_rows):
#       if idx < row_start:
#         continue
#       if idx >= row_end:
#         break
#       for icol, col in enumerate(manifest_adr):
#         char_start = col["char_position_start"]
#         char_end = col["char_position_end"]
#         result[icol] =  line[char_start: char_end].rstrip()
#       results[idx % chunk_size] = tuple(result)
#     df = pd.DataFrame(results, columns=columns_adr)
#     drugs.update(df["DrecNo"])
# # df.head()
# drugs


In [62]:
drug_record_ids = [
  "15", 
  "911",
  "14456",
  "16129",
  "56986",
  "062611",
  "062574",
  "062576",
  "079186",
  "090204",
]

In [63]:
# new_data_drugs = drugs

In [64]:
for drug in drug_record_ids:
  print(f"Drug: {drug} \n\t{drugs[drug.rjust(6, '0')]:,}")

Drug: 15 
	82,643
Drug: 911 
	21,270
Drug: 14456 
	270,188
Drug: 16129 
	777,112
Drug: 56986 
	630
Drug: 062611 
	51,679
Drug: 062574 
	33,149
Drug: 062576 
	71,732
Drug: 079186 
	122,799
Drug: 090204 
	916


In [65]:
for drug in drug_record_ids:
  print(f"Drug: {drug} \n\t{new_data_drugs[drug.rjust(6, '0')]:,}")

Drug: 15 
	85,371
Drug: 911 
	22,367
Drug: 14456 
	276,929
Drug: 16129 
	817,358
Drug: 56986 
	731
Drug: 062611 
	54,252
Drug: 062574 
	37,952
Drug: 062576 
	80,521
Drug: 079186 
	137,081
Drug: 090204 
	2,983


In [61]:
"15".rjust(6, "0") in drugs

True

In [44]:
# drugs

In [22]:
drugs

set()

In [None]:
df

In [10]:
manifest_adr

[{'column_name': 'UMCReportId',
  'dtype': "<class 'int'>",
  'number_of_chars': 11,
  'char_position_start': 0,
  'char_position_end': 11,
  'notes': 'Unique number linking DRUG to DEMO'},
 {'column_name': 'Drug_Id',
  'dtype': "<class 'int'>",
  'number_of_chars': 11,
  'char_position_start': 11,
  'char_position_end': 22,
  'notes': 'Unique number identifying each row in DRUG'},
 {'column_name': 'MedicinalProd_Id',
  'dtype': "<class 'int'>",
  'number_of_chars': 11,
  'char_position_start': 22,
  'char_position_end': 33,
  'notes': 'A sequential number generated for each Medicinal product'},
 {'column_name': 'DrecNo',
  'dtype': "<class 'str'>",
  'number_of_chars': 6,
  'char_position_start': 33,
  'char_position_end': 39,
  'notes': 'Drug record number'},
 {'column_name': 'Seq1',
  'dtype': "<class 'str'>",
  'number_of_chars': 2,
  'char_position_start': 39,
  'char_position_end': 41,
  'notes': 'Sequence number 1'},
 {'column_name': 'Seq2',
  'dtype': "<class 'str'>",
  'number

NameError: name 'results' is not defined

In [None]:
# df.to_csv(CSV_DATA_DIR / f"{table_name}.csv")
# print(f"Saved {table_name}.csv!")
# del df

In [5]:
# df.to_csv(CSV_DATA_DIR / f"{table_name}.csv")

In [3]:
table_name = "DEMO"

# Get Manifest
fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
manifest_adr = pd.read_csv(fpath, index_col=0)
columns_adr = manifest_adr["column_name"].values
display(manifest_adr)
manifest_adr = manifest_adr.to_dict(orient="records")

# Get Counts
fpath = MANIFEST_DATA_DIR / "count.csv"
count = pd.read_csv(fpath, index_col=0)
n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()
display(count)

# Parse text file
fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
results = [None for _ in range(n_rows)]
result = [None for _ in range(len(manifest_adr))]
with open(fpath) as file:
  for idx, line in tqdm(enumerate(file), total=n_rows):
      for icol, col in enumerate(manifest_adr):
        char_start = col["char_position_start"]
        char_end = col["char_position_end"]
        result[icol] =  line[char_start: char_end].rstrip()
      results[idx] = tuple(result)
df = pd.DataFrame(results, columns=columns_adr)
df.head()

Unnamed: 0,column_name,dtype,number_of_chars,char_position_start,char_position_end,notes
0,UMCReportId,<class 'int'>,11,0,11,Unique number linking OUT to DEMO
1,AgeGroup,<class 'str'>,1,11,12,AgeGroup_Lx.Code. Age of patient at time of on...
2,Gender,<class 'str'>,1,12,13,Gender_Lx.Code
3,DateDatabase,<class 'str'>,8,13,21,Date when report was entered into the database...
4,Type,<class 'str'>,1,21,22,ReportType_Lx.Code. Report type.
5,Region,<class 'str'>,1,22,23,Region_lx.Code.
6,FirstDateDatabase,<class 'str'>,8,23,31,Date when the first version of the report was ...


Unnamed: 0,Table,No of rows
0,ADR,66802344
1,DEMO,27213386
2,DRUG,71006811
3,FOLLOWUP,7759897
4,IND,35474400
5,LINK,99273742
6,OUT,22677257
7,SRCE,21105031


  0%|          | 0/27213386 [00:00<?, ?it/s]

Unnamed: 0,UMCReportId,AgeGroup,Gender,DateDatabase,Type,Region,FirstDateDatabase
0,34777001,5,2,20210205,1,2,20210205
1,34776971,9,-,20210205,1,2,20210205
2,34776965,9,1,20210205,1,2,20210205
3,34776982,9,-,20210205,1,2,20210205
4,34776980,9,-,20210205,1,2,20210205


In [4]:
df.to_csv(CSV_DATA_DIR / f"{table_name}.csv")
print(f"Saved {table_name}.csv!")
del df

Saved DEMO.csv!


In [5]:
table_name = "DRUG"

# Get Manifest
fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
manifest_adr = pd.read_csv(fpath, index_col=0)
columns_adr = manifest_adr["column_name"].values
display(manifest_adr)
manifest_adr = manifest_adr.to_dict(orient="records")

# Get Counts
fpath = MANIFEST_DATA_DIR / "count.csv"
count = pd.read_csv(fpath, index_col=0)
n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()
display(count)

# Parse text file
fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
results = [None for _ in range(n_rows)]
result = [None for _ in range(len(manifest_adr))]
with open(fpath) as file:
  for idx, line in tqdm(enumerate(file), total=n_rows):
      for icol, col in enumerate(manifest_adr):
        char_start = col["char_position_start"]
        char_end = col["char_position_end"]
        result[icol] =  line[char_start: char_end].rstrip()
      results[idx] = tuple(result)
df = pd.DataFrame(results, columns=columns_adr)
df.head()

Unnamed: 0,column_name,dtype,number_of_chars,char_position_start,char_position_end,notes
0,UMCReportId,<class 'int'>,11,0,11,Unique number linking DRUG to DEMO
1,Drug_Id,<class 'int'>,11,11,22,Unique number identifying each row in DRUG
2,MedicinalProd_Id,<class 'int'>,11,22,33,A sequential number generated for each Medicin...
3,DrecNo,<class 'str'>,6,33,39,Drug record number
4,Seq1,<class 'str'>,2,39,41,Sequence number 1
5,Seq2,<class 'str'>,3,41,44,Sequence number 2
6,Route,<class 'str'>,2,44,46,RouteOfAdm_Lx.Code. Route of administration of...
7,Basis,<class 'str'>,1,46,47,RepBasis_Lx.Code. Characterization of drug role.
8,Amount,<class 'str'>,5,47,52,Dosage regimen; Amount
9,AmountU,<class 'str'>,2,52,54,SizeUnit_Lx.Code. Amount unit.


Unnamed: 0,Table,No of rows
0,ADR,66802344
1,DEMO,27213386
2,DRUG,71006811
3,FOLLOWUP,7759897
4,IND,35474400
5,LINK,99273742
6,OUT,22677257
7,SRCE,21105031


  0%|          | 0/71006811 [00:00<?, ?it/s]

Unnamed: 0,UMCReportId,Drug_Id,MedicinalProd_Id,DrecNo,Seq1,Seq2,Route,Basis,Amount,AmountU,Frequency,FrequencyU
0,34777001,10,1240849,3819,1,29,15,1,-,-,-,-
1,34777001,42,1240849,3819,1,29,15,1,-,-,-,-
2,34776971,19,4134085,56993,1,2,65,1,-,-,-,-
3,34776971,27,4134085,56993,1,2,-,1,-,-,-,-
4,34776965,18,4239665,88719,2,2,65,1,-,-,-,-


In [6]:
df.to_csv(CSV_DATA_DIR / f"{table_name}.csv")
print(f"Saved {table_name}.csv!")
del df

Saved DRUG.csv!


In [21]:
table_name = "FOLLOWUP"

# Get Manifest
fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
manifest_adr = pd.read_csv(fpath, index_col=0)
columns_adr = manifest_adr["column_name"].values
display(manifest_adr)
manifest_adr = manifest_adr.to_dict(orient="records")

# Get Counts
fpath = MANIFEST_DATA_DIR / "count.csv"
count = pd.read_csv(fpath, index_col=0)
n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()
display(count)

# Parse text file
fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
results = [None for _ in range(n_rows)]
result = [None for _ in range(len(manifest_adr))]
with open(fpath) as file:
  for idx, line in tqdm(enumerate(file), total=n_rows):
    for icol, col in enumerate(manifest_adr):
      char_start = col["char_position_start"]
      char_end = col["char_position_end"]
      result[icol] =  line[char_start: char_end].rstrip()
    results[idx] = tuple(result)
df = pd.DataFrame(results, columns=columns_adr)
df.head()

Unnamed: 0,column_name,dtype,number_of_chars,char_position_start,char_position_end,notes
0,UMCReportId,<class 'int'>,11,0,11,Unique number linking FOLLOWUP to DEMO. This n...
1,ReplacedUMCReportId,<class 'int'>,11,11,22,"Previous versions of the case, no longer avail..."


Unnamed: 0,Table,No of rows
0,ADR,66802344
1,DEMO,27213386
2,DRUG,71006811
3,FOLLOWUP,7759897
4,IND,35474400
5,LINK,99273742
6,OUT,22677257
7,SRCE,21105031


  0%|          | 0/7759897 [00:00<?, ?it/s]

Unnamed: 0,UMCReportId,ReplacedUMCReportId
0,23711586,22149020
1,23711587,22149082
2,23711585,22149019
3,23711576,22149231
4,23711574,22149228


In [22]:
# results

In [12]:
table_name = "IND"

# Get Manifest
fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
manifest_adr = pd.read_csv(fpath, index_col=0)
columns_adr = manifest_adr["column_name"].values
display(manifest_adr)
manifest_adr = manifest_adr.to_dict(orient="records")

# Get Counts
fpath = MANIFEST_DATA_DIR / "count.csv"
count = pd.read_csv(fpath, index_col=0)
n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()
display(count)

# Parse text file
fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
results = [None for _ in range(n_rows)]
result = [None for _ in range(len(manifest_adr))]
with open(fpath, encoding="latin-1") as file:
# pd.read_csv(fpath, delim_whitespace="", skiprows=lambda x: x != 385_757, encoding='latin-1', nrows=1)
#   file.reead
  for idx, line in tqdm(enumerate(file), total=n_rows):
    # if idx > 385_757:
    # if idx > 385_000:
      # print(line)
    for icol, col in enumerate(manifest_adr):
      char_start = col["char_position_start"]
      char_end = col["char_position_end"]
      result[icol] =  line[char_start: char_end].rstrip()
    results[idx] = tuple(result)
df = pd.DataFrame(results, columns=columns_adr)
df.head()

Unnamed: 0,column_name,dtype,number_of_chars,char_position_start,char_position_end,notes
0,Drug_Id,<class 'int'>,11,0,11,Unique number linking IND to DRUG
1,Indication,<class 'str'>,255,11,266,Reason for drug use. Indication can be decoded...


Unnamed: 0,Table,No of rows
0,ADR,66802344
1,DEMO,27213386
2,DRUG,71006811
3,FOLLOWUP,7759897
4,IND,35474400
5,LINK,99273742
6,OUT,22677257
7,SRCE,21105031


  0%|          | 0/35474400 [00:00<?, ?it/s]

Unnamed: 0,Drug_Id,Indication
0,42,Contraception
1,19,Cold type haemolytic anaemia
2,27,Off label use
3,18,Spinal muscular atrophy
4,6,Hyperchlorhydria


In [13]:
# help(pd.read_csv)

In [14]:
# results

In [15]:
df.to_csv(CSV_DATA_DIR / f"{table_name}.csv")
print(f"Saved {table_name}.csv!")

Saved IND.csv!


In [12]:
# help(tqdm)

In [8]:
print("DONE)

SyntaxError: unterminated string literal (detected at line 1) (2425909208.py, line 1)

In [13]:
# Verify that the last column is 8 + 1 characters (8 for MedDRA_id, 1 for Outcome)
# result["MedDRA_Id___Outcome"].str.len()

In [14]:
# Split entries of the last column into two
# result["MedDRA_Id"] = result["MedDRA_Id___Outcome"].str[:-1]
# result["Outcome"] = result["MedDRA_Id___Outcome"].str.slice(-1)
# result = result.drop(["MedDRA_Id___Outcome"], axis="columns")
result

Unnamed: 0,0
0,34777001 15 100125781
1,34776971 27 100537626
2,34776965 5 100192116
3,34776982 17 100383896
4,34776980 32 100369186
5,34776980 33 100050116
6,34776999 16 100708635
7,34776984 21 100119065
8,34776989 29 100626856
9,34776974 22 100807516


In [58]:
fpath = MAIN_TXT_DATA_DIR / "DEMO.txt"
result = pd.read_csv(fpath, nrows=10, header=None, delim_whitespace=" ")
demo_columns = ["UMCReportId", "AgeGroup", "Gender", "DateDatabase", "Type", "Region", "FirstDateDatabase"]
result

Unnamed: 0,0,1
0,34777001,52202102051220210205
1,34776971,9-202102051220210205
2,34776965,91202102051220210205
3,34776982,9-202102051220210205
4,34776980,9-202102051220210205
5,34776999,72202102051220210205
6,34776984,81202102052220210205
7,34776989,62202102051220210205
8,34776974,92202102052220210205
9,34776991,92202102051220210205


In [59]:
result["UMCReportId"] = result[0]
result["AgeGroup"] = result[1].str[:1]
result["Gender"] = result[1].str[1:2]
result["DateDatabase"] = result[1].str[2:10]
result["Type"] = result[1].str[10:11]
result["Region"] = result[1].str[11:12]
result["FirstDateDatabase"] = result[1].str[12:]
result
result = result.drop([0, 1], axis="columns")
result

Unnamed: 0,UMCReportId,AgeGroup,Gender,DateDatabase,Type,Region,FirstDateDatabase
0,34777001,5,2,20210205,1,2,20210205
1,34776971,9,-,20210205,1,2,20210205
2,34776965,9,1,20210205,1,2,20210205
3,34776982,9,-,20210205,1,2,20210205
4,34776980,9,-,20210205,1,2,20210205
5,34776999,7,2,20210205,1,2,20210205
6,34776984,8,1,20210205,2,2,20210205
7,34776989,6,2,20210205,1,2,20210205
8,34776974,9,2,20210205,2,2,20210205
9,34776991,9,2,20210205,1,2,20210205


In [4]:
# result = pd.read_csv(fpath, nrows=10, header=None, delim_whitespace="")
# result = pd.read_csv(fpath, header=None, delim_whitespace="")
# drug_columns = ["UMCReportId", "Drug_Id", "MedicinalProd_Id", "DrecNo", "Seq1", "Seq2", "Route", "Basis", "Amount", "AmountU", "Frequency", "FrequencyU"]
# result

In [5]:
fpath = MAIN_TXT_DATA_DIR / "DRUG.txt"
result_df = pd.DataFrame()
n_rows = 71_006_811
# n_rows = 300
chunk_size = 5_000_000
results =  [None for _ in range(chunk_size)]
# results =  [None for _ in range(n_rows)]
columns_drug = (
                  "UMCReportId",
                  "Drug_Id",
                  "MedicinalProd_Id",
                  "DrecNo",
                  "Seq1",
                  "Seq2",
                  "Route",
                  "Basis",
                  "Amount",
                  "AmountU",
                  "Frequency",
                  "FrequencyU",
                )
with open(fpath) as file:
  # result = {}
  for idx, line in tqdm(enumerate(file)):
      # print(line.rstrip())
      # print(len(line.rstrip()))
      # print(len(line))
      UMCReportId      = line[0:11]
      Drug_Id          = line[11:22]
      MedicinalProd_Id = line[22:33]
      DrecNo           = line[33:39]
      Seq1             = line[39:41]
      Seq2             = line[41:44]
      Route            = line[44:46]
      Basis            = line[46:47]
      Amount           = line[47:52]
      AmountU          = line[52:54]
      Frequency        = line[54:56]
      FrequencyU       = line[56:59]
      # print(line)
      result = (
        val.rstrip() for val in 
                (
                  UMCReportId,
                  Drug_Id,
                  MedicinalProd_Id ,
                  DrecNo,
                  Seq1,
                  Seq2,
                  Route,
                  Basis,
                  Amount,
                  AmountU,
                  Frequency,
                  FrequencyU,
                )
      )
      # result = {k: v.rstrip() for k,v in result.items()}
      # result_df.iloc[idx] = result
      # results[idx] = result
      results[idx % chunk_size] = result
      # if idx >= n_rows - 1:
      if idx % chunk_size == 0:
        if idx == 0:
          continue
        df = pd.DataFrame(results, columns=columns_drug)
        chunk = int(idx // chunk_size)
        df.to_csv(CSV_DATA_DIR / f"DRUG_{chunk}.csv")
        results =  [None for _ in range(chunk_size)]
        del df
# result_df
# pd.DataFrame(results, columns=columns_drug)

results = [result for result in results if result is not None]
df = pd.DataFrame(results, columns=columns_drug)
df.to_csv(CSV_DATA_DIR / f"DRUG_{chunk + 1}.csv")

0it [00:00, ?it/s]

In [None]:
result_df = pd.DataFrame(results, columns=columns_drug)
result_df

In [3]:
# result_df = pd.DataFrame()
# with open(fpath) as file:
#   result = {}
#   for line in file:
#       # print(line.rstrip())
#       # print(len(line.rstrip()))
#       # print(len(line))
#       # result["Drug_Id"]        = line[:11]
#       # result["Adr_Id"]         = line[11:22]
#       # result["Dechallenge1"]   = line[22]
#       # result["Dechallenge2"]   = line[23]
#       # result["Rechallenge1"]   = line[24]
#       # result["Rechallenge2"]   = line[25]
#       # result["TimeToOnsetMin"] = line[26:37]
#       # result["TimeToOnsetMax"] = line[37:48]
#
#
#       result["UMCReportId"]      = line[0:11]
#       result["Drug_Id"]          = line[11:22]
#       result["MedicinalProd_Id"] = line[22:33]
#       result["DrecNo"]           = line[33:39]
#       result["Seq1"]             = line[39:41]
#       result["Seq2"]             = line[41:44]
#       result["Route"]            = line[44:46]
#       result["Basis"]            = line[46:47]
#       result["Amount"]           = line[47:52]
#       result["AmountU"]          = line[52:54]
#       result["Frequency"]        = line[54:56]
#       result["FrequencyU"]       = line[56:59]
#       # print(line)
#       result = {k: v.rstrip() for k,v in result.items()}
#       result_df = result_df.append(result, ignore_index=True)
# result_df

In [None]:
result["UMCReportId"] = result[0].str[0:11]
result["Drug_Id"] = result[0].str[11:22]
result["MedicinalProd_Id"] = result[0].str[22:33]
result["DrecNo"] = result[0].str[33:39]
result["Seq1"] = result[0].str[39:41]
result["Seq2"] = result[0].str[41:44]
result["Route"] = result[0].str[44:46]
result["Basis"] = result[0].str[46:47]
result["Amount"] = result[0].str[47:52]
result["AmountU"] = result[0].str[52:54]
result["Frequency"] = result[0].str[54:56]
result["FrequencyU"] = result[0].str[56:59]
result = result.drop(0, axis="columns")
result

In [None]:
result.to_csv(CSV_DATA_DIR / "DRUG.csv")

In [126]:
# fpath = MAIN_TXT_DATA_DIR / "LINK.txt"
# # result = pd.read_csv(fpath, nrows=100, header=None, delim_whitespace="")
# result = pd.read_csv(fpath, header=None, delim_whitespace="")
# # result

In [121]:
# result["Drug_Id"] = result[0].str[:11]
# result["Adr_Id"] = result[0].str[11:22]
# result["Dechallenge1"] = result[0].str[22]
# result["Dechallenge2"] = result[0].str[23]
# result["Rechallenge1"] = result[0].str[24]
# result["Rechallenge2"] = result[0].str[25]
# result["TimeToOnsetMin"] = result[0].str[26:37]
# result["TimeToOnsetMax"] = result[0].str[37:48]
# result = result.drop(0, axis="columns")
# result
# result.to_csv(CSV_DATA_DIR / "LINK.csv")

Unnamed: 0,Drug_Id,Adr_Id,Dechallenge1,Dechallenge2,Rechallenge1,Rechallenge2,TimeToOnsetMin,TimeToOnsetMax
0,10,15,6,1,-,-,-,-
1,42,15,6,1,-,-,-,-
2,19,27,5,5,-,-,-,-
3,27,27,5,5,-,-,-,-
4,18,5,5,5,1,3,-,-
...,...,...,...,...,...,...,...,...
99273737,71006805,66802340,5,4,4,3,335.00069,396.99931
99273738,71006811,66802341,5,4,4,3,-27.99931,27.99931
99273739,71006811,66802342,5,4,4,3,-27.99931,27.99931
99273740,71006811,66802343,5,4,4,3,-27.99931,27.99931


In [6]:
# table_names = [
#   "ADR",
#   "DEMO",
#   "DRUG",
#   "FOLLOWUP",
#   "IND",
#   "LINK",
#   "OUT",
#   "SRCE",
# ]

In [11]:
# i_table = 2
# table_name = table_names[i_table]
# print(table_name)
# fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
# result = pd.read_csv(fpath, nrows=100, header=None, delim_whitespace=True)
# # result = pd.read_csv(fpath, header=None, delim_whitespace=" ")
# # result.columns = ["UMCReportId", "Seriousness", "Serious"]
# result
# # result.to_csv(CSV_DATA_DIR / "OUT.csv")

In [110]:
  # fpath = MAIN_TXT_DATA_DIR / "SRCE.txt"
# result = pd.read_csv(fpath, header=None, delim_whitespace=" ")
# result.columns = ["UMCReportId", "Type"]
# result result.to_csv(CSV_DATA_DIR / "SRCE.csv")

In [124]:
del result

In [None]:

# import time


# start_time = time.time()
# # Get Manifest
# fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"
# # pd.read_csv(fpath, index_col=0, nrows=10, delimiter=r"\w")
# pieces = []
# df = pd.read_fwf(
#   fpath, 
#   # index_col=0, 
#   # colspecs=[(start, end) for start, end in zip(manifest_drug["char_position_start"], manifest_drug["char_position_end"])], 
#   widths=manifest_drug["number_of_chars"],
#   # nrows=1_000_000,
#   header=None,
# )
# end_time = time.time()

# fin_time = end_time - start_time
# fin_time_m = fin_time / 60
# fin_time_h = fin_time_m / 60

# print(f"{fin_time}")
# print(f"{fin_time_m}")
# print(f"{fin_time_h}")
# df.columns = manifest_drug["column_name"]
# df
# table_name = "DRUG"
# chunk_size = 1_000_000

# # Get Counts
# fpath = MANIFEST_DATA_DIR / "count.csv"
# count = pd.read_csv(fpath, index_col=0)
# count.loc[:, "No of rows"] = count.loc[:, "No of rows"].astype(int)
# n_rows = count.loc[count["Table"] == table_name, "No of rows"].squeeze()
# print(n_rows)

# # Get Manifest
# fpath = MANIFEST_DATA_DIR / f"{table_name}.csv"
# manifest_adr = pd.read_csv(fpath, index_col=0)
# columns_adr = manifest_adr["column_name"].values
# # display(manifest_adr)
# manifest_adr = manifest_adr.to_dict(orient="records")

# n_chunks = int(np.ceil(n_rows / chunk_size))
# n_rows = np.min([n_chunks * chunk_size, n_rows]) 
# n_rows = chunk_size

# # # Parse text file
# fpath = MAIN_TXT_DATA_DIR / f"{table_name}.txt"

# # drugs = Counter()
# start_time = time.time()
# pieces = []
# for i_chunk in tqdm(range(n_chunks), total=(n_chunks)):
#     row_start = i_chunk * chunk_size
#     row_end = row_start + chunk_size
#     print(row_start, row_end, chunk_size)
    
#     fpath_out = CSV_DATA_DIR / f"{fpath.stem}_{i_chunk}_of_{n_chunks}.parquet"
#     if fpath_out.exists():
#       continue
    
#     # Get Manifest
#     df = pd.read_fwf(
#       fpath, 
#       # index_col=0, 
#       # colspecs=[(start, end) for start, end in zip(manifest_drug["char_position_start"], manifest_drug["char_position_end"])], 
#       widths=manifest_drug["number_of_chars"],
#       nrows=chunk_size,
#       skiprows=i_chunk * chunk_size,
#       header=None,
#     )
    
#     df.columns = manifest_drug["column_name"]
    
#     df.to_parquet(fpath_out)
#     # pieces.append(df)

# end_time = time.time()

# fin_time = end_time - start_time
# fin_time_m = fin_time / 60
# fin_time_h = fin_time_m / 60

# print(f"{fin_time}")
# print(f"{fin_time_m}")
# # print(f"{fin_time_h}")
# # df.columns = manifest_drug["column_name"]
# #   # df = pd.(results, columns=columns_adr)
# # # drugs.update(df["DrecNo"])
# # # df.head()
# # drugs