In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install dbfread

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dbfread
  Downloading dbfread-2.0.7-py2.py3-none-any.whl (20 kB)
Installing collected packages: dbfread
Successfully installed dbfread-2.0.7


In [None]:
import zipfile
import os
from os.path import exists
from pathlib import Path
import urllib.request
from dbfread import DBF
import pandas as pd

In [None]:
path = '/content/drive/MyDrive/files.txt'
download_path = '/content/drive/MyDrive/medcine'


In [3]:
def get_links(path):

  """
    Reads a file containing the list of data files
    Arguments:
        path: file path of the file
    Returns:
        List of urls to be downloaded
  """

  url_file = open(path, "r")
  urls = url_file.read().split('\n')

  return urls

In [5]:
def download(urls,download_path):

  """
    Reads the list of urls and download the files to the given path
    Arguments:
        urls: list of files to be downloaded
        download_path: file path to use for downloading zipped data files
    
  """

  no_of_files = len(urls) 

  for i, u in enumerate(urls):
    full_path = os.path.join(download_path, u.split('/')[-1])
    if exists(full_path): 
      continue
      
    urllib.request.urlretrieve(u, full_path)
    print(f'{u} downloaded successfully! {i+1}/{no_of_files}')

In [7]:
def files_to_extract(file_path):

  """
    Finds files to be unzipped
    Arguments:
        file_path: file paths of files to be unzipped
    Returns:
        List of files to be unzipped
  """

  files = [f for f in os.listdir(file_path) 
                      if (f.endswith('zip') or f.endswith('ZIP'))]

  return files

In [8]:
def conversion(file_path,files):

  """
    Unzip data files and combine them into one parquette file
    Arguments:
        files: list of files to be unzipped
        file_path: file path of the resulting data file 
  """

  results=pd.DataFrame()

  for file in files:

    print(file+' to extract!')

    with zipfile.ZipFile(os.path.join(file_path, file), 'r') as zip_ref:
      
      files_in_zip = zip_ref.namelist()
      data_file = [x for x in files_in_zip if x.find('BETEGSZAMOK_ATC')!=-1][0]
      data = zip_ref.read(data_file)
      path = Path(file_path) / Path(data_file).name
      path.write_bytes(data)
    
    try:
      table = DBF(Path(file_path) / Path(data_file).name,ignore_missing_memofile=True)
      frame = pd.DataFrame(iter(table))

    except UnicodeDecodeError:
      table = DBF(Path(file_path) / Path(data_file).name,encoding='latin1',ignore_missing_memofile=True)
      frame = pd.DataFrame(iter(table)) 

    filename = file.split('.')[0][:6] ##first 6 digit of the filename
    

    try:
      patient_no_col = [x for x in list(frame) if filename in x ][0]
    except: 
      print('missing column!')
      continue

    
    frame['month'] = patient_no_col.split('_')[1]
    frame.rename(columns = {patient_no_col:'patient_count'}, inplace = True)

    os.remove(Path(file_path) / Path(data_file).name)

    cols_to_drop = [x for x in list(frame) if x not in ['month','ATC','ATCNEV','patient_count']]
    frame.drop(cols_to_drop,axis=1,inplace=True)
    
    results = results.append(frame)
    print(file+' conversion complete!')

  results.to_parquet(os.path.join(file_path, 'patients')+'.parquet')

  print('patients file saved!')

In [9]:
def data_prep(path,download_path):

  """
    Runs the data prep pipeline
    Arguments:
        path: path of the file containing the urls of the files to be downloaded 
        download_path: file path where files to be downloaded and resulting parquette file to be exported
  """

  urls = get_links(path)
  download(urls,download_path)
  files = files_to_extract(download_path)

  conversion(download_path,files)


In [None]:
data_prep(path,download_path)

201512.zip to extract!
201512.zip conversion complete!
201511.zip to extract!
201511.zip conversion complete!
201510.zip to extract!
201510.zip conversion complete!
201509.zip to extract!
201509.zip conversion complete!
201508.zip to extract!
201508.zip conversion complete!
201507.zip to extract!
201507.zip conversion complete!
201506.zip to extract!
201506.zip conversion complete!
201505.zip to extract!
201505.zip conversion complete!
201504.zip to extract!
201504.zip conversion complete!
201503.zip to extract!
201503.zip conversion complete!
201502.zip to extract!
201502.zip conversion complete!
201501.zip to extract!
201501.zip conversion complete!
201612.zip to extract!
201612.zip conversion complete!
201611.zip to extract!
201611.zip conversion complete!
201610.zip to extract!
201610.zip conversion complete!
201609.zip to extract!
201609.zip conversion complete!
201608.zip to extract!
201608.zip conversion complete!
201607.zip to extract!
201607.zip conversion complete!
201606.zip