In [1]:
### prepare
import pandas as pd
import re
import os
import requests
from concurrent.futures import ThreadPoolExecutor

class DataIngestion():
    """Load a CSV index of documents and download the files it points to.

    The CSV is expected to provide (after column-name sanitisation) the
    columns ``name``, ``type``, ``country``, ``sector`` and
    ``communication_on_progress_file``; the last one is used as the download
    URL of each document.

    Attributes set by the methods:
        docs_data: DataFrame built by :meth:`load_docs`.
        downloaded_files: paths fetched by the last :meth:`download_reports`.
    """

    def __init__(self,
                 docs_file = "../../data/unglobalcompact.csv",
                 raw_folder = "../../data/raw_files/") -> None:
        """Remember where the index CSV lives and where downloads should go.

        Args:
            docs_file: path of the CSV index read by :meth:`load_docs`.
            raw_folder: directory that receives the downloaded files.
        """
        self.docs_file = docs_file
        self.raw_folder = raw_folder

    def _sanitize_name(self, name):
        """Return ``name`` lower-cased, restricted to ASCII letters, with
        runs of spaces collapsed to a single underscore.

        Missing values (``None``/NaN) become an empty string and other
        non-string values are converted with ``str`` first, so one odd cell
        in the CSV cannot abort the whole load.
        """
        if not isinstance(name, str):
            missing = name is None or (
                pd.api.types.is_scalar(name) and pd.isna(name))
            name = "" if missing else str(name)
        name = re.sub(r"[^a-zA-Z ]", "", name).lower()
        return re.sub(r"[ ]+", "_", name)

    # Backward-compatible alias for the original (misspelled) method name.
    _sanize_name = _sanitize_name

    def _get_file_atts(self, path):
        """Split a path or URL into directory, base name and extension.

        Everything from the first ``?`` onwards is discarded *before*
        splitting, so a dot or slash inside the query string cannot be
        mistaken for part of the file name.

        Returns:
            dict with keys ``"dir"``, ``"name"`` and ``"ext"`` (``ext``
            includes the leading dot, or is empty).
        """
        clean_path = path.split("?", 1)[0]
        directory = os.path.dirname(clean_path)
        name, extension = os.path.splitext(os.path.basename(clean_path))
        return {"dir" : directory,
                "name" : name,
                "ext" : extension}

    def _get_destination(self, row, dir=None):
        """Build the local file path for one row of the document table.

        The file name is ``<row index>_<sanitised company name><ext>``, where
        ``ext`` is taken from the row's download URL.

        Args:
            row: a row of ``docs_data``; ``row.name`` is its index label and
                ``row["name"]`` is the value of the ``name`` column.
            dir: target directory; ``None`` yields a bare file name.
        """
        ext = self._get_file_atts(row["communication_on_progress_file"])["ext"]
        company = self._sanitize_name(row["name"])
        file_name = "_".join([str(row.name), company]) + ext
        # os.path.join inserts the separator only when `dir` lacks one.
        return os.path.join(dir or "", file_name)

    def load_docs(self):
        """Read the index CSV into ``self.docs_data``.

        Column names are sanitised, rows without a download URL are dropped,
        only the columns used downstream are kept, and a
        ``file_destination`` column with the local target path is added.

        Returns:
            self, so calls can be chained with :meth:`download_reports`.
        """
        docs = pd.read_csv(self.docs_file)
        docs = docs.rename(
            columns={c: self._sanitize_name(c) for c in docs.columns})
        has_url = docs["communication_on_progress_file"].notnull()
        # .copy() so the column assignment below writes to an independent
        # frame rather than to a slice of the one read from disk.
        docs = docs.loc[
            has_url,
            ["name", "type", "country", "sector",
             "communication_on_progress_file"]].copy()
        # Built as a list (not DataFrame.apply) so an empty table still works.
        docs["file_destination"] = [
            self._get_destination(row, dir=self.raw_folder)
            for _, row in docs.iterrows()
        ]
        self.docs_data = docs
        return self

    def _download_file(self, url, file_path, timeout=30):
        """Download ``url`` to ``file_path`` unless that file already exists.

        The body is streamed into ``<file_path>.part`` and moved into place
        only once complete, so an interrupted download is never mistaken for
        a finished file on the next run.

        Args:
            url: address to fetch.
            file_path: local destination.
            timeout: seconds passed to ``requests.get`` as its timeout.

        Returns:
            ``file_path`` when the file was downloaded, otherwise ``None``
            (already present, or the download failed).
        """
        if os.path.exists(file_path):
            print(f"File on '{file_path}' already exists. Skipping download.")
            return None
        partial_path = f"{file_path}.part"
        try:
            os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
            # The context manager releases the connection even on errors.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            os.replace(partial_path, file_path)
            print(f"File on '{file_path}' downloaded successfully.")
            return file_path
        except (requests.RequestException, OSError) as e:
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup; the original error is reported below
            print(f"Error downloading '{file_path}': {e}")
            return None

    def download_reports(self):
        """Download every document listed in ``self.docs_data``.

        Uses up to five worker threads. The paths that were actually fetched
        during this call are stored in ``self.downloaded_files``.

        Returns:
            self.
        """
        url_list = self.docs_data["communication_on_progress_file"].values
        path_list = self.docs_data["file_destination"].values
        with ThreadPoolExecutor(max_workers=5) as executor:
            # Consuming the iterator re-raises unexpected worker exceptions
            # instead of silently discarding them.
            results = list(
                executor.map(self._download_file, url_list, path_list))
        self.downloaded_files = [path for path in results if path is not None]
        return self

In [None]:
# check the file type
# convert
# map converted files to the dataframe

def get_conversion_path(file_destination, dir_name=None):
    """Return the path a file gets once converted to PDF.

    The base name of ``file_destination`` is kept, its extension is replaced
    by ``.pdf``, and the result is placed in ``dir_name`` (by default the
    directory of ``file_destination`` itself).
    """
    target_dir = os.path.dirname(file_destination) if dir_name is None else dir_name
    stem = os.path.splitext(os.path.basename(file_destination))[0]
    return os.path.join(target_dir, stem + ".pdf")

In [None]:
import magic


def _detect_mime_type(path):
    """Return the MIME type libmagic reports for ``path``.

    Returns ``None`` when ``path`` is not an existing file (for example
    because its download failed), so one missing file does not abort the
    detection for every other row.
    """
    if not isinstance(path, str) or not os.path.isfile(path):
        return None
    return magic.from_file(path, mime=True)


# NOTE(review): `docs` is not defined in any visible cell; presumably it is
# the `docs_data` frame of a DataIngestion instance after load_docs() —
# confirm, and define it explicitly so the notebook survives Restart & Run All.
docs["file_type"] = docs["file_destination"].map(_detect_mime_type)

In [None]:
# Reset the conversion bookkeeping columns for every document.
for bookkeeping_column in ("converted_file_status", "converted_file_destination"):
    docs[bookkeeping_column] = None

# Files that are already PDFs need no conversion: their "converted" file is
# simply the downloaded file.
pdf_rows = docs["file_type"] == "application/pdf"
docs.loc[pdf_rows, "converted_file_destination"] = docs.loc[pdf_rows, "file_destination"]

# Index labels of the rows that still lack a converted file.
conv_rows_ind = docs.index[docs["converted_file_destination"].isnull()]

In [7]:
# Convert every document that is not already a PDF with headless LibreOffice
# and record the exit status and the path of the resulting PDF.
import subprocess

for ind in conv_rows_ind:
    file_destination = docs.loc[ind, "file_destination"]
    converted_file_destination = get_conversion_path(file_destination)
    # An empty --outdir would be rejected, so fall back to the current dir.
    out_dir = os.path.dirname(converted_file_destination) or "."
    # Argument list executed without a shell: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    cmd = ["libreoffice", "--headless", "--convert-to", "pdf",
           file_destination, "--outdir", out_dir]
    try:
        docs.loc[ind, "converted_file_status"] = subprocess.check_call(cmd)
        # NOTE(review): if the source file already carries a .pdf extension
        # the converted path equals the source path, so this check passes
        # even when nothing was converted — confirm whether that can occur.
        if os.path.exists(converted_file_destination):
            docs.loc[ind, "converted_file_destination"] = converted_file_destination
    except (subprocess.CalledProcessError, OSError) as exc:
        print("Conversion of the file {} failed.".format(file_destination))
        print("  Reason: {}".format(exc))
    finally:
        # Clean up any lingering LibreOffice process after every attempt,
        # including failed ones.
        os.system("pkill soffice.bin")
