In [22]:
### prepare
import pandas as pd
import re
import os
import requests
from concurrent.futures import ThreadPoolExecutor
import magic, os

class DataIngestion():
    
    def __init__(self,
                docs_file = "../../data/unglobalcompact.csv",
                raw_folder = "../../data/raw_files/") -> None:
        self.docs_file = docs_file
        self.raw_folder = raw_folder
        pass

    def _sanitize_name(self, name):
        name = re.sub(r"[^a-zA-Z ]","", name).lower()
        return re.sub(r"[ ]+", "_", name)
    
    def _get_file_atts(self, path):
        dir = os.path.dirname(path)
        base_name = os.path.basename(path)
        name, extension = os.path.splitext(base_name)
        return {"dir" : dir,
                "name" : name,
                "ext" : extension.split("?")[0]}
    
    def _get_destination(self, row, dir=None):
        atts_dict = self._get_file_atts(row["communication_on_progress_file"])
        index = str(row.name)
        company =self._sanitize_name(row["name"])
        ext = atts_dict["ext"]
        if dir is None:
            dir = ""
        return dir + "_".join([index, company]) + ext
    
    def load_docs(self):
        self.docs_data = pd.read_csv(self.docs_file)   
        new_columns = {c: self._sanitize_name(c) for c in self.docs_data.columns}
        self.docs_data = self.docs_data.rename(columns=new_columns)
        url_rows = self.docs_data.communication_on_progress_file.notnull()
        self.docs_data = self.docs_data.loc[url_rows,
                ["name","type", "country", "sector", "communication_on_progress_file"]]
        self.docs_data["file_destination"] = self.docs_data\
            .apply(self._get_destination, dir=self.raw_folder, axis=1)
        return self
        
    def _download_file(self, url, file_path):
        if os.path.exists(file_path):
            print(f"File on '{file_path}' already exists. Skipping download.")
            return None
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"File on '{file_path}' downloaded successfully.")
            return file_path
        except requests.RequestException as e:
            print(f"Error downloading '{file_path}': {e}.")
            return None

    def download_reports(self):
        url_list = self.docs_data["communication_on_progress_file"].values
        path_list = self.docs_data["file_destination"].values
        with ThreadPoolExecutor(max_workers=5) as executor:
            executor.map(self._download_file, url_list, path_list)
        return self
    
    def _check_path(self,path):
        if os.path.exists(path):
            return path
        else:
            return None

    def _get_conversion_path(self,file_destination, dir_name=None):
        if dir_name==None:
            dir_name = os.path.dirname(file_destination)
        base_name = os.path.basename(file_destination)
        name, extension = os.path.splitext(base_name)
        return os.path.join(*[dir_name, name+".pdf"])

    def _get_metadata(self):
        self.docs_data["file_type"] = self.docs_data\
            .apply(lambda x:magic.from_file(x["file_destination"],mime=True), axis=1)
        self.docs_data["file_size"] = self.docs_data\
            .apply(lambda x:os.path.getsize(x["file_destination"])/10**6, axis=1)
        self.docs_data["converted_file_destination"] = None
        pdf_rows = self.docs_data.file_type=="application/pdf"
        self.docs_data.loc[pdf_rows, "converted_file_destination"] = self.docs_data.loc[pdf_rows, "file_destination"]
        conv_rows_ind = self.docs_data.loc[
            (self.docs_data.converted_file_destination.isnull())
            & (~self.docs_data.file_type.isin(["inode/x-empty", "application/octet-stream"]))
            & (self.docs_data.file_size<90),].index
        self.docs_data["conversion"] = False
        self.docs_data.loc[conv_rows_ind, "conversion"] = True
        return self

    def _convert_row(self, ind):
        import subprocess

        file_destination = self.docs_data.loc[ind, "file_destination"]
        converted_file_destination = self._get_conversion_path(file_destination)
        # NOTE: make sure this works
        if os.path.exists(converted_file_destination):
            print(f"File on '{converted_file_destination}' already exists. Skipping conversion.")
            self.docs_data.loc[ind, "converted_file_destination"] = converted_file_destination
        try:
            cmd = f"libreoffice --headless --convert-to pdf {file_destination}"
            status = subprocess.run(cmd, capture_output=True).stdout
            if "error" in status.lower():
                raise Exception(status)
            self.docs_data.loc[ind, "converted_file_destination"] = converted_file_destination
            print(f"File on '{converted_file_destination}' converted successfully.")
        except Exception as e:
            print(f"Error converting '{file_destination}': {e}.")
        return self

    def convert_reports(self):
        self = self._get_metadata()
        for ind in self.docs_data.loc[self.docs_data.conversion,].index[:5]:
            self = self._convert_row(ind)
        # add checks for paths
        return self    

In [23]:
Ingest = DataIngestion().load_docs().download_reports().convert_reports()
Ingest.docs_data.head()

File on '../../data/raw_files/131_deloitte_ksa.pdf' already exists. Skipping download.File on '../../data/raw_files/140_khne_nagel_gesellschaft_mbh.pdf' already exists. Skipping download.

File on '../../data/raw_files/198_pravex_jsc.pdf' already exists. Skipping download.
File on '../../data/raw_files/209_south_pole_australia.pdf' already exists. Skipping download.
File on '../../data/raw_files/319_oberthur_fiduciaire_ad.pdf' already exists. Skipping download.
File on '../../data/raw_files/322_skanska_property_poland.pdf' already exists. Skipping download.
File on '../../data/raw_files/255_saintgobain_austria_gmbh.pdf' already exists. Skipping download.
File on '../../data/raw_files/380_navistar.pdf' already exists. Skipping download.
File on '../../data/raw_files/563_astrazeneca.pdf' already exists. Skipping download.
File on '../../data/raw_files/621_sap_canada_inc.pdf' already exists. Skipping download.
File on '../../data/raw_files/453_m_mexico.pdf' already exists. Skipping downlo

Unnamed: 0,name,type,country,sector,communication_on_progress_file,file_destination,file_type,file_size,converted_file_destination,conversion
131,Deloitte KSA,Company,Saudi Arabia,Diversified,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/131_deloitte_ksa.pdf,application/pdf,0.719639,../../data/raw_files/131_deloitte_ksa.pdf,False
140,Kühne + Nagel Gesellschaft m.b.H.,Company,Austria,Diversified,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/140_khne_nagel_gesellscha...,application/pdf,5.052755,../../data/raw_files/140_khne_nagel_gesellscha...,False
198,Pravex JSC,Company,Ukraine,Banks,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/198_pravex_jsc.pdf,application/pdf,7.298954,../../data/raw_files/198_pravex_jsc.pdf,False
209,South Pole Australia,SME,Australia,Diversified,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/209_south_pole_australia.pdf,application/pdf,15.260722,../../data/raw_files/209_south_pole_australia.pdf,False
255,Saint-Gobain Austria GmbH,Company,Austria,Construction & Materials,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/255_saintgobain_austria_g...,application/pdf,1.211703,../../data/raw_files/255_saintgobain_austria_g...,False


In [24]:
Ingest.docs_data.loc[Ingest.docs_data.conversion,]

Unnamed: 0,name,type,country,sector,communication_on_progress_file,file_destination,file_type,file_size,converted_file_destination,conversion
2855,EPAM Systems Ltd,Company,United Kingdom,Software & Computer Services,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/2855_epam_systems_ltd.docx,application/vnd.openxmlformats-officedocument....,0.061483,,True
3766,Tata Consultancy Services (Thailand) Limited,SME,Thailand,Software & Computer Services,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/3766_tata_consultancy_ser...,application/vnd.openxmlformats-officedocument....,0.016087,,True
3843,EPAM Systems Mexico S DE RL DE CV,Company,Mexico,Software & Computer Services,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/3843_epam_systems_mexico_...,application/vnd.openxmlformats-officedocument....,0.061483,,True
4710,Fundacja Obywatele Natury,NGO Local,Poland,Not Applicable,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/4710_fundacja_obywatele_n...,application/msword,0.039936,,True
5287,Vantaux InternationalBD Co.,SME,Bangladesh,Industrial Support Services,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/5287_vantaux_internationa...,message/rfc822,0.327417,,True
...,...,...,...,...,...,...,...,...,...,...
20665,Croatian Business Council for Sustainable Deve...,Business Association Local,Croatia,Not Applicable,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/20665_croatian_business_c...,application/vnd.openxmlformats-officedocument....,0.061763,,True
20820,Global Calcium Pvt Ltd,SME,India,Pharmaceuticals & Biotechnology,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/20820_global_calcium_pvt_...,application/msword,0.145408,,True
20883,"Banco Sabadell, S.A",Company,Spain,Finance and credit services,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/20883_banco_sabadell_sa.7z,application/x-7z-compressed,9.218664,,True
20963,Daudsons Armoury (Pvt) Limited,SME,Pakistan,Aerospace & Defense,https://ungc-production.s3.us-west-2.amazonaws...,../../data/raw_files/20963_daudsons_armoury_pv...,application/msword,0.081920,,True


In [3]:
# check the file type
# convert
# map converted files to the dataframe

def _check_path(self,path):
    if os.path.exists(path):
        return path
    else:
        return None

def _get_conversion_path(self,file_destination, dir_name=None):
    if dir_name==None:
        dir_name = os.path.dirname(file_destination)
    base_name = os.path.basename(file_destination)
    name, extension = os.path.splitext(base_name)
    return os.path.join(*[dir_name, name+".pdf"])

In [11]:

def _check_path(self,path):
    if os.path.exists(path):
        return path
    else:
        return None

def _get_conversion_path(self,file_destination, dir_name=None):
    if dir_name==None:
        dir_name = os.path.dirname(file_destination)
    base_name = os.path.basename(file_destination)
    name, extension = os.path.splitext(base_name)
    return os.path.join(*[dir_name, name+".pdf"])

def _get_metadata(self):
    self.docs_data["file_type"] = self.docs_data\
        .apply(lambda x:magic.from_file(x["file_destination"],mime=True), axis=1)
    self.docs_data["file_size"] = self.docs_data\
        .apply(lambda x:os.path.getsize(x["file_destination"])/10**6, axis=1)
    self.docs_data["converted_file_destination"] = None
    pdf_rows = self.docs_data.file_type=="application/pdf"
    self.docs_data.loc[pdf_rows, "converted_file_destination"] = self.docs_data.loc[pdf_rows, "file_destination"]
    conv_rows_ind = self.docs_data.loc[
        (self.docs_data.converted_file_destination.isnull())
        & (~self.docs_data.file_type.isin(["inode/x-empty", "application/octet-stream"]))
        & (self.docs_data.file_size<90),].index
    self.docs_data["conversion"] = False
    self.docs_data.loc[conv_rows_ind, "conversion"] = True
    return self

# remove references to self
def _convert_row(self, ind):
    file_destination = self.docs_data.loc[ind, "file_destination"]
    converted_file_destination = self._get_conversion_path(file_destination)
    if os.path.exists(converted_file_destination):
        print(f"File on '{converted_file_destination}' already exists. Skipping conversion.")
        self.docs_data.loc[ind, "converted_file_destination"] = converted_file_destination
    try:
        os.system(f"libreoffice --headless --convert-to pdf {os.path.dirname(file_destination)}")
        self.docs_data.loc[ind, "converted_file_destination"] = converted_file_destination
        print(f"File on '{converted_file_destination}' converted successfully.")
    except Exception as e:
        print(f"Error converting '{file_destination}': {e}.")
    return self

def convert_reports(self):
    self = self._get_metadata()
    for ind in self.docs_data.loc[self.docs_data.conversion,].index:
        self = self._convert_row(ind)
    return self

In [12]:
# convert file destination to new destination with pdf extension
for ind in conv_rows_ind:

    file_destination = docs.loc[ind, "file_destination"]
    converted_file_destination = get_conversion_path(file_destination)
    
    # check if the converted file already exists
    if os.path.exists(converted_file_destination):
        print(f"File {converted_file_destination} already exists. Skipping")
        docs.loc[ind,"converted_file_destination"] = converted_file_destination
        continue
        
    # if not exists, convert
    else:
        os.system("pkill soffice.bin")  
        try:
            os.system(f"libreoffice --headless --convert-to pdf {os.path.dirname(file_destination)}")
            docs.loc[ind,"converted_file_destination"] = converted_file_destination 
        except:
            print(f"Conversion of the file {file_destination} failed.")
            continue
      


name                                                               EPAM Systems Ltd
type                                                                        Company
country                                                              United Kingdom
sector                                                 Software & Computer Services
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                    ../../data/raw_files/2855_epam_systems_ltd.docx
file_type                         application/vnd.openxmlformats-officedocument....
file_size                                                                  0.061483
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                0.0
ConvertedFileName                    ../../data/raw_files/2855_epam_systems_

Error: source file could not be loaded


name                                                                Lefebvre Sarrut
type                                                                        Company
country                                                                      France
sector                                                  Industrial Support Services
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                      ../../data/raw_files/6944_lefebvre_sarrut.doc
file_type                                                        application/msword
file_size                                                                      0.32
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                0.0
ConvertedFileName                     ../../data/raw_files/6944_lefebvre_sar



convert /mnt/Data/git_root/sustainability-reports-industry-analysis/data/raw_files/11622_establecimientos_gastronomicos_asian_food_chile_ltda.docx -> /mnt/Data/git_root/sustainability-reports-industry-analysis/code/ingestion_processing/11622_establecimientos_gastronomicos_asian_food_chile_ltda.pdf using filter : writer_pdf_Export
name                                  Imperial Movement For Justice And Development
type                                                                      NGO Local
country                                                                Sierra Leone
sector                                                               Not Applicable
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                  ../../data/raw_files/11639_imperial_movement_f...
file_type                         application/vnd.openxmlformats-officedocument....
file_size                                                                  0.054

Error: source file could not be loaded


name                                        Young Fishers Global Network Initiative
type                                                                     NGO Global
country                                                                     Nigeria
sector                                                               Not Applicable
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                  ../../data/raw_files/16005_young_fishers_globa...
file_type                         application/vnd.openxmlformats-officedocument....
file_size                                                                  0.046532
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                0.0
ConvertedFileName                 ../../data/raw_files/16005_young_fishers_g

Error: source file could not be loaded


name                                                    Chubb Seguros Ecuador S.A.,
type                                                                            SME
country                                                                     Ecuador
sector                                                            Nonlife Insurance
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                  ../../data/raw_files/18445_chubb_seguros_ecuad...
file_type                         application/vnd.openxmlformats-officedocument....
file_size                                                                  3.573656
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                NaN
ConvertedFileName                                                           



name                                      Renewable Energy Development Organization
type                                                                     NGO Global
country                                                                      Kuwait
sector                                                               Not Applicable
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                  ../../data/raw_files/19209_renewable_energy_de...
file_type                                                                image/jpeg
file_size                                                                  0.711527
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                NaN
ConvertedFileName                                                           

Error: source file could not be loaded


name                                                 Daudsons Armoury (Pvt) Limited
type                                                                            SME
country                                                                    Pakistan
sector                                                          Aerospace & Defense
communication_on_progress_file    https://ungc-production.s3.us-west-2.amazonaws...
file_destination                  ../../data/raw_files/20963_daudsons_armoury_pv...
file_type                                                        application/msword
file_size                                                                   0.08192
converted_file_status                                                          None
converted_file_destination                                                     None
ConversionStatus                                                                NaN
ConvertedFileName                                                           