### **DATA PERSISTANCE LOADER**

Import the necessary packages

In [1]:
# Avro
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

import subprocess
import pandas as pd
from io import StringIO
from datetime import datetime
import json
import re
import os
import logging
import time

Set path of project

In [2]:
path = '/home/bdm/DataImporta/P1/production/VM/'
hadoop = '~/BDM_Software/hadoop/bin/hadoop '

Set up log file

In [3]:
logger = logging.getLogger()
if (logger.hasHandlers()):
    logger.handlers.clear()
handler = logging.FileHandler('logfile.log')
logger.addHandler(handler)

Check how many files need to be uploaded into the persistence zone (the count considering all countries)

In [5]:
# HDFS command to find count of directories and files inside a directory
count_files = subprocess.run(hadoop + 'fs -count /user/bdm/temporal', capture_output=True, shell=True).stdout.decode()
# From the answer, get the count of files (note that this is the count for ALL countries)
count_files = re.findall('\d*\s*(\d*)\s*\d*\s/', count_files)
count_files = count_files[0]
print('# of files to upload: '+count_files)
logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
logger.error('# of files to upload: '+count_files)

# of files to upload: 66


Countries to upload

In [6]:
# Countries to get data from
countries = ["peru", "chile", "brazil"]
# Countries that have different metadata for imports and exports
countries_meta = ["chile"]

Check which directories have data in the temporal landing zone and get those file names

Remove .DS_Store files

In [8]:
for country in countries:
    for dir in ['imp', 'exp', 'metadata','metadata/imp', 'metadata/exp']:
        s = subprocess.run(hadoop + 'fs -rm /user/bdm/temporal/'+country+'/'+dir+'/.DS_Store', capture_output=True, shell=True)




KeyboardInterrupt: 

In [9]:
# Dictionary to store the file names
filenames = {}
for country in countries:
    # For countries that have different metadata for imports and exports
    if country in countries_meta:
        filenames[country] = {"imports":[], "exports":[], "metadata":{"imports":[], "exports":[]}}
    # For countries that have the same metadata for imports and exports
    else:
        filenames[country] = {"imports":[], "exports":[], "metadata":[]}

# Iterate trough each country directory
for country in countries:

    # Get the content of the directories of the country
    # imports
    imports = subprocess.run(hadoop + 'fs -ls  /user/bdm/temporal/'+country+'/imp', capture_output=True, shell=True).stdout.decode()
    # exports
    exports= subprocess.run(hadoop + 'fs -ls  /user/bdm/temporal/'+country+'/exp', capture_output=True, shell=True).stdout.decode()
    # metadata
    if country in countries_meta:
        metadata_imports = subprocess.run(hadoop + ' fs -ls  /user/bdm/temporal/'+country+'/metadata/imp', capture_output=True, shell=True).stdout.decode()
        metadata_exports = subprocess.run(hadoop + ' fs -ls  /user/bdm/temporal/'+country+'/metadata/exp', capture_output=True, shell=True).stdout.decode()
    else:
        metadata = subprocess.run(hadoop + 'fs -ls  /user/bdm/temporal/'+country+'/metadata', capture_output=True, shell=True).stdout.decode()
    
    # Get the names of the files (if any) existing in those directories
     # imports
    #imports_files = re.findall('/.*/.*/.*/.*/.*/(.*)', imports)
    imports_files = re.findall('/user/bdm/temporal/'+country+'/imp/(.*)', imports)

     # exports
    exports_files = re.findall('/user/bdm/temporal/'+country+'/exp/(.*)', exports)

    # metadata
    if country in countries_meta:
        metadata_import_files =re.findall('/user/bdm/temporal/'+country+'/metadata/imp/(.*)', metadata_imports)
        metadata_export_files =re.findall('/user/bdm/temporal/'+country+'/metadata/exp/(.*)', metadata_exports)
    else:
        metadata_files =re.findall('/user/bdm/temporal/'+country+'/metadata/(.*)', metadata)


    # Save those names in the variable "file"
    #   For imports
    for import_file in imports_files:
        filenames[country]['imports'].append(import_file)
    #   For exports
    for export_file in exports_files:
        filenames[country]['exports'].append(export_file)
    #   For metadata
    if country in countries_meta:
        # imports
        for metadata_import_file in metadata_import_files:
            filenames[country]['metadata']['imports'].append(metadata_import_file)
        # exports
        for metadata_export_file in metadata_export_files:
            filenames[country]['metadata']['exports'].append(metadata_export_file)
    else:
        for metadata_file in metadata_files:
            filenames[country]['metadata'].append(metadata_file)

Now that we have the files to be uploaded, we need to know to which year do they correspond

In [10]:
filename=filenames['peru']["imports"][2]
filenames_year = {}
for country in countries:
    # For countries that have different metadata for imports and exports
    if country in countries_meta:
        filenames_year[country] = {"imports":{}, "exports":{}, "metadata":{"imports":{}, "exports":{}}}
    # For countries that have the same metadata for imports and exports
    else:
        filenames_year[country] = {"imports":{}, "exports":{}, "metadata":{}}


In [11]:
# Dictionary with the file names and the corresponding years
filenames_year = {}
for country in countries:
    # For countries that have different metadata for imports and exports
    if country in countries_meta:
        filenames_year[country] = {"imports":{}, "exports":{}, "metadata":{"imports":{}, "exports":{}}}
    # For countries that have the same metadata for imports and exports
    else:
        filenames_year[country] = {"imports":{}, "exports":{}, "metadata":{}}

    # Get years of files for Peru
    if country == "peru":
        # Imports
        for filename in filenames[country]["imports"]:
            filenames_year[country]["imports"][filename] = '20'+ re.search('(.{2})(?=.csv)', filename).group(1)
        # Exports
        for filename in filenames[country]["exports"]:
            filenames_year[country]["exports"][filename] = '20'+ re.search('(.{2})(?=.csv)', filename).group(1)
        # Metadata
        for filename in filenames[country]["metadata"]:
            filenames_year[country]["metadata"][filename] = datetime.now().date().strftime("%Y")

    # Get years of files for Chile
    if country == "chile":
        # Imports
        for filename in filenames[country]["imports"]:
            filenames_year[country]["imports"][filename] = re.search('(\d*)(?=.txt)', filename).group(1)
        # Exports
        for filename in filenames[country]["exports"]:
            filenames_year[country]["exports"][filename] = re.search('(\d*)(?=.txt)', filename).group(1)
        # Metadata
        # imports
        for filename in filenames[country]["metadata"]["imports"]:
            filenames_year[country]["metadata"]["imports"][filename] = datetime.now().date().strftime("%Y")
        # exports
        for filename in filenames[country]["metadata"]["exports"]:
            filenames_year[country]["metadata"]["exports"][filename] = datetime.now().date().strftime("%Y")
    
    # Get years of files for Brazil
    if country == "brazil":
        # Imports
        for filename in filenames[country]["imports"]:
            filenames_year[country]["imports"][filename] = re.search('_(\d{4})_', filename).group(1)
        # Exports
        for filename in filenames[country]["exports"]:
            filenames_year[country]["exports"][filename] = re.search('_(\d{4})_', filename).group(1)
         # Metadata
        for filename in filenames[country]["metadata"]:
            filenames_year[country]["metadata"][filename] = datetime.now().date().strftime("%Y")
            


Get the data years by country:

In [12]:
years = {}
# Get all the years existing in filenames_year
for country in countries:
    years[country] = []
    # imports
    for file, year in filenames_year[country]["imports"].items():
        years[country].append(year)
    # exports
    for file, year in filenames_year[country]["exports"].items():
        years[country].append(year)
    # metadata
    if country in countries_meta:
        # imports
        for file, year in filenames_year[country]["metadata"]["imports"].items():
            years[country].append(year)
        # exports
        for file, year in filenames_year[country]["metadata"]["exports"].items():
            years[country].append(year)
    else:
        for file, year in filenames_year[country]["metadata"].items():
            years[country].append(year)

# Remove duplicated years from dict
for country in years:   
    years[country] = list(set(years[country]))
years


{'peru': ['2022'], 'chile': ['2022'], 'brazil': ['2022']}

Check if the persistent zone already has the corresponding "years" directories to upload the files into, if not, create them:

In [13]:
for country in countries:
    for year in years[country]:

        # imports
        years_in_persistent = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/imp/'+year , capture_output=True, shell=True).stdout.decode()
        years_in_persistent = re.findall('/.*/(.*)\r', years_in_persistent)
        # if the year directory does not exist in the persistent folder, create it
        if year not in years_in_persistent:
            subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/imp/'+year , capture_output=True, shell=True)

        # exports
        years_in_persistent = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/exp/'+year , capture_output=True, shell=True).stdout.decode()
        years_in_persistent = re.findall('/.*/(.*)\r', years_in_persistent)
        # if the year directory does not exist in the persistent folder, create it
        if year not in years_in_persistent:
            subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/exp/'+year , capture_output=True, shell=True)

        # metadata
        if country in countries_meta:

            # imports
            years_in_persistent = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/metadata/imp/'+year , capture_output=True, shell=True).stdout.decode()
            years_in_persistent = re.findall('/.*/(.*)\r', years_in_persistent)
            # if the year directory does not exist in the persistent folder, create it
            if year not in years_in_persistent:
                subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/metadata/imp/'+year , capture_output=True, shell=True)

            # exports
            years_in_persistent = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/metadata/exp/'+year , capture_output=True, shell=True).stdout.decode()
            years_in_persistent = re.findall('/.*/(.*)\r', years_in_persistent)
            # if the year directory does not exist in the persistent folder, create it
            if year not in years_in_persistent:
                subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/metadata/exp/'+year , capture_output=True, shell=True)

        else:
            
            years_in_persistent = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/metadata/'+year , capture_output=True, shell=True).stdout.decode()
            years_in_persistent = re.findall('/.*/(.*)\r', years_in_persistent)
            # if the year directory does not exist in the persistent folder, create it
            if year not in years_in_persistent:
                subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/metadata/'+year , capture_output=True, shell=True)

Now let's start the upload. Since we will save the files in AVRO format, they must be first converted and only then uploaded to the corresponding directory. Each country will have a different upload pipeline, given the differences of the data

*Peru loading pipeline*

In [14]:
# ------------------------------------------
# PERU (from temporal to persitent in AVRO)
# ------------------------------------------
def load_peru(filenames_year, path):
    
    country = "peru"

    # initial file name inside a directory of metadata or imports in the persistent zone
    initial_name = "version0.avro"

    # paths to folders
    folders = {"imports":"imp", "exports":"exp", "metadata":"metadata"}
    
    # Lists of times
    times_imp_exp =[]
    times_metadata =[]

    # iterate trough every folder
    for folder, folderpath in folders.items():

            # ---------------------
            # IMPORTS & EXPORTS
            # ---------------------
            if folder == 'imports' or folder == 'exports':

                # Iterate trough every import/export file
                for filename, year in filenames_year[country][folder].items():
                    
                    # Starting time of the conversion and uploading of this file
                    start_time = time.time()

                    # Print status
                    print("Working on: "+country+" | "+folder+" | "+filename+'...')
                    logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                    logger.error("Working on: "+country+" | "+folder+" | "+filename+'...')

                    # Check if there is any persistent file to append the temporal file's data into (e.g. the most recent version). If there is no persistent file, create one (version0.avro)
                    all_versions = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/'+folderpath+'/'+year, capture_output=True, shell=True).stdout.decode()

                    # If no persistent file, create one
                    if all_versions == '':
                        # convert the temporal file to AVRO
                        # - first retrieve the data and convert it into a dataframe
                        data = subprocess.run(hadoop + ' fs -cat  /user/bdm/temporal/'+country+'/'+folderpath+'/'+filename, capture_output=True, shell=True, encoding="latin-1").stdout
                        data = pd.read_csv(StringIO(data))
                        # - then create the AVRO schema
                        # -- get the column names and create the fields argument for the schema
                        fields = []
                        for col in list(data.columns):
                            fields.append({"name":col, "type": "string"})
                        # -- complete the schema with the desired information
                        schema = {
                            "doc": datetime.now().strftime("%H%M%S%m%d%Y"),
                            "name": "trade_item",
                            "namespace": country+"_"+folder,
                            "type": "record",
                            "fields": fields
                        }
                        schema = json.dumps(schema)
                        schema = avro.schema.parse(schema)
                        # - mutate the records to AVRO format
                        records = data.to_dict('records')
                        for dicts in records:
                            # ensure all the data are strings
                            for keys in dicts:
                                dicts[keys] = str(dicts[keys])
                        # - create the AVRO file with the data in the local system
                        writer = DataFileWriter(open(path+initial_name, "wb"), DatumWriter(), schema)
                        # - append each record into the file
                        for record in records:
                            writer.append(record)
                        writer.close()
                    
                        # after creating the AVRO file, upload it into HDFS
                        load_hdfs = subprocess.run(hadoop + 'fs -put '+ path + initial_name + ' /user/bdm/persistent/'+country+'/'+folderpath+'/'+year, capture_output=True, shell=True)
                        # print status
                        if load_hdfs.returncode == 0 :
                            print('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            
                        else: 
                            print("An error occured, the file could not be uploaded")
                            print(load_hdfs)
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error("An error occured, the file could not be uploaded")
                            logger.error(load_hdfs)
                        # delete the file in local
                        os.remove(path+initial_name)

                        # add the time used for this step
                        times_imp_exp.append(['First occurance', time.time() - start_time]) 


                    # if there is a persistent file (e.g. a previous version)
                    if all_versions != '':
                        # get a list with all the versions
                        all_versions = re.findall('/.*/(.*)\r', all_versions)
                        # get the most recent version
                        most_recent_version = initial_name
                        for version in all_versions:
                            if re.search('(\d*)(?=.avro)', version).group(1) > re.search('(\d*)(?=.avro)', most_recent_version).group(1):
                                most_recent_version = version
                        # retrieve that version's file from hadoop
                        old_avro = subprocess.run([hadoop + '', 'fs', '-get',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+most_recent_version, path], capture_output=True, shell=True)
                        # get old records (this step can be omited if we only append the new records into the file directly)
                        avro_records = []
                        reader = DataFileReader(open(path+most_recent_version, "rb"), DatumReader())
                        for record in reader:
                            avro_records.append(record)
                        # get old schema
                        schema = json.loads(reader.schema)
                        reader.close()
                        # modify schema
                        schema["doc"] = datetime.now().strftime("%H%M%S%m%d%Y")
                        schema = json.dumps(schema)
                        schema = avro.schema.parse(schema)
                        # delete the file in local
                        os.remove(path+most_recent_version)
                        # convert the temporal file to AVRO
                        # - first retrieve the data and convert it into a dataframe
                        data = subprocess.run(hadoop + ' fs -cat  /user/bdm/temporal/'+country+'/'+folderpath+'/'+filename, capture_output=True, shell=True, encoding="latin-1").stdout
                        data = pd.read_csv(StringIO(data))
                        # - mutate the records to AVRO format
                        new_records = data.to_dict('records')
                        for dicts in new_records:
                            # ensure all the data are strings
                            for keys in dicts:
                                dicts[keys] = str(dicts[keys])
                        # append the new records into the old ones
                        for record in new_records:
                            avro_records.append(record)
                        # define name for the AVRO file to upload
                        final_name = "version"+str(int(re.search('(\d*)(?=.avro)', most_recent_version).group(1))+1)+'.avro'
                        # create the AVRO file with the data in the local system
                        writer = DataFileWriter(open(path+final_name, "wb"), DatumWriter(), schema)
                        # append each record into the file
                        for record in avro_records:
                            writer.append(record)
                        writer.close()
                        # after creating the AVRO file, upload it into HDFS
                        load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+final_name,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year], capture_output=True, shell=True)
                        # print status
                        if load_hdfs.returncode == 0 :
                            print('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            # HERE TE CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT
                        else: 
                            print('An error occured, update not performed')
                            print(load_hdfs)
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('An error occured, update not performed')
                            logger.error(load_hdfs)
                        # delete the file in local
                        os.remove(path+final_name)

                        # add the time used for this step
                        times_imp_exp.append([final_name, time.time() - start_time]) 

            
            # ---------------------
            # METADATA
            # ---------------------
            elif folder == "metadata":
                
                # Starting time of the conversion and uploading of this file
                start_time = time.time()

                # If there is medatata to be uploaded, create a directory in the metadata directory in the temporal zone to store the files. The name is %H%M%S%m%d%Y (time)
                if filenames_year[country][folder].items():
                    metadata_directory = datetime.now().strftime("%H%M%S%m%d%Y")
                    # get the year of the any of the files (all have the same year, we will take the first one)
                    year = list(filenames_year[country][folder].items())[0][1]
                    # create the folder
                    create_directory = subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory, capture_output=True, shell=True, encoding="latin-1")
                    if create_directory.returncode == 0 :
                        # print success status
                        print('A directory named '+metadata_directory+' has been created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('A directory named '+metadata_directory+' has been created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        
                        # Iterate trough every metadata file
                        for filename, year in filenames_year[country][folder].items():
                            # print working on
                            print("Working on: "+country+" | "+folder+" | "+filename+'...')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error("Working on: "+country+" | "+folder+" | "+filename+'...')
                            # CONVERT FILE TO AVRO
                            # get temporal file (for Peru, the files are in txt)
                            temporal_file = subprocess.run(hadoop + ' fs -cat  /user/bdm/temporal/'+country+'/'+folderpath+'/'+filename, capture_output=True, shell=True, encoding="latin-1").stdout
                            # convert it to dataframe (the headers use another separator, and thus, we can´t just directly import them, that's why we skip them)
                            data = pd.read_table(StringIO(temporal_file), skiprows = 1, header=None)
                            # here we process the headers in a special way
                            headers = pd.read_table(StringIO(temporal_file), nrows=1, header=None)
                            headers['concatenated'] = headers.apply(lambda x: ' '.join(x.dropna().values.tolist()), axis=1)
                            headers = headers['concatenated'].to_frame()
                            headers = headers['concatenated'].values[0].split()
                            headers_dict = {}
                            for i in range(len(headers)):
                                headers_dict[i] = headers[i]
                            # finally we add the headers to the dataframe
                            data.rename(columns=headers_dict, inplace=True)
                            # then create the AVRO schema
                            # - get the column names and create the fields argument for the schema
                            fields = []
                            for col in list(data.columns):
                                fields.append({"name":col, "type": "string"})
                            # - complete the schema with the desired information
                            filename_without_extension = filename[:filename.index(".")]
                            schema = {
                                "doc": datetime.now().strftime("%H%M%S%m%d%Y"),
                                "name": filename_without_extension,
                                "namespace": country+"_"+folder,
                                "type": "record",
                                "fields": fields
                            }
                            schema = json.dumps(schema)
                            schema = avro.schema.parse(schema)
                            # mutate the records to AVRO format
                            records = data.to_dict('records')
                            for dicts in records:
                                # ensure all the data are strings
                                for keys in dicts:
                                    dicts[keys] = str(dicts[keys])
                            # create the AVRO file with the data in the local system
                            writer = DataFileWriter(open(path+filename_without_extension+'.avro', "wb"), DatumWriter(), schema)
                            # append each record into the file
                            for record in records:
                                writer.append(record)
                            writer.close()
                        
                            # after creating the AVRO file, upload it into HDFS
                            load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+filename_without_extension+'.avro',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory], capture_output=True, shell=True)
                            # print status
                            if load_hdfs.returncode == 0 :
                                print('File ' + filename_without_extension+'.avro was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory)
                                logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                                logger.error('File ' + filename_without_extension+'.avro was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory)
                                # HERE THE LINE OF CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT
                            else: 
                                print('An error occured, the file could not be created')
                                logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                                logger.error('An error occured, the file could not be created')
                                logger.error(load_hdfs)
                            # delete the file in local
                            os.remove(path+filename_without_extension+'.avro')

                    else: 
                        # if there was an error creating the directory
                        print('An error occured, the directory could not be created')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, the directory could not be created')
                        logger.error(load_hdfs)  
                
                        # add the time used for this step
                        times_metadata.append(time.time() - start_time) 
                        
    return [times_imp_exp, times_metadata]


In [15]:
load_peru(filenames_year, path)

Working on: peru | imports | ma07130222.csv...


AttributeError: 'DataFileReader' object has no attribute 'schema'

*Chile loading pipeline*

In [61]:
# ------------------------------------------
# CHILE (from temporal to persitent in AVRO)
# ------------------------------------------
def load_chile(filenames_year, path):

    country = "chile"
    
    # initial file name inside a directory of metadata or imports in the persistent zone
    initial_name = "version0.avro"

    # paths to folders
    folders = {"imports":"imp", "exports":"exp", "metadata":"metadata"}

    # Lists of times
    times_imp_exp =[]
    times_metadata =[]

    # iterate trough every folder
    for folder, folderpath in folders.items():

        # ---------------------
        # IMPORTS & EXPORTS
        # ---------------------
        if folder == 'imports' or folder == 'exports':

            # Iterate trough every import/export file
            for filename, year in filenames_year[country][folder].items():
                
                # Starting time of the conversion and uploading of this file
                start_time = time.time()
                
                # Print status
                print("Working on: "+country+" | "+folder+" | "+filename+'...')
                logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                logger.error("Working on: "+country+" | "+folder+" | "+filename+'...')

                # Check if there is any persistent file to append the temporal file's data into (e.g. the most recent version). If there is no persistent file, create one (version0.avro)
                all_versions = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/'+folderpath+'/'+year, capture_output=True, shell=True).stdout.decode()
                
                # If no persistent file, create one
                if all_versions == '':
                    # convert the temporal file to AVRO
                    # - first retrieve the data and convert it into a dataframe
                    data = subprocess.run([hadoop + '','fs','-cat', '/user/bdm/temporal/'+country+'/'+folderpath+'/'+filename], capture_output=True, shell=True, encoding="latin-1").stdout
                    data = pd.read_csv(StringIO(data), delimiter=";", header=None)
                    # convert the default assigned headers (0,1,2,3...) to string
                    headers_string = {}
                    for col in list(data.columns):
                        headers_string[col] = str(col)
                    data.rename(columns=headers_string, inplace=True)
                    # - then create the AVRO schema
                    # -- get the column names and create the fields argument for the schema
                    fields = []
                    for col in list(data.columns):
                        fields.append({"name":col, "type": "string"})
                    # -- complete the schema with the desired information
                    schema = {
                        "doc": datetime.now().strftime("%H%M%S%m%d%Y"),
                        "name": "trade_item",
                        "namespace": country+"_"+folder,
                        "type": "record",
                        "fields": fields
                    }
                    schema = json.dumps(schema)
                    schema = avro.schema.parse(schema)
                    # - mutate the records to AVRO format
                    records = data.to_dict('records')
                    for dicts in records:
                        # ensure all the data are strings
                        for keys in dicts:
                            dicts[keys] = str(dicts[keys])
                    # - create the AVRO file with the data in the local system
                    writer = DataFileWriter(open(path+initial_name, "wb"), DatumWriter(), schema)
                    # - append each record into the file
                    for record in records:
                        writer.append(record)
                    writer.close()
                
                    # after creating the AVRO file, upload it into HDFS
                    load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+initial_name,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year], capture_output=True, shell=True)
                    # print status
                    if load_hdfs.returncode == 0 :
                        print('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                    else: 
                        print('An error occured, upload not performed')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, upload not performed')
                        logger.error(load_hdfs)
                    # delete the file in local
                    os.remove(path+initial_name)

                    # add the time used for this step
                    times_imp_exp.append(['First occurance', time.time() - start_time]) 

                # if there is a persistent file (e.g. a previous version)
                if all_versions != '':
                    # get a list with all the versions
                    all_versions = re.findall('/.*/(.*)\r', all_versions)
                    # get the most recent version
                    most_recent_version = initial_name
                    for version in all_versions:
                        if re.search('(\d*)(?=.avro)', version).group(1) > re.search('(\d*)(?=.avro)', most_recent_version).group(1):
                            most_recent_version = version
                    # retrieve that version's file from hadoop
                    old_avro = subprocess.run([hadoop + '', 'fs', '-get',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+most_recent_version, path], capture_output=True, shell=True)
                    # get old records (this step can be omited if we only append the new records into the file directly)
                    avro_records = []
                    reader = DataFileReader(open(path+most_recent_version, "rb"), DatumReader())
                    for record in reader:
                        avro_records.append(record)
                    # get old schema
                    schema = json.loads(reader.schema)
                    reader.close()
                    # modify schema
                    schema["doc"] = datetime.now().strftime("%H%M%S%m%d%Y")
                    schema = json.dumps(schema)
                    schema = avro.schema.parse(schema)
                    # delete the file in local
                    os.remove(path+most_recent_version)
                    # convert the temporal file to AVRO
                    # - first retrieve the data and convert it into a dataframe
                    data = subprocess.run([hadoop + '','fs','-cat', '/user/bdm/temporal/'+country+'/'+folderpath+'/'+filename], capture_output=True, shell=True, encoding="latin-1").stdout
                    data = pd.read_csv(StringIO(data), delimiter=";", header=None)
                    # convert the default assigned headers (0,1,2,3...) to string
                    headers_string = {}
                    for col in list(data.columns):
                        headers_string[col] = str(col)
                    data.rename(columns=headers_string, inplace=True)
                    # - mutate the records to AVRO format
                    new_records = data.to_dict('records')
                    for dicts in new_records:
                        # ensure all the data are strings
                        for keys in dicts:
                            dicts[keys] = str(dicts[keys])
                    # append the new records into the old ones
                    for record in new_records:
                        avro_records.append(record)
                    # define name for the AVRO file to upload
                    final_name = "version"+str(int(re.search('(\d*)(?=.avro)', most_recent_version).group(1))+1)+'.avro'
                    # create the AVRO file with the data in the local system
                    writer = DataFileWriter(open(path+final_name, "wb"), DatumWriter(), schema)
                    # append each record into the file
                    for record in avro_records:
                        writer.append(record)
                    writer.close()
                    # after creating the AVRO file, upload it into HDFS
                    load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+final_name,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year], capture_output=True, shell=True)
                    # print status
                    if load_hdfs.returncode == 0 :
                        print('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        # HERE THE LINE OF CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT
                    else: 
                        print('An error occured, update not performed')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, update not performed')
                        logger.error(load_hdfs)
                    # delete the file in local
                    os.remove(path+final_name)

                    # add the time used for this step
                    times_imp_exp.append([final_name, time.time() - start_time]) 
        
        # ---------------------
        # METADATA
        # ---------------------
        if folder == 'metadata':

            # Starting time of the conversion and uploading of this file
            start_time = time.time()
            
            folders_metadata = {"imports":"imp", "exports":"exp"}

            # name of the directory to store the metadata
            metadata_directory_name = datetime.now().strftime("%H%M%S%m%d%Y")

            # iterate trough every metadata folder
            for folder_metadata, folder_metadata_path in folders_metadata.items():
                
                # if there is data in the metadata imports/exports to be uploaded, create the directory in the persistent to store it
                if list(filenames_year[country][folder][folder_metadata].items()):
                    # get year of first file in metadata imports/exports in temporal
                    year = list(filenames_year[country][folder][folder_metadata].items())[0][1]
                    metadata_directory = subprocess.run([hadoop + '', 'fs', '-mkdir',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/'+metadata_directory_name], capture_output=True, shell=True)
                    if metadata_directory.returncode == 0 :
                        print("Directory named "+metadata_directory_name+' created in '+ '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error("Directory named "+metadata_directory_name+' created in '+ '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/')
                    else: 
                        print('An error occured, folder could not be created')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, folder could not be created')
                        logger.error(load_hdfs)

                # Iterate trough every metadata file
                for filename, year in filenames_year[country][folder][folder_metadata].items():

                    # Print status
                    print("Working on: "+country+" | "+folder+" | "+folder_metadata+" | "+filename+'...')
                    logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                    logger.error("Working on: "+country+" | "+folder+" | "+folder_metadata+" | "+filename+'...')

                    # send data to persistent
                    load_hdfs = subprocess.run([hadoop + '', 'fs', '-cp',  '/user/bdm/temporal/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+filename,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/'+metadata_directory_name], capture_output=True, shell=True)
                    if load_hdfs.returncode == 0 :
                        print('File '+filename+' uploaded into '+ '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('File '+filename+' uploaded into '+ '/user/bdm/persistent/'+country+'/'+folderpath+'/'+folder_metadata_path+'/'+year+'/')
                        # HERE THE LINE OF CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT

                    else: 
                        print('An error occured, upload not performed')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, upload not performed')
                        logger.error(load_hdfs)

            # add the time used for this step
            times_metadata.append( time.time() - start_time) 
                        
    return [times_imp_exp, times_metadata]


*Brazil loading pipeline*

In [62]:
# ------------------------------------------
# BRAZIL (from temporal to persitent in AVRO)
# ------------------------------------------
def load_brazil(filenames_year, path):
    
    country = "brazil"

    # initial file name inside a directory of metadata or imports in the persistent zone
    initial_name = "version0.avro"

    # paths to folders
    folders = {"imports":"imp", "exports":"exp", "metadata":"metadata"}
    
    # Lists of times
    times_imp_exp =[]
    times_metadata =[]

    # iterate trough every folder
    for folder, folderpath in folders.items():

            # ---------------------
            # IMPORTS & EXPORTS
            # ---------------------
            if folder == 'imports' or folder == 'exports':
                
                # Starting time of the conversion and uploading of this file
                start_time = time.time()
                
                # Iterate trough every import/export file
                for filename, year in filenames_year[country][folder].items():

                    # Print status
                    print("Working on: "+country+" | "+folder+" | "+filename+'...')
                    logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                    logger.error("Working on: "+country+" | "+folder+" | "+filename+'...')

                    # Check if there is any persistent file to append the temporal file's data into (e.g. the most recent version). If there is no persistent file, create one (version0.avro)
                    all_versions = subprocess.run(hadoop + ' fs -ls /user/bdm/persistent/'+country+'/'+folderpath+'/'+year, capture_output=True, shell=True).stdout.decode()
                    
                    # If no persistent file, create one
                    if all_versions == '':
                        # convert the temporal file to AVRO
                        # - first retrieve the data and convert it into a dataframe
                        data = subprocess.run(hadoop + ' fs -cat /temporal/'+country+'/'+folderpath+'/'+filename, capture_output=True, shell=True, encoding="latin-1").stdout
                        data = pd.read_csv(StringIO(data))
                        # - then create the AVRO schema
                        # -- get the column names and create the fields argument for the schema
                        fields = []
                        for col in list(data.columns):
                            fields.append({"name":col, "type": "string"})
                        # -- complete the schema with the desired information
                        schema = {
                            "doc": datetime.now().strftime("%H%M%S%m%d%Y"),
                            "name": "trade_item",
                            "namespace": country+"_"+folder,
                            "type": "record",
                            "fields": fields
                        }
                        schema = json.dumps(schema)
                        schema = avro.schema.parse(schema)
                        # - mutate the records to AVRO format
                        records = data.to_dict('records')
                        for dicts in records:
                            # ensure all the data are strings
                            for keys in dicts:
                                dicts[keys] = str(dicts[keys])
                        # - create the AVRO file with the data in the local system
                        writer = DataFileWriter(open(path+initial_name, "wb"), DatumWriter(), schema)
                        # - append each record into the file
                        for record in records:
                            writer.append(record)
                        writer.close()
                    
                        # after creating the AVRO file, upload it into HDFS
                        load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+initial_name,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year], capture_output=True, shell=True)
                        # print status
                        if load_hdfs.returncode == 0 :
                            print('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('First version created: version0.avro file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        else: 
                            print('An error occured, upload not performed')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('An error occured, upload not performed')
                            logger.error(load_hdfs)

                        # delete the file in local
                        os.remove(path+initial_name)

                        # add the time used for this step
                        times_imp_exp.append(['First occurance', time.time() - start_time]) 

                    # if there is a persistent file (e.g. a previous version)
                    if all_versions != '':
                        # get a list with all the versions
                        all_versions = re.findall('/.*/(.*)\r', all_versions)
                        # get the most recent version
                        most_recent_version = initial_name
                        for version in all_versions:
                            if re.search('(\d*)(?=.avro)', version).group(1) > re.search('(\d*)(?=.avro)', most_recent_version).group(1):
                                most_recent_version = version
                        # retrieve that version's file from hadoop
                        old_avro = subprocess.run([hadoop + '', 'fs', '-get',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+most_recent_version, path], capture_output=True, shell=True)
                        # get old records (this step can be omited if we only append the new records into the file directly)
                        avro_records = []
                        reader = DataFileReader(open(path+most_recent_version, "rb"), DatumReader())
                        for record in reader:
                            avro_records.append(record)
                        # get old schema
                        schema = json.loads(reader.schema)
                        reader.close()
                        # modify schema
                        schema["doc"] = datetime.now().strftime("%H%M%S%m%d%Y")
                        schema = json.dumps(schema)
                        schema = avro.schema.parse(schema)
                        # delete the file in local
                        os.remove(path+most_recent_version)
                        # convert the temporal file to AVRO
                        # - first retrieve the data and convert it into a dataframe
                        data = subprocess.run(hadoop + ' fs -cat /temporal/'+country+'/'+folderpath+'/'+filename, capture_output=True, shell=True, encoding="latin-1").stdout
                        data = pd.read_csv(StringIO(data))
                        # - mutate the records to AVRO format
                        new_records = data.to_dict('records')
                        for dicts in new_records:
                            # ensure all the data are strings
                            for keys in dicts:
                                dicts[keys] = str(dicts[keys])
                        # append the new records into the old ones
                        for record in new_records:
                            avro_records.append(record)
                        # define name for the AVRO file to upload
                        final_name = "version"+str(int(re.search('(\d*)(?=.avro)', most_recent_version).group(1))+1)+'.avro'
                        # create the AVRO file with the data in the local system
                        writer = DataFileWriter(open(path+final_name, "wb"), DatumWriter(), schema)
                        # append each record into the file
                        for record in avro_records:
                            writer.append(record)
                        writer.close()
                        # after creating the AVRO file, upload it into HDFS
                        load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+final_name,  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year], capture_output=True, shell=True)
                        # print status
                        if load_hdfs.returncode == 0 :
                            print('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('Update performed: '+final_name+' file was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                            # HERE TE CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT
                        else: 
                            print('An error occured, update not performed')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error('An error occured, update not performed')
                            logger.error(load_hdfs)
                        # delete the file in local
                        os.remove(path+final_name)

                        # add the time used for this step
                        times_imp_exp.append([final_name, time.time() - start_time]) 
            
            # ---------------------
            # METADATA
            # ---------------------
            elif folder == "metadata":

                # Starting time of the conversion and uploading of this file
                start_time = time.time()

                # If there is medatata to be uploaded, create a directory in the metadata directory in the temporal zone to store the files. The name is %H%M%S%m%d%Y (time)
                if filenames_year[country][folder].items():
                    metadata_directory = datetime.now().strftime("%H%M%S%m%d%Y")
                    # get the year of the any of the files (all have the same year, we will take the first one)
                    year = list(filenames_year[country][folder].items())[0][1]
                    # create the folder
                    create_directory = subprocess.run(hadoop + ' fs -mkdir /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory, capture_output=True, shell=True, encoding="latin-1")
                    if create_directory.returncode == 0 :
                        # print success status
                        print('A directory named '+metadata_directory+' has been created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('A directory named '+metadata_directory+' has been created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/')
                        
                        # Iterate trough every metadata file
                        for filename, year in filenames_year[country][folder].items():
                            # print working on
                            print("Working on: "+country+" | "+folder+" | "+filename+'...')
                            logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                            logger.error("Working on: "+country+" | "+folder+" | "+filename+'...') 
                            # CONVERT FILE TO AVRO
                            # get temporal file
                            data = subprocess.run([hadoop + '','fs','-cat',' /user/bdm/temporal/'+country+'/'+folderpath+'/'+filename], capture_output=True, shell=True, encoding="latin-1").stdout
                            data = pd.read_csv(StringIO(data), delimiter=";")
                            # then create the AVRO schema
                            # - get the column names and create the fields argument for the schema
                            fields = []
                            for col in list(data.columns):
                                fields.append({"name":col, "type": "string"})
                            # - complete the schema with the desired information
                            filename_without_extension = filename[:filename.index(".")].replace(" ", "")
                            schema = {
                                "doc": datetime.now().strftime("%H%M%S%m%d%Y"),
                                "name": filename_without_extension,
                                "namespace": country+"_"+folder,
                                "type": "record",
                                "fields": fields
                            }
                            schema = json.dumps(schema)
                            schema = avro.schema.parse(schema)
                            # mutate the records to AVRO format
                            records = data.to_dict('records')
                            for dicts in records:
                                # ensure all the data are strings
                                for keys in dicts:
                                    dicts[keys] = str(dicts[keys])
                            # create the AVRO file with the data in the local system
                            writer = DataFileWriter(open(path+filename_without_extension+'.avro', "wb"), DatumWriter(), schema)
                            # append each record into the file
                            for record in records:
                                writer.append(record)
                            writer.close()
                        
                            # after creating the AVRO file, upload it into HDFS
                            load_hdfs = subprocess.run([hadoop + '', 'fs', '-put', path+filename_without_extension+'.avro',  '/user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory], capture_output=True, shell=True)
                            # print status
                            if load_hdfs.returncode == 0 :
                                print('File ' + filename_without_extension+'.avro was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory)
                                logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                                logger.error('File ' + filename_without_extension+'.avro was created in /user/bdm/persistent/'+country+'/'+folderpath+'/'+year+'/'+metadata_directory) 
                                # HERE THE LINE OF CODE TO DELETE THE FILE IN THE TEMPORAL LANDING ZONE AFTER IT IS PASSED TO THE PERSISTENT
                            else: 
                                print('An error occured, the file could not be created')
                                logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                                logger.error('An error occured, the file could not be created') 
                                logger.error(load_hdfs)
                            # delete the file in local
                            os.remove(path+filename_without_extension+'.avro')

                    else: 
                        # if there was an error creating the directory
                        print('An error occured, the directory could not be created')
                        logger.error(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
                        logger.error('An error occured, the directory could not be created') 
                        logger.error(load_hdfs)  
    
            # add the time used for this step
            times_metadata.append( time.time() - start_time) 
                        
    return [times_imp_exp, times_metadata]

Call loading functions

In [67]:
for country in countries:
    # PERU
    if country == 'peru':
        load_peru(filenames_year, path)
    # CHILE
    if country == 'chile':
        load_chile(filenames_year, path)
    # BRAZIL
    if country == 'brazil':
        load_brazil(filenames_year, path)

Working on: peru | imports | ma07130222.csv...
An error occured, the file could not be uploaded
CompletedProcess(args='~/BDM_Software/hadoop/bin/hadoop fs -put /home/bdm/DataImporta/P1/production/VM//version0.avro/user/bdm/persistent/peru/imp/2022', returncode=1, stdout=b'', stderr=b"put: `/home/bdm/DataImporta/P1/production/VM//version0.avro/user/bdm/persistent/peru/imp/2022': No such file or directory\n")
Working on: peru | imports | ma07130322.csv...


In [47]:
filenames_year

{'peru': {'imports': {}, 'exports': {}, 'metadata': {}},
 'chile': {'imports': {},
  'exports': {},
  'metadata': {'imports': {}, 'exports': {}}},
 'brazil': {'imports': {}, 'exports': {}, 'metadata': {}}}