EXTRACT DATA AND SAVE TO LOCAL DRIVE

In [None]:
import os
import openpyxl
import requests

In [None]:
# Function to extract data from excel file
def download_files_from_excel(file_path, download_folder):
    # Print the current working directory
    print("Current working directory:", os.getcwd())

    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    # Check if the download folder exists, if not, create it
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
        print(f"Created download folder: {download_folder}")
    else:
        print(f"Download folder already exists: {download_folder}")

    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook.active
    

    # Iterate through the rows in the worksheet and get the URLs
    for row in sheet.iter_rows(values_only=True):
        for url in row:
            if url and url.startswith('https://'):
                # Get the file name by splitting the URL
                file_name = url.split('/')[-1]

                # Send a GET request to the file URL
                response = requests.get(url)

                # Check if the request was successful
                if response.status_code == 200:
                    # Define the path to save the file
                    save_path = os.path.join(download_folder, file_name)

                    # Open the file in binary write mode and save the content
                    with open(save_path, 'wb') as file:
                        file.write(response.content)
                    print(f"Downloaded {file_name} to {save_path}") 
                else:
                    print(f"Failed to download {file_name}")

# Path to the Excel file containing the URLs
file_path = "/root/DataEngineeringScripts/notebooks/IBSA.xlsx"

# Path to the folder where the downloaded files will be saved
download_folder = "/root/DataEngineeringScripts/notebooks/IBSA_data"

# Call the function to download the files
download_files_from_excel(file_path, download_folder)

LOAD DATA TO AZURE STORAGE

In [None]:
from pathlib import Path
from PyPDF2 import PdfReader
from PIL import Image
import io
import zipfile
from azure.storage.blob import BlobClient, ContentSettings
import os
import shutil
import logging



In [None]:
# Setup logging to capture errors
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to ingest data to blob storage
def load_and_write_files(folder_path, container_url, sas_token, destination_path):
    # Check and convert folder to path if in string
    folder_path = Path(folder_path) if isinstance(folder_path, str) else folder_path
    
    # Iterate through the files in the folder and confirm if the files exist in the folder
    for file_path in folder_path.iterdir():
        if file_path.is_file():  # If file path exists in folder
            # Convert all the file extensions in the folder to lower case (case-insensitive), for easy handling
            file_extension = file_path.suffix.lower()
            # Connect to the blob storage using the API (generated url + token_key)
            blob_url_with_sas = f"{container_url}/{destination_path}/{file_path.name}{sas_token}"
            # Initiate the connection
            blob_client = BlobClient.from_blob_url(blob_url_with_sas)
            
            # Upload the various file extensions to the blob 
            try:
                if file_extension == '.txt':
                    with open(file_path, 'rb') as data:
                        blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True)  # Overwrite to prevent file duplicate
                elif file_extension == '.pdf':                                                # BlockBlob, for handling huge data
                    with open(file_path, 'rb') as data:
                        blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True, content_settings=ContentSettings(content_type='application/pdf')) # Ensure pdf are uploaded in their original format without distortion
                elif file_extension == '.png':
                    image = Image.open(file_path)
                    img_byte_arr = io.BytesIO() # Convert image to bytes to preserve it's originality
                    image.save(img_byte_arr, format='PNG')
                    blob_client.upload_blob(img_byte_arr.getvalue(), blob_type="BlockBlob", overwrite=True, content_settings=ContentSettings(content_type='image/png')) # Ensures Image files are uploaded in their original format without distortion
                elif file_extension == '.zip':
                    with open(file_path, 'rb') as data:
                        blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True, content_settings=ContentSettings(content_type='application/zip')) # Ensure Zip files are uploaded in their original format without distortion
                elif file_extension == '.mdb':
                    with open(file_path, 'rb') as data:
                        blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True)
                else:
                    logger.warning(f"Unsupported file type: {file_path.suffix}")
            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
        else:
            logger.info(f"Skipping directory: {file_path}")  # Else skip directory, if file path doesn't exist in folder

folder_path = r"/root/DataEngineeringScripts/notebooks/IBSA_data"
container_url = 'https://stproponentdata.blob.core.windows.net/ibsa'
sas_token = '?sv=2023-01-03&spr=https%2Chttp&st=2024-07-12T02%3A22%3A48Z&se=2024-07-19T02%3A22%3A00Z&sr=c&sp=rwl&sig=C0jXStedNSc0LkZ%2FHk0nHrQg%2BEr4yZHVlRgb%2BjtwFwc%3D' 
destination_path = 'cred.txt4' 

# Call the function to ingest data to the blob store
load_and_write_files(folder_path, container_url, sas_token, destination_path)

logger.info("All files have been loaded and written to the blob container.")

CHECK IF DATA MEETS MINIMUM REQUIREMENTS FOR INGESTION

In [None]:
from azure.storage.blob import ContainerClient
from PyPDF2 import PdfReader
from PIL import Image
import zipfile
import pyodbc
import os
import io
import tempfile
import subprocess

In [None]:
# Function to connect to the sas_url and check criteria for each file extention
def list_and_check_blobs(sas_url):
    """Connect to Azure Blob Storage using a SAS URL and check each file in the specified folder."""
    try:
        container_client = ContainerClient.from_container_url(sas_url) # Create connection to the cloud storage using the sas_url
    except Exception as e:
        print(f"Error creating ContainerClient: {e}")
        return

    folder_path = 'cred.txt4/'  # Folder in the ibsa container, housing the different files

    # List blobs/files in the specified folder
    try:
        blobs = container_client.list_blobs(name_starts_with=folder_path)
    except Exception as e:
        print(f"Error listing blobs: {e}")
        return

    valid_files = [] # List to store the valid_files
    invalid_files = [] # List to store the ivalid files

    for blob in blobs: # Iterate through the files, and create a blob_client that interacts with the files in the blob container
        file_path = blob.name
        file_name = os.path.basename(file_path)
        blob_client = container_client.get_blob_client(file_path)

        if check_file_requirements(file_path, blob_client):
            valid_files.append(file_name)
        else:
            invalid_files.append(file_name)

    print_files("meet", valid_files)
    print_files("do NOT meet", invalid_files)

def check_file_requirements(file_path, blob_client):
    """Check if the file meets the minimum requirements based on its extension."""
    extension = file_path.split('.')[-1].lower() # seperate the extension names from the path, then convert them to lower case, for easy handling

    check_functions = {
        'txt': check_txt_file,
        'pdf': check_pdf_file,
        'png': check_png_file,
        'zip': check_zip_file,
        'mdb': check_mdb_file
    }

    check_function = check_functions.get(extension)
    if check_function:
        return check_function(blob_client)
    else:
        return False

def check_txt_file(blob_client):
    """Check if a text file is not empty.""" # Minimum requirement for txt file extension 
    try:
        blob_data = blob_client.download_blob().readall()
        return bool(blob_data.strip())
    except Exception as e:
        print(f"Error checking TXT file: {e}") 
        return False                              # Return false if txt file doesn't meet the minimum requirement 

def check_pdf_file(blob_client):
    """Check if a PDF file has at least one page.""" # Minimum requirement for pdf file extension
    try:
        blob_data = blob_client.download_blob().readall()
        with io.BytesIO(blob_data) as f:
            reader = PdfReader(f)
            return len(reader.pages) > 0
    except Exception as e:
        print(f"Error checking PDF file: {e}") 
        return False                            # Return false if pdf file doesn't meeet the minimum requirement

def check_png_file(blob_client):
    """Check if the file is a valid PNG image.""" # Minimum requirement for png file extension
    try:
        blob_data = blob_client.download_blob().readall()
        with io.BytesIO(blob_data) as f:
            img = Image.open(f)
            return img.format == 'PNG'
    except Exception as e:
        print(f"Error checking PNG file: {e}") 
        return False                           # Return false if png file doesn't meet the minimum requirement

def check_zip_file(blob_client):
    """Check if a ZIP file contains at least one file.""" # Minimum requirement for zip file extension
    try:
        blob_data = blob_client.download_blob().readall()
        with io.BytesIO(blob_data) as f:
            with zipfile.ZipFile(f) as z:
                return len(z.namelist()) > 0 # Return number of files in the zipped folder if it contains at least 1 file, i.e file > 0
    except Exception as e:
        print(f"Error checking ZIP file: {e}")  
        return False                            # Return false if zipped file doesn't meet the minimum requirement

def check_mdb_file(blob_client):
    """Check if an MDB file contains at least one table using mdbtools.""" # Minimum requirement for mdb file extension
    try:
        # Create a temporary file and write the blob data to it
        blob_data = blob_client.download_blob().readall()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mdb") as tmp_file:
            tmp_file.write(blob_data)
            tmp_file_path = tmp_file.name

        # Use mdbtools to list tables in the MDB file
        try:
            output = subprocess.check_output(['mdb-tables', tmp_file_path], text=True)
            tables = output.strip().split()
            return len(tables) > 0    # Return number of tables in the mdb extension if it contains at least 1 table i.e table > 0
        except subprocess.CalledProcessError as e:
            print(f"Error checking MDB file with mdbtools: {e}") # Print out any error encountered while executing this function
            return False

    except Exception as e:
        print(f"Error checking MDB file: {e}") 
        return False                                 # Return false if mbd file doesn't meet the minimum requirement
    
    finally:
        # Clean up: Remove the temporary file
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)

def print_files(status, files):
    """Print the list of files based on their status."""
    print(f"Files that {status} the minimum requirements ({len(files)}):") # Print the number of files that meet the requirements with their names respectively and those that didn't
    for file in files:
        print(f" - {file}") # Print the different file names that meet and didn't meet the minimum requirement

# Call the main function and input the generated sas_url, to run the entire script
if __name__ == "__main__":
    sas_url = 'https://stproponentdata.blob.core.windows.net/ibsa?sv=2023-01-03&st=2024-08-01T05%3A18%3A58Z&se=2024-08-30T05%3A18%3A00Z&sr=c&sp=rl&sig=CqluqFK0qX2fFV7hQtTyjUJ%2FRy2LKKDXueY44CZNJTk%3D'  
    list_and_check_blobs(sas_url)


CHECK IF THE SUBMITTABLE FILES HAVE THE MINIMUM REQUIREMENT NEEDED TO COMPLETE DATA SUBMISSION IN DANDJOO

In [1]:
import os
import zipfile
import pandas as pd
import geopandas as gpd
from io import BytesIO
from azure.storage.blob import ContainerClient, BlobClient
import fiona
from shapely.geometry import Polygon
import shutil
import logging
import concurrent.futures
import sys
from pathlib import Path
sys.path.insert(0,str(Path.cwd().parent))
from package import *

List the blob (files in the container), Optional

In [6]:
#from azure.storage.blob import ContainerClient

#account_url = "https://stproponentdata.blob.core.windows.net"
#container_name = "ibsa"
#sas_token = "sv=2023-01-03&st=2024-08-12T03%3A07%3A16Z&se=2024-08-30T03%3A07%3A00Z&sr=c&sp=rl&sig=gWgJ6GHp7ClQE4rMAxJpYj6sO9Z4W1EQk1PhtsAo8gk%3D"
#container_client = ContainerClient(account_url, container_name, credential=sas_token)

#df: pd.DataFrame = read_container(container=container_client,blob_folder='cred.txt4')
#display(df.head(1000))

#blob_list = container_client.list_blobs()
#for blob in blob_list:
    #print(blob.name)



Getting the column names in the  '.xlsx' file extension

In [6]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_1.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def check_information(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()
    
    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # Extract only xlsx files from the zip file
                xlsx_files = [file for file in zip_ref.namelist() if file.endswith('.xlsx')]
                zip_ref.extractall('extracted_files', members=xlsx_files)

            # Check each xlsx file and print their information
            for file in xlsx_files:
                file_path = os.path.join('extracted_files', file)
                try:
                    df = pd.read_excel(file_path)
                    #print(df.head())
                    print(df.columns)
                except Exception as e:
                    logging.error(f"Error readingshapefile{file_path}: {e}")
                    continue


# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D" 
destination_path = 'cred.txt4'

# Call the function
check_information(account_url, container_name, sas_token, destination_path)

  warn(msg)


Index(['SURVEY_ID', 'PROJECT', 'ASSE_LEVL', 'ASSE_TYPE', 'AREA_KM2', 'AREA_HA',
       'SURV_STRT', 'SURV_END', 'SAMP_RND', 'ORGANISTN', 'REP_TITLE',
       'REP_AUTH', 'PUBL_DATE', 'COMMENTS'],
      dtype='object')


  warn(msg)


Index(['SURVEY_ID', 'PROJECT', 'ASSE_LEVL', 'ASSE_TYPE', 'AREA_KM2', 'AREA_HA',
       'SURV_STRT', 'SURV_END', 'SAMP_RND', 'ORGANISTN', 'REP_TITLE',
       'REP_AUTH', 'PUBL_DATE', 'COMMENTS'],
      dtype='object')
Index(['Site', 'Site type', 'Location', 'Datum', 'Zone', 'Easting', 'Northing',
       'Latitude', 'Longitude', 'Altitude', 'Accuracy', 'Lat-Long'],
      dtype='object')
Index(['TaxonName', 'SiteName', 'Abundance', 'HerbRef', 'WAConStat', 'DateObs',
       'Author', 'Comments', 'Citation', 'POSITION', 'DATUM', 'CO_METH',
       'ZONE', 'EAST', 'NORTH', 'LATITUDE', 'LONGITUDE'],
      dtype='object')
Index(['Id', 'SurveyName', 'SurveyType', 'Author', 'StartDate', 'EndDate',
       'Citation'],
      dtype='object')
Index(['SURVEY_ID', 'BHPB_DRILL_HOLE', 'SAMPLE_NO', 'ASSE_LEVL', 'ASSE_TYPE',
       'MGA_EAST', 'MGA_NORTH', 'MGA_ZONE', 'PROJECT', 'SURV_STRT', 'SURV_END',
       'ORGNSTN', 'REP_TITLE', 'REP_AUTH', 'PUBL_DATE', 'COMMENTS'],
      dtype='object')
Index(['SURVE

Checking the number of '.xlsx' file extensions  in the zipped file

In [7]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_2.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def count_xlsx_files(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    # Initialize a counter for CSV files
    xlsx_file_count = 0

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Open the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # List all files in the zip archive
                all_files = zip_ref.namelist()

                # Count xlsx files in the zip archive
                xlsx_files = [file for file in all_files if file.endswith('.xlsx')]
                xlsx_file_count += len(xlsx_files)

                # Log the count of xlsx files
                logging.info(f"Found {len(xlsx_files)} XLSX files in {blob.name}")
                print(f"Found {len(xlsx_files)} XLSX files in {blob.name}")

    # Print the total count of xlsx files
    print(f"Total number of XLSX files: {xlsx_file_count}")

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
destination_path = 'cred.txt4'

# Call the function
count_xlsx_files(account_url, container_name, sas_token, destination_path)

Found 0 XLSX files in cred.txt4/02 Data Submission v1.zip
Found 1 XLSX files in cred.txt4/0_10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 XLSX files in cred.txt4/0_Biodiversity_GIS_Template_v319.gdb_20151013.zip
Found 0 XLSX files in cred.txt4/0_ESRI Shapefile.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2019-0190.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2019-0191.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2020-0046.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2020-0047.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2020-0060.zip
Found 0 XLSX files in cred.txt4/0_IBSA-2021-0267.zip
Found 0 XLSX files in cred.txt4/0_Spatial data.zip
Found 0 XLSX files in cred.txt4/0_Survey_Data.zip
Found 1 XLSX files in cred.txt4/10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 XLSX files in cred.txt4/10030_EasternRidge_TargetedSRE_data_final.gdb.zip
Found 0 XLSX files in cred.txt4/10037 Central Pilbara Ghost Bat Data and Roost Asessment.zip
Found 0 XLSX files in cred.txt4/10_ESRI S

Creating a folder in the blob, for the '.xlsx' files with the required column names

In [None]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_3.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def create_xlsx_file(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    xlsx_files_with_columns = []
    xlsx_files_without_columns = []

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # Extract only xlsx files from the zip file
                xlsx_files = [file for file in zip_ref.namelist() if file.endswith('.xlsx')]
                zip_ref.extractall('extracted_files', members=xlsx_files)

            # Log the extracted files
            logging.info(f"Extracted XLSX files from {blob.name}: {xlsx_files}")
            print(f"Extracted XLSX files from {blob.name}: {xlsx_files}")

            # Check each XLSX file for the required columns
            for file in xlsx_files:
                file_path = os.path.join('extracted_files', file)
                try:
                    df = pd.read_excel(file_path)
                except Exception as e:
                    logging.error(f"Error reading XLSX file {file_path}: {e}")
                    continue
                
                # Define the required columns for the extensions
                required_columns1 = {'TaxonName', 'DateObs', 'EAST', 'NORTH', 'LATITUDE', 'LONGITUDE'}
                required_columns2 = { 'MGA_EAST', 'MGA_NORTH', 'MGA_ZONE', 'SURV_STRT', 'SURV_END','ORGNSTN'}
                required_columns3 = {'TRIBE', 'GENUS', 'SUBGENUS', 'SPECIES', 'DATE_ASSE', 'LatDec', 'LonDec'}
                if required_columns1.issubset(df.columns) or required_columns2.issubset(df.columns) or required_columns3.issubset(df.columns):  # If the df contains the columns already defined in 'required_columns' variable
                    xlsx_files_with_columns.append(file_path)  # Append the full path
                else:
                    xlsx_files_without_columns.append(file_path)  # Append the full path

                #print(f"XLSX files with required columns: {len(xlsx_files_with_columns)}")
                #print(f"XLSX files without required columns: {len(xlsx_files_without_columns)}")
    
    # Print the total count of xlsx files
    #print(f"Total number of XLSX files with required column: {xlsx_files_with_columns}")
    # Print the total count of xlsx files
    #print(f"Total number of XLSX files without column: {xlsx_files_without_columns}")
            
    return xlsx_files_with_columns, xlsx_files_without_columns

def write_xlsx_to_blob(account_url, container_name, sas_token, folder_name, file_paths):
    # Ensure the folder exists
    os.makedirs(folder_name, exist_ok=True)
    
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        destination_file_path = os.path.join(folder_name, file_name) # Setting up the destination path for the files
        # Log the file paths
        logging.info(f"Copying from {file_path} to {destination_file_path}")
        print(f"Copying from {file_path} to {destination_file_path}")

        # Copy the file to the folder
        try:
             shutil.copy(file_path, destination_file_path)# Copy the files from the source path to the set up destination path
        except FileNotFoundError as e:
            logging.error(f"File not found: {file_path}")
            print(f"File not found: {file_path}")
            continue

        # Create a connection to the blob
        blob_client = BlobClient(account_url, container_name, f"{folder_name}/{file_name}", credential=sas_token)
        try:
            with open(destination_file_path, 'rb') as data: # write the folder_name to the blob, using the path (destination_file_path)
                print(type(data))
                blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True)
        except Exception as e:
            logging.error(f"Error uploading file {file_name}: {e}")
            print(f"Error uploading file {file_name}: {e}")
    
    # Clean up extracted files
    shutil.rmtree('extracted_files')

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
folder_name = "xlsx_submitable"
destination_path = 'cred.txt4'


# Call the first function and assign the xlsx_files as variable to it
xlsx_files_with_columns, xlsx_files_without_columns = create_xlsx_file(account_url, container_name, sas_token, destination_path)

# Call the second function to upload the xlsx_files_with_columns to the created folder (folder_name)
write_xlsx_to_blob(account_url, container_name, sas_token, folder_name, xlsx_files_with_columns)



Getting the column names in the  '.csv' file extension

In [9]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_1.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def check_csv_file(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # Extract only CSV files from the zip file
                csv_files = [file for file in zip_ref.namelist() if file.endswith('.csv')]
                zip_ref.extractall('extracted_files', members=csv_files)

            # Check each CSV file for the required columns
            for file in csv_files:
                file_path = os.path.join('extracted_files', file)
                try:
                    df = pd.read_csv(file_path)
                    #print(df.head())
                    print(df.columns)
                except Exception as e:
                    logging.error(f"Error readingshapefile{file_path}: {e}")
                    continue


# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D" 
destination_path = 'cred.txt4'

check_csv_file(account_url, container_name, sas_token, destination_path)

Index(['SiteName', 'SampleType', 'Site type', 'Effort', 'Author', 'StartDate',
       'EndDate', 'Comments', 'Citation', 'Easting', 'Northing', 'Latitude',
       'Longitude'],
      dtype='object')


Checking the number of '.csv' file extensions  in the zipped file

In [7]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_1.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def count_csv_files(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    # Initialize a counter for CSV files
    csv_file_count = 0

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Open the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # List all files in the zip archive
                all_files = zip_ref.namelist()

                # Count CSV files in the zip archive
                csv_files = [file for file in all_files if file.endswith('.csv')]
                csv_file_count += len(csv_files)

                # Log the count of CSV files
                logging.info(f"Found {len(csv_files)} CSV files in {blob.name}")
                print(f"Found {len(csv_files)} CSV files in {blob.name}")

    # Print the total count of CSV files
    print(f"Total number of CSV files: {csv_file_count}")

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
destination_path = 'cred.txt4'

# Call the function
count_csv_files(account_url, container_name, sas_token, destination_path)

Found 0 CSV files in cred.txt4/02 Data Submission v1.zip
Found 0 CSV files in cred.txt4/0_10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 CSV files in cred.txt4/0_Biodiversity_GIS_Template_v319.gdb_20151013.zip
Found 0 CSV files in cred.txt4/0_ESRI Shapefile.zip
Found 0 CSV files in cred.txt4/0_IBSA-2019-0190.zip
Found 0 CSV files in cred.txt4/0_IBSA-2019-0191.zip
Found 0 CSV files in cred.txt4/0_IBSA-2020-0046.zip
Found 0 CSV files in cred.txt4/0_IBSA-2020-0047.zip
Found 0 CSV files in cred.txt4/0_IBSA-2020-0060.zip
Found 0 CSV files in cred.txt4/0_IBSA-2021-0267.zip
Found 0 CSV files in cred.txt4/0_Spatial data.zip
Found 0 CSV files in cred.txt4/0_Survey_Data.zip
Found 0 CSV files in cred.txt4/10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 CSV files in cred.txt4/10030_EasternRidge_TargetedSRE_data_final.gdb.zip
Found 0 CSV files in cred.txt4/10037 Central Pilbara Ghost Bat Data and Roost Asessment.zip
Found 0 CSV files in cred.txt4/10_ESRI Shapefile.zip
Fou

Creating a folder in the blob, for the '.csv' files with the required column names

In [None]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_1.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def create_csv_file(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    csv_files_with_columns = []
    csv_files_without_columns = []

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # Extract only CSV files from the zip file
                csv_files = [file for file in zip_ref.namelist() if file.endswith('.csv')]
                zip_ref.extractall('extracted_files', members=csv_files)

            # Log the extracted files
            logging.info(f"Extracted CSV files from {blob.name}: {csv_files}")
            print(f"Extracted CSV files from {blob.name}: {csv_files}")

            # Check each CSV file for the required columns
            for file in csv_files:
                file_path = os.path.join('extracted_files', file)
                try:
                    df = pd.read_csv(file_path)
                except Exception as e:
                    logging.error(f"Error reading CSV file {file_path}: {e}")
                    continue
                
                # Define the required columns for the extensions
                required_columns = {'StartDate', 'EndDate', 'Citation', 'Easting', 'Northing', 'Latitude', 'Longitude'}
                if required_columns.issubset(df.columns):  # If the df contains the columns already defined in 'required_columns' variable
                    csv_files_with_columns.append(file_path)  # Append the full path
                else:
                    csv_files_without_columns.append(file_path)  # Append the full path
            
    return csv_files_with_columns, csv_files_without_columns

def write_csv_to_blob(account_url, container_name, sas_token, folder_name, file_paths):
    # Ensure the folder exists
    os.makedirs(folder_name, exist_ok=True)
    
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        destination_file_path = os.path.join(folder_name, file_name) # Setting up the destination path for the files
        # Log the file paths
        logging.info(f"Copying from {file_path} to {destination_file_path}")
        print(f"Copying from {file_path} to {destination_file_path}")

        # Copy the file to the folder
        try:
             shutil.copy(file_path, destination_file_path)# Copy the files from the source path to the set up destination path
        except FileNotFoundError as e:
            logging.error(f"File not found: {file_path}")
            print(f"File not found: {file_path}")
            continue

        # Create a connection to the blob
        blob_client = BlobClient(account_url, container_name, f"{folder_name}/{file_name}", credential=sas_token)
        try:
            with open(destination_file_path, 'rb') as data: # write the folder_name to the blob, using the path (destination_file_path)
                print(type(data))
                blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True)
        except Exception as e:
            logging.error(f"Error uploading file {file_name}: {e}")
            print(f"Error uploading file {file_name}: {e}")
    
    # Clean up extracted files
    shutil.rmtree('extracted_files')

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
folder_name = "csv_submitable"
destination_path = 'cred.txt4'

# Call the first function and assign the csv_files as variable to it
csv_files_with_columns, csv_files_without_columns = create_csv_file(account_url, container_name, sas_token, destination_path)

# Call the second function to upload the csv_files_with_columns to the created folder (folder_name)
write_csv_to_blob(account_url, container_name, sas_token, folder_name, csv_files_with_columns)

Getting the column names in  the  '.shp' file extension

In [11]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_2.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def check_shp_file(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
                print(f"Downloaded blob: {blob.name}")
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

            # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                zip_ref.extractall('extracted_files')
                
                # Extract only shp files from the zip file
                shp_files = [file for file in os.listdir('extracted_files') if file.endswith('.shp')][0]
                shp_files_path = os.path.join('extracted_files', shp_files)
                
                print(f"Extracted files: {shp_files_path}")

            # Check each SHP file columns
            for shp_files in shp_files_path:
                #file_path = os.path.join('extracted_files', file)
                #print(f"Processing file: {file_path}")
                try:
                    gdf = gpd.read_file(shp_files_path)
                    column_names = gdf.columns.tolist()
                    print(f"Column names:", column_names)
                    #print(f"First few rows in {file_path}: {df.head()}")
                except Exception as e:
                    logging.error(f"Error reading shapefile {shp_files_path}: {e}")
                    continue

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D" 
destination_path = 'cred.txt4'

check_shp_file(account_url, container_name, sas_token, destination_path)

Downloaded blob: cred.txt4/02 Data Submission v1.zip
Extracted files: extracted_files/Survey_Data_Possum_Observations.shp
Column names: ['Waypoint_N', 'Zone', 'mE', 'mN', 'Species', 'Abundance', 'F7', 'Date', 'Comment', 'FID_1', 'TaxonName', 'SiteName', 'Abundanc_1', 'MuseumRef', 'WAConStat', 'SRE_Sts', 'ObsMethod', 'FaunaType', 'DateObs', 'Author', 'Comments', 'Citation', 'geometry']
Column names: ['Waypoint_N', 'Zone', 'mE', 'mN', 'Species', 'Abundance', 'F7', 'Date', 'Comment', 'FID_1', 'TaxonName', 'SiteName', 'Abundanc_1', 'MuseumRef', 'WAConStat', 'SRE_Sts', 'ObsMethod', 'FaunaType', 'DateObs', 'Author', 'Comments', 'Citation', 'geometry']
Column names: ['Waypoint_N', 'Zone', 'mE', 'mN', 'Species', 'Abundance', 'F7', 'Date', 'Comment', 'FID_1', 'TaxonName', 'SiteName', 'Abundanc_1', 'MuseumRef', 'WAConStat', 'SRE_Sts', 'ObsMethod', 'FaunaType', 'DateObs', 'Author', 'Comments', 'Citation', 'geometry']
Column names: ['Waypoint_N', 'Zone', 'mE', 'mN', 'Species', 'Abundance', 'F7', '

Checking the number of '.shp' file extensions  in the zipped file

In [12]:
# Set up logging to log errors to a file instead of printing them out
logging.basicConfig(filename='error_log_2.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

def count_shp_files(account_url, container_name, sas_token, destination_path):
    # Create a connection to the container
    container_client = ContainerClient(account_url, container_name, credential=sas_token)

    # List all blobs in the container
    blob_list = container_client.list_blobs()

    # Initialize a counter for SHP files
    shp_files_count = 0

    for blob in blob_list:
        if blob.name.endswith('.zip'):
            # Create a connection to the blob
            blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)

            try:
                # Download the zip file in the blob storage
                blob_data = blob_client.download_blob().readall()
            except Exception as e:
                logging.error(f"Error downloading blob {blob.name}: {e}")
                continue

           # Extract the zip file
            with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
                # List all files in the zip archive
                all_files = zip_ref.namelist()

                
                # Count SHP files in the zip archive
                shp_files = [file for file in all_files if file.endswith('.shp')]
                shp_files_count += len(shp_files)
                
                # Log the count of SHP files
                logging.info(f"Found {len(shp_files)} SHP files in {blob.name}")
                print(f"Found {len(shp_files)} SHP files in {blob.name}")

    # Print the total count of SHP files
    print(f"Total number of SHP files: {shp_files_count}")
 
# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
destination_path = 'cred.txt4'

# Call the function
count_shp_files(account_url, container_name, sas_token, destination_path)

Found 3 SHP files in cred.txt4/02 Data Submission v1.zip
Found 0 SHP files in cred.txt4/0_10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 SHP files in cred.txt4/0_Biodiversity_GIS_Template_v319.gdb_20151013.zip
Found 3 SHP files in cred.txt4/0_ESRI Shapefile.zip
Found 4 SHP files in cred.txt4/0_IBSA-2019-0190.zip
Found 2 SHP files in cred.txt4/0_IBSA-2019-0191.zip
Found 3 SHP files in cred.txt4/0_IBSA-2020-0046.zip
Found 2 SHP files in cred.txt4/0_IBSA-2020-0047.zip
Found 4 SHP files in cred.txt4/0_IBSA-2020-0060.zip
Found 6 SHP files in cred.txt4/0_IBSA-2021-0267.zip
Found 3 SHP files in cred.txt4/0_Spatial data.zip
Found 2 SHP files in cred.txt4/0_Survey_Data.zip
Found 0 SHP files in cred.txt4/10001_Karijini_Helicopter_Vert_SRE_20150326_Data.zip
Found 0 SHP files in cred.txt4/10030_EasternRidge_TargetedSRE_data_final.gdb.zip
Found 0 SHP files in cred.txt4/10037 Central Pilbara Ghost Bat Data and Roost Asessment.zip
Found 4 SHP files in cred.txt4/10_ESRI Shapefile.zip
Fou

Creating a folder in the blob, for the '.SHP' files with the required column names

In [2]:
# Set environment variables
os.environ['SHAPE_RESTORE_SHX'] = 'YES'   # Set the '.shx' to yes. The '.shx' is essential for reading the shapefile(.shp) correctly
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'

logging.basicConfig(filename='error_log_6.txt', level=logging.ERROR, format='%(asctime)s:%(levelname)s:%(message)s')

# Validate the invalid geometry
def validate_geometry(geom):
    if isinstance(geom, Polygon) and not geom.is_valid:
        return geom.buffer(0)
    return geom

def process_blob(blob, account_url, container_name, sas_token):
    blob_client = BlobClient(account_url, container_name, blob.name, credential=sas_token)
    try:
        blob_data = blob_client.download_blob().readall()
    except Exception as e:
        logging.error(f"Error downloading blob {blob.name}: {e}")
        return None, None
    try:
        with zipfile.ZipFile(BytesIO(blob_data), 'r') as zip_ref:
            zip_ref.extractall('extracted_files')
            shp_files = [file for file in os.listdir('extracted_files') if file.endswith('.shp')]
    except zipfile.BadZipFile as e:
        logging.error(f"Bad zip file {blob.name}: {e}")
        return None, None

    shp_files_with_columns = []
    shp_files_without_columns = []

    for shp_file in shp_files:
        shp_file_path = os.path.join('extracted_files', shp_file)
        try:
            gdf = gpd.read_file(shp_file_path)
            gdf['geometry'] = gdf['geometry'].apply(validate_geometry)
            
            # Handle invalid dates
            gdf['DateObs'] = pd.to_datetime(gdf['DateObs'], format='%d/%m/%Y', errors='coerce')
            gdf = gdf.dropna(subset=['DateObs'])
            
        except Exception as e:
            logging.error(f"Error reading shapefile {shp_file_path}: {e}")
            continue

        if 'Measured 3D Point' in gdf.geom_type.unique(): # Convert the measured point in 3d to 2d
            gdf = gdf.set_geometry(gdf.geometry.apply(lambda geom: geom.to_2d()))

        required_columns = {'mE', 'mN', 'Species', 'TaxonName', 'DateObs'}
        if required_columns.issubset(gdf.columns):
            shp_files_with_columns.append(shp_file_path)
        else:
            shp_files_without_columns.append(shp_file_path)

    return shp_files_with_columns, shp_files_without_columns

def create_shp_file(account_url, container_name, sas_token, destination_path):
    container_client = ContainerClient(account_url, container_name, credential=sas_token)
    blob_list = container_client.list_blobs()

    shp_files_with_columns = []
    shp_files_without_columns = []

    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: # Helps to reduce the runtime of the code
        futures = [executor.submit(process_blob, blob, account_url, container_name, sas_token) for blob in blob_list]
        for future in concurrent.futures.as_completed(futures):
            try:
                with_columns, without_columns = future.result()
                if with_columns:
                    shp_files_with_columns.extend(with_columns)
                if without_columns:
                    shp_files_without_columns.extend(without_columns)
            except Exception as e:
                logging.error(f"Error processing blob: {e}")

    return shp_files_with_columns, shp_files_without_columns

def write_shp_to_blob(account_url, container_name, sas_token, folder_name, shp_files_paths):
    os.makedirs(folder_name, exist_ok=True)
    
    for shp_file_path in shp_files_paths:
        shp_file_name = os.path.basename(shp_file_path)
        destination_shp_file_path = os.path.join(folder_name, shp_file_name)
        
        logging.info(f"Copying from {shp_file_path} to {destination_shp_file_path}")
        print(f"Copying from {shp_file_path} to {destination_shp_file_path}")

        try:
            shutil.copy(shp_file_path, destination_shp_file_path)
        except FileNotFoundError as e:
            logging.error(f"File not found: {shp_file_path}")
            print(f"File not found: {shp_file_path}")
            continue

        blob_client = BlobClient(account_url, container_name, f"{folder_name}/{shp_file_name}", credential=sas_token)
        try:
            with open(destination_shp_file_path, 'rb') as data:
                blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True)
        except Exception as e:
            logging.error(f"Error uploading file {shp_file_name}: {e}")
            print(f"Error uploading file {shp_file_name}: {e}")
    
    shutil.rmtree('extracted_files')

# Input the parameters
account_url = "https://stproponentdata.blob.core.windows.net"
container_name = "ibsa"
sas_token = "?sv=2023-01-03&st=2024-08-14T03%3A01%3A42Z&se=2024-08-31T03%3A01%3A00Z&sr=c&sp=rwl&sig=dCK9f33N4GaBKLh5pgU7Wh7L17HRVSC4PJQhfpvj%2Bok%3D"
folder_name = "shp_submitable"
destination_path = 'cred.txt4'

# Call the first function and assign the create_shp_file function
shp_files_with_columns, shp_files_without_columns = create_shp_file(account_url, container_name, sas_token, destination_path)

# Call the write_to_blob function to upload the csv_files_with_columns to the created folder (folder_name)
write_shp_to_blob(account_url, container_name, sas_token, folder_name, shp_files_with_columns)

Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Data_CPS_8178-1_Habitat_Trees.shp
Copying from extracted_files/Survey_Data_CPS_8178-1_Habitat_Trees.shp to shp_submitable/Survey_Da

Convert the logged error (for the .'shp' file extensions) to a csv in other to view the corrupted files

In [3]:
import csv
import re

# Define the input log file and output CSV file
def convert_log_to_csv(log_file, csv_file):
    # Define a regular expression pattern to match the log entries
    log_pattern = re.compile(r'^(?P<timestamp>[^:]+):(?P<level>[^:]+):(?P<message>.+)$')
    
    # Open the log file and the CSV file
    with open(log_file, 'r') as log, open(csv_file, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Write the header row
        csv_writer.writerow(['Timestamp', 'Level', 'Message'])
        
        # Read and parse each line in the log file
        for line in log:
            match = log_pattern.match(line)
            if match:
                timestamp = match.group('timestamp')
                level = match.group('level')
                message = match.group('message')
                
                # Write the parsed log entry to the CSV file
                csv_writer.writerow([timestamp, level, message])


    print(f"Log file has been converted to {csv_file}")            
    # Print the location of the saved CSV file
    #print(f"CSV file saved at: {os.path.abspath(csv_file)}")

# Specify the parameter
log_file = 'error_log_6.txt'

# Call the function to convert the error log 3.txt to csv 
convert_log_to_csv(log_file, 'faulty_file3.csv')


Log file has been converted to faulty_file3.csv
