# In [10]:
import pandas as pd
import xml.etree.ElementTree as ET
import logging
from datetime import datetime
import glob
import importlib

# Reset logging configuration.
# logging.basicConfig() does nothing when the root logger already has
# handlers, which is the case when this code is executed a second time in the
# same interpreter session. Shutting logging down and reloading the module
# gives a fresh root logger so the configuration below takes effect.
# NOTE(review): on Python 3.8+ `logging.basicConfig(..., force=True)` achieves
# the same without reloading the module - confirm the target Python version
# before switching.
logging.shutdown()
importlib.reload(logging)

# Configure logging: records at INFO level and above go to the file below,
# each formatted as "<timestamp> - <level name> - <message>".
# NOTE(review): the log file lives in the same folder the pipeline scans for
# input; it is ignored there only because its extension is .txt.
logging.basicConfig(
    filename="C:/Users/91798/Desktop/ME36/source/log_file.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def extract_csv(file_path):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        file_path: Location of the CSV file, passed straight to pandas.

    Returns:
        One dict per data row, keyed by column name.
    """
    began = datetime.now()
    logging.info("Extracting data from CSV file: %s", file_path)
    frame = pd.read_csv(file_path)
    elapsed = datetime.now() - began
    logging.info("Finished extracting CSV file in %s", elapsed)
    records = frame.to_dict(orient='records')
    return records

def extract_json(file_path):
    """Read a JSON Lines file and return its objects as a list of dicts.

    The file is parsed with ``lines=True``, i.e. one JSON object per line.

    Args:
        file_path: Location of the JSON Lines file, passed straight to pandas.

    Returns:
        One dict per line of the file, keyed by field name.
    """
    began = datetime.now()
    logging.info("Extracting data from JSON file: %s", file_path)
    frame = pd.read_json(file_path, lines=True)
    elapsed = datetime.now() - began
    logging.info("Finished extracting JSON file in %s", elapsed)
    records = frame.to_dict(orient='records')
    return records

def extract_xml(file_path):
    """Read an XML file and return one dict per child of the root element.

    Each direct child of the root is treated as a record; every sub-element of
    that child contributes one ``tag -> text`` entry. Values are the raw
    element text (strings, or ``None`` for an element with no text).

    Args:
        file_path: Location of the XML file.

    Returns:
        List of dicts, in document order.
    """
    began = datetime.now()
    logging.info("Extracting data from XML file: %s", file_path)
    root = ET.parse(file_path).getroot()
    records = [
        {field.tag: field.text for field in entry}
        for entry in root
    ]
    logging.info("Finished extracting XML file in %s", datetime.now() - began)
    return records

def extract_files(folder_path):
    """Extract records from every CSV, JSON and XML file directly inside a folder.

    Files are visited in sorted path order, because ``glob`` makes no promise
    about ordering and the output row order should not change between runs.
    The extension check ignores case, so ``DATA.CSV`` is handled like
    ``data.csv``. Files with any other extension are skipped. Sub-folders are
    not searched.

    Every matching file is read, including a CSV written into the same folder
    by an earlier run.

    Args:
        folder_path: Directory to scan.

    Returns:
        A single list holding the records of all recognised files. Duplicate
        records are not removed.
    """
    start_time = datetime.now()
    logging.info("Extracting data from files in folder: %s", folder_path)
    combined_data = []
    # Escape the folder name so characters such as "[" or "*" in it are taken
    # literally instead of being interpreted as wildcard syntax.
    pattern = f"{glob.escape(str(folder_path))}/*"
    for file_path in sorted(glob.glob(pattern)):
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            combined_data.extend(extract_csv(file_path))
        elif lowered.endswith('.json'):
            combined_data.extend(extract_json(file_path))
        elif lowered.endswith('.xml'):
            combined_data.extend(extract_xml(file_path))

    logging.info("Finished extracting all files in %s", datetime.now() - start_time)
    return combined_data

def transform(data):
    """Convert ``height`` and ``weight`` fields to metric units.

    ``height`` is multiplied by 0.0254 (inches to meters) and ``weight`` by
    0.453592 (pounds to kilograms). All other fields are copied unchanged.
    The input records are not modified; new dicts are returned.

    A ``height``/``weight`` value that is ``None`` or a blank string is
    treated as missing and stored as ``None`` rather than raising, so one
    empty field does not abort the whole run.

    Args:
        data: Iterable of dict records.

    Returns:
        List of new dicts with the converted values.

    Raises:
        ValueError: If a non-blank ``height``/``weight`` value cannot be
            parsed as a number.
    """
    # Multiplier applied to each field that needs a unit conversion.
    factors = {
        'height': 0.0254,    # inches -> meters
        'weight': 0.453592,  # pounds -> kilograms
    }
    start_time = datetime.now()
    logging.info("Transforming data")
    transformed_data = []
    for record in data:
        transformed_record = {}
        for key, value in record.items():
            factor = factors.get(key)
            if factor is None:
                transformed_record[key] = value
            elif value is None or (isinstance(value, str) and not value.strip()):
                # Missing measurement: keep it missing instead of crashing
                # in float().
                transformed_record[key] = None
            else:
                transformed_record[key] = float(value) * factor
        transformed_data.append(transformed_record)
    logging.info("Finished transforming data in %s", datetime.now() - start_time)
    return transformed_data

def load_csv(data, output_file):
    """Write records to a CSV file, without an index column.

    When ``data`` is empty a warning is logged and no file is written.

    Args:
        data: Records to write (list of dicts).
        output_file: Destination path of the CSV file.

    Returns:
        None.
    """
    began = datetime.now()
    logging.info("Loading data into CSV file: %s", output_file)
    if data:
        pd.DataFrame(data).to_csv(output_file, index=False)
        elapsed = datetime.now() - began
        logging.info("Data successfully loaded into %s in %s", output_file, elapsed)
    else:
        logging.warning("No data to load")

def etl_pipeline(folder_path, output_file):
    """Run the full extract -> transform -> load sequence.

    Records are gathered from ``folder_path`` with ``extract_files``, passed
    through ``transform`` and written to ``output_file`` with ``load_csv``.
    A start and a completion message are logged around the three phases.

    Args:
        folder_path: Folder containing the source files.
        output_file: Path of the CSV file to write.
    """
    logging.info("Starting ETL pipeline")

    # Extract, transform and load, each phase feeding the next.
    load_csv(transform(extract_files(folder_path)), output_file)

    logging.info("ETL pipeline completed successfully")

# Example usage
if __name__ == "__main__":
    # The output is written next to the source folder, not inside it: the
    # pipeline reads every .csv file found in folder_path, so an output file
    # placed there would be picked up as input on the next run, duplicating
    # its rows and converting the already-converted values a second time.
    etl_pipeline(
        folder_path="C:/Users/91798/Desktop/ME36/source",
        output_file="C:/Users/91798/Desktop/ME36/transformed_data.csv"
    )
