# Extract, Transform, and Load Data using Python

# Introduction
Extract, Transform, and Load (ETL) operations are central to the role of a data engineer. A data engineer extracts data from multiple sources and file formats, transforms the extracted data into a predefined format, and then loads it into a database for further processing. In this lab, you will get hands-on practice performing these operations.

# Objectives
After completing this lab, you will be able to:

Read CSV, JSON, and XML file types.
Extract the required data from the different file types.
Transform data to the required format.
Save the transformed data in a ready-to-load format, which can be loaded into an RDBMS.

# Gather the data files

In [1]:
import os
import requests
import zipfile

# Step 1: Download the ZIP archive

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip"
zip_file_name = "source.zip"

# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never saved as source.zip (which would only
# surface later as a confusing BadZipFile error during extraction).
response = requests.get(url, timeout=60)
response.raise_for_status()

# Write the raw bytes of the response to the local file source.zip.
with open(zip_file_name, 'wb') as file:
    file.write(response.content)

print("ZIP file downloaded.")

# Step 2: Unzip the archive into the local directory source_data

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall("source_data")

print("ZIP file extracted.")

# Step 3: Verify the files

print("Extracted files:")

print(os.listdir("source_data"))

# os.listdir() is a function in the os module which is used to list all the files and directories in a specified directory. 
# It provides an easy way to inspect the contents of a folder.

ZIP file downloaded.
ZIP file extracted.
Extracted files:
['source1.csv', 'source1.json', 'source1.xml', 'source2.csv', 'source2.json', 'source2.xml', 'source3.csv', 'source3.json', 'source3.xml']


# Extraction

Imports

In [2]:
import glob 
import pandas as pd 
import xml.etree.ElementTree as ET 
from datetime import datetime 

Log File and Target File

In [3]:
log_file = "log_file.txt" 
target_file = "transformed_data.csv" 

The functions below extract data from CSV, JSON, and XML files and consolidate it into a single DataFrame.

In [4]:
def extract_from_csv(file_to_process):
    """Read one CSV source and return its contents as a DataFrame."""
    return pd.read_csv(file_to_process)
  
def extract_from_json(file_to_process):
    """Read a JSON Lines source (one JSON object per line) into a DataFrame."""
    return pd.read_json(file_to_process, lines=True)

# The parameter lines = True specifies that the file contains JSON objects on separate lines.
  
def extract_from_xml(file_to_process):
    """Parse an XML source into a DataFrame with name, height and weight columns.

    Every direct child of the XML root is treated as one record and must
    contain ``name``, ``height`` and ``weight`` sub-elements; height and
    weight are converted to float.

    Args:
        file_to_process: Path or file object accepted by ``ET.parse``.

    Returns:
        A DataFrame with one row per record, in document order. A root with
        no children yields an empty DataFrame that still has the three columns.
    """
    columns = ["name", "height", "weight"]

    root = ET.parse(file_to_process).getroot()

    # Collect all rows first and build the DataFrame once: concatenating a
    # one-row frame per record copies the accumulated data on every iteration.
    rows = []
    for person in root:
        rows.append(
            {
                "name": person.find("name").text,
                "height": float(person.find("height").text),
                "weight": float(person.find("weight").text),
            }
        )

    return pd.DataFrame(rows, columns=columns)

In [5]:
def extract(): 
    extracted_data = pd.DataFrame(columns = ['name', 'height', 'weight'])  
    
# create an empty data frame to hold extracted data 
     
    # process all csv files 
    for csvfile in glob.glob("source_data/*.csv"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index = True) 
         
    # process all json files 
    for jsonfile in glob.glob("source_data/*.json"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_json(jsonfile))], ignore_index = True) 
     
    # process all xml files 
    for xmlfile in glob.glob("source_data/*.xml"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_xml(xmlfile))], ignore_index = True) 
         
    return extracted_data 

# Transformation

In [6]:
def transform(data):
    """Return a copy of ``data`` with height in meters and weight in kilograms.

    The ``height`` column is multiplied by 0.0254 (inches to meters) and the
    ``weight`` column by 0.45359237 (pounds to kilograms); both results are
    rounded to two decimals. The input DataFrame is left untouched, so calling
    this function again on the same extracted data does not convert twice.

    Args:
        data: DataFrame with ``height`` and ``weight`` columns.

    Returns:
        A new DataFrame with the same columns and index as ``data``.

    Raises:
        KeyError: If ``data`` has no ``height`` or ``weight`` column.
    """
    meters_per_inch = 0.0254
    kilograms_per_pound = 0.45359237

    # assign() builds a new frame rather than writing into the caller's object.
    return data.assign(
        height=(data["height"] * meters_per_inch).round(2),
        weight=(data["weight"] * kilograms_per_pound).round(2),
    )

# Loading and Logging

In [7]:
def load_data(target_file, transformed_data):
    """Write ``transformed_data`` to ``target_file`` as CSV, omitting the index column."""
    transformed_data.to_csv(path_or_buf=target_file, index=False)

    
# Saves the transformed data to a CSV file
  
def log_progress(message, log_path=None):
    """Append a timestamped entry to the ETL log.

    Each entry is written as ``<timestamp>,<message>`` on its own line, with
    the timestamp formatted as Year-Monthname-Day-Hour:Minute:Second.

    Args:
        message: Text to record.
        log_path: Optional path of the log file. When omitted, the
            module-level ``log_file`` is used.
    """
    # %b is the directive Python documents for the abbreviated month name;
    # %h is a C-library alias that Python does not guarantee on every platform.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'

    timestamp = datetime.now().strftime(timestamp_format)

    destination = log_file if log_path is None else log_path

    # Append mode keeps the entries from earlier runs.
    with open(destination, "a") as f:
        f.write(f"{timestamp},{message}\n")

# Testing ETL operations and log progress

In [8]:
# Run the full ETL pipeline, writing a log entry at every phase boundary.
log_progress("ETL Job Started")

# --- Extract: read every source file into one DataFrame ---
log_progress("Extract phase Started")
extracted_data = extract()
log_progress("Extract phase Ended")

# --- Transform: convert the units and show the result ---
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data", transformed_data, sep="\n")
log_progress("Transform phase Ended")

# --- Load: write the transformed data to the target CSV file ---
log_progress("Load phase Started")
load_data(target_file, transformed_data)
log_progress("Load phase Ended")

# The whole job is finished.
log_progress("ETL Job Ended")

Transformed Data
     name  height  weight
0    alex    1.67   51.25
1    ajay    1.82   61.91
2   alice    1.76   69.41
3    ravi    1.73   64.56
4     joe    1.72   65.45
5    alex    1.67   51.25
6    ajay    1.82   61.91
7   alice    1.76   69.41
8    ravi    1.73   64.56
9     joe    1.72   65.45
10   alex    1.67   51.25
11   ajay    1.82   61.91
12  alice    1.76   69.41
13   ravi    1.73   64.56
14    joe    1.72   65.45
15   jack    1.74   55.93
16    tom    1.77   64.18
17  tracy    1.78   61.90
18   john    1.72   50.97
19   jack    1.74   55.93
20    tom    1.77   64.18
21  tracy    1.78   61.90
22   john    1.72   50.97
23   jack    1.74   55.93
24    tom    1.77   64.18
25  tracy    1.78   61.90
26   john    1.72   50.97
27  simon    1.72   50.97
28  jacob    1.70   54.73
29  cindy    1.69   57.81
30   ivan    1.72   51.77
31  simon    1.72   50.97
32  jacob    1.70   54.73
33  cindy    1.69   57.81
34   ivan    1.72   51.77
35  simon    1.72   50.97
36  jacob    1.70   5