## Extraction with Glob Library

## Extract CSV using Pandas

## Extract JSON using Pandas

## Remember, these ETL techiniques are used with files (xml, json, csv ...), there are some other ETL techinques to process semi, unstructured data, then transform and load into data repository.

In [30]:
# Example of using ETL techinques with xml, csv, json file processing:
import pandas as pd 
import xml.etree.ElementTree as ET #retrieve data from XML file
from datetime import datetime 

log_file = "log_file.txt" 
target_file = "transformed_data.csv"

# EXTRACTION
# function extract_from_csv to extract data from csv file:
def extract_from_csv(csv_file):
    dataframe = pd.read_csv(csv_file)
    return dataframe

# function extract_from_json to extract data from json file:
def extract_from_json(json_file):
    dataframe = pd.read_json(json_file)
    return dataframe

# function extract_from_xml to extract data from xml file:
# first, parse the data from the file using the element function
# then extract relevant information from this data
# and append to pandas dataframe
def extract_from_xml(xml_file):
    df = pd.DataFrame(columns=["name", "height", "weight"])
    tree = ET.parse(xml_file)
    root = tree.getroot()
    for person in root:
        name = person.find("name").text
        height = float(person.find("height")).text
        weight = float(person.find("weight")).text
        df = pd.concat([df, pd.DataFrame([{"name" : name, "height":height, "weight" : weight}])], ignore_index = True)
    return df

# extract function:
def extract(): 
    extracted_data = pd.DataFrame(columns=['name','height','weight']) # create an empty data frame to hold extracted data 
     
    # process all csv files 
    for csvfile in glob.glob("*.csv"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index=True) 
         
    # process all json files 
    for jsonfile in glob.glob("*.json"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_json(jsonfile))], ignore_index=True) 
     
    # process all xml files 
    for xmlfile in glob.glob("*.xml"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_xml(xmlfile))], ignore_index=True) 
         
    return extracted_data

# TRANSFORMATION :
# Convert data if needed :
def transform(data): 
    # Convert inches to meters and round off to two decimals 
    # 1 inch is 0.0254 meters 
    data['height'] = round(data.height * 0.0254,2) 
    
    # Convert pounds to kilograms and round off to two decimals 
    # 1 pound is 0.45359237 kilograms 
    data['weight'] = round(data.weight * 0.45359237,2)
     
    return data 

# LOADING AND LOGGINg:
# Load data into a csv file:
def load_data(target_file, transformed_data):
    transformed_data.to_csv(target_file)

# Logging :
def log_progress(message):
    timestamp_format = '%Y-%h-%d-%H:%M:%S'
    # Year-Month-Day-Hour-Minute-Second
    now = datetime.now()
    timestamp = now.strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + " " + message + '\n')
    
# TEST THE ETL PROCESS:
# Log the initialization of the ETL process 
log_progress("ETL Job Started") 
 
# Log the beginning of the Extraction process 
log_progress("Extract phase Started") 
extracted_data = extract() 
 
# Log the completion of the Extraction process 
log_progress("Extract phase Ended") 
 
# Log the beginning of the Transformation process 
log_progress("Transform phase Started") 
transformed_data = transform(extracted_data) 
print("Transformed Data") 
print(transformed_data) 
 
# Log the completion of the Transformation process 
log_progress("Transform phase Ended") 
 
# Log the beginning of the Loading process 
log_progress("Load phase Started") 
load_data(target_file,transformed_data) 
 
# Log the completion of the Loading process 
log_progress("Load phase Ended") 
 
# Log the completion of the ETL process 
log_progress("ETL Job Ended")


ValueError: Trailing data