In [10]:
import glob
import pandas as pd
import elementpath
import xml.etree.ElementTree as ET
from datetime import datetime

In [12]:
# File names used by the ETL run (all relative to the working directory).
# NOTE(review): tmpfile is not referenced by any visible cell — presumably
# scratch storage for intermediate data; confirm or remove.
tmpfile = "dealership_temp.tmp"
# Text file that receives the timestamped ETL log entries.
logfile = "dealership_logfile.txt"
# CSV file the transformed data is written to by the load step.
targetfile = "dealership_transformed_data.csv"

## Extract

In [13]:
def extract_from_csv(file_to_process):
    """Read one CSV source into a DataFrame.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open buffer) of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using pandas' default CSV options.
    """
    return pd.read_csv(file_to_process)

def extract_from_json(file_to_process):
    """Read one JSON Lines source into a DataFrame.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open buffer) of a file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line.
    """
    return pd.read_json(file_to_process, lines=True)

def extract_from_xml(file_to_process):
    """Read one XML source into a DataFrame.

    Every child element of the document root is treated as one record and
    must contain ``car_model``, ``year_of_manufacture``, ``price`` and
    ``fuel`` sub-elements.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open binary buffer) of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``car_model``, ``year_of_manufacture`` (int), ``price``
        (float) and ``fuel``; empty (but with those columns) when the root
        has no children.

    Raises
    ------
    AttributeError
        If a record is missing one of the expected sub-elements.
    ValueError
        If ``year_of_manufacture`` or ``price`` is not numeric.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    root = ET.parse(file_to_process).getroot()

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = [
        {
            "car_model": car.find("car_model").text,
            "year_of_manufacture": int(car.find("year_of_manufacture").text),
            "price": float(car.find("price").text),
            "fuel": car.find("fuel").text,
        }
        for car in root
    ]
    return pd.DataFrame(records, columns=columns)

In [16]:
def extract():
    """Gather every CSV, JSON and XML file under ``datasource/`` into one frame.

    Files are read per format (CSV, then JSON, then XML) and, within a
    format, in sorted path order so the row order is reproducible.

    Returns
    -------
    pandas.DataFrame
        All rows from all sources with a fresh 0..n-1 index. The columns
        ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel`` come
        first; any additional columns found in the sources follow. The frame
        is empty (with those four columns) when no source yields any rows.
    """
    expected_columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    frames = []

    # glob order is filesystem-dependent; sorting keeps re-runs deterministic.
    for csv_file in sorted(glob.glob('datasource/*.csv')):
        frames.append(extract_from_csv(csv_file))

    for json_file in sorted(glob.glob('datasource/*.json')):
        frames.append(extract_from_json(json_file))

    for xml_file in sorted(glob.glob('datasource/*.xml')):
        frames.append(extract_from_xml(xml_file))

    # Empty frames contribute no rows and would only disturb dtype inference.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=expected_columns)

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    combined = pd.concat(frames, ignore_index=True)

    # Keep the expected columns first, as the original seed frame did.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

## Transform

In [17]:
def transform(data):
    """Return a copy of ``data`` with ``price`` rounded to two decimals.

    The input frame is left untouched so the extracted data stays available
    and re-running the cell always starts from the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, ``price`` rounded to 2 places.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    return data.assign(price=data['price'].round(2))

## Loading and Logging

In [21]:
def load(target_file, data_to_load):
    """Write the given frame out as CSV.

    Parameters
    ----------
    target_file : str or file-like
        Destination path or writable text buffer.
    data_to_load : pandas.DataFrame
        Frame to serialise; pandas' default CSV options are used, so the
        index is written as the first (unnamed) column.
    """
    data_to_load.to_csv(path_or_buf=target_file)

In [19]:
def log(message, log_file=None):
    """Append one timestamped entry to the ETL log file.

    Each entry is written as ``<timestamp>,<message>`` followed by exactly
    one newline.

    Parameters
    ----------
    message : str
        Text to record. Trailing newlines are dropped before writing, so
        callers that already end their message with a newline do not
        produce blank lines.
    log_file : str or path-like, optional
        File to append to. Defaults to the module-level ``logfile`` setting.
    """
    # %h is the POSIX alias for %b (abbreviated month name).
    timestamp_format = '%H:%M:%S-%h-%d-%Y'
    timestamp = datetime.now().strftime(timestamp_format)

    # Resolve the default lazily so the configured log file name is the
    # single source of truth instead of a second hard-coded literal.
    destination = logfile if log_file is None else log_file

    with open(destination, "a") as f:
        # The original appended the literal character 'n' rather than '\n'.
        f.write(timestamp + ',' + message.rstrip('\n') + '\n')

## Running ETL Process

In [23]:
# Run the pipeline end to end, logging the start and end of every phase.
# Every message ends with "\n" so all log entries are terminated the same
# way; the final message previously lacked it and ran into the next run's
# first entry.
log("ETL Job Started\n")

# Extract: read every source file under datasource/ into one frame.
log("Extract phase Started\n")
extracted_data = extract()
log("Extract phase Ended\n")

# Transform: apply the price clean-up to the extracted rows.
log("Transform phase Started\n")
transformed_data = transform(extracted_data)
log("Transform phase Ended\n")

# Load: persist the transformed rows to the configured target CSV.
log("Load phase Started\n")
load(targetfile, transformed_data)
log("Load phase Ended\n")

log("ETL Job Ended\n")