<a href="https://colab.research.google.com/github/frios2020/ETL-MADE-EASY/blob/main/Registro_de_Actuaciones_2016_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ETL of paperwork processed at the Peruvian Consulate in Paterson, New Jersey, from March 2016 to December 2022





In [None]:
import pandas as pd              # this module helps in processing CSV files
import glob                      # this module helps in selecting files 
from datetime import datetime    # this module helps to manipulate datetime fields 
from google.colab import drive   # this module helps to connect to google drive

## Set Paths


In [None]:
# File that receives every event-log line.
logfile = "logfile.txt"

# CSV file where the transformed data is stored.
targetfile = "transformed_data.csv"

# Glob pattern (inside the mounted Google Drive) matching the workbooks to process.
folder = "/content/drive/MyDrive/RAW DATA/ACTUACIONES/*.xlsx"

## Connecting Google Drive

In [None]:
# Mount Google Drive at /content/drive so the workbooks matched by `folder` are reachable.
drive.mount('/content/drive')

In [None]:
# glob returns matches in a file-system dependent order; sort them so the
# listing is the same on every run.
files = sorted(glob.glob(folder))
print("Total number of files: ", len(files))

In [None]:
# Show every workbook that was matched, one path per line.
for file_path in files:
  print(file_path)

# Extract

## XLSX Extract Function

In [None]:
def extract_from_xls(file_to_process):
    """Read one Excel workbook into a DataFrame.

    Parameters
    ----------
    file_to_process
        Path of the workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The workbook contents, with the 'Fecha Actuación' column parsed as
        datetimes instead of being left as text.
    """
    date_columns = ['Fecha Actuación']  # columns pandas should parse as datetime
    return pd.read_excel(file_to_process, parse_dates=date_columns)

In [None]:
def extract():
    """Read every workbook matching ``folder`` into one combined DataFrame.

    Each matching file is loaded with ``extract_from_xls`` and the results
    are stacked with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        All rows from all workbooks; an empty DataFrame when nothing matches.
    """
    # Sorted because glob's order depends on the file system; this keeps the
    # row order of the combined frame reproducible between runs.
    frames = [extract_from_xls(xlsfile) for xlsfile in sorted(glob.glob(folder))]

    if not frames:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame as the original loop did when no file matched.
        return pd.DataFrame()

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated data on every iteration.
    return pd.concat(frames, ignore_index=True)

## Cleaning and transforming data

In [None]:
#from numpy import datetime64
def transform(data):
  """Clean the extracted records and normalise column names and dtypes.

  Steps:
    1. Drop columns in which every value is NaN.
    2. Derive 'Fecha', 'Hora', 'Anio', 'Mes' and 'Dia' from 'Fecha Actuación'.
    3. Rename the source columns to ASCII snake-style names.
    4. Truncate 'Fecha_Actuacion' to whole minutes and cast the numeric and
       text columns to their final dtypes.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame produced by the extract step. It is not modified.

  Returns
  -------
  pandas.DataFrame
      A new, cleaned frame.
  """
  # Explicit copy: the caller's frame stays untouched and the column
  # assignments below cannot trigger chained-assignment warnings.
  cleaned = data.dropna(axis=1, how='all').copy()

  # Parse once and reuse; a no-op when the column is already datetime.
  stamp = pd.to_datetime(cleaned['Fecha Actuación'])

  cleaned['Fecha'] = stamp.dt.date              # calendar date only
  cleaned['Hora'] = stamp.dt.strftime('%H:%M')  # time of day as "HH:MM" text
  cleaned['Anio'] = stamp.dt.year
  cleaned['Mes'] = stamp.dt.month
  cleaned['Dia'] = stamp.dt.dayofweek           # Monday is 0 and Sunday is 6

  # Rename columns
  cleaned = cleaned.rename(columns={
      'N° Item': 'Num_Item',
      'Corr. Actuación': 'Corr_General',
      'Fecha Actuación': 'Fecha_Actuacion',
      'Nombre del Interesado': 'Nombres',
      'Autoadhesivo Consular': 'Autoadhesivo_Consular',
      'Naturaleza del Acto': 'Descripcion_Actuacion',
      'N° Tarifa': 'Num_Tarifa',
      'N° Actuación': 'Num_Actuacion',
      'Moneda Extranjera $': 'Moneda_Extranjera',
      'Soles Consular S/C': 'Soles_Consulares',
      'T. C. Consular': 'TC_Consular',
      'Observación': 'Observacion',
  })

  # Fix data types of columns.
  # Keep the timestamp at minute precision. astype('datetime64[m]') is
  # rejected by pandas >= 2.0 (only s/ms/us/ns units exist), so floor instead.
  cleaned['Fecha_Actuacion'] = stamp.dt.floor('min')

  # NOTE(review): the monetary columns and the exchange rate are cast to int,
  # which drops any decimal part - confirm the source values are always whole.
  cleaned = cleaned.astype({
      'Num_Item': int,
      'Corr_General': int,
      'Autoadhesivo_Consular': str,
      'Num_Actuacion': int,
      'Moneda_Extranjera': int,
      'Soles_Consulares': int,
      'TC_Consular': int,
  })

  return cleaned

## Load data

In [None]:
def load(targetfile,data_to_load):
    """Write ``data_to_load`` to ``targetfile`` as a UTF-16 encoded CSV.

    Parameters
    ----------
    targetfile
        Destination path handed to ``DataFrame.to_csv``.
    data_to_load : pandas.DataFrame
        Frame to persist; all other ``to_csv`` options stay at their defaults.
    """
    csv_options = {"encoding": "utf-16"}
    data_to_load.to_csv(targetfile, **csv_options)

## Logging

In [None]:
def log(message, log_path=None):
    """Append one timestamped line to the event log.

    Each line has the form ``<timestamp>,<message>``, where the timestamp is
    Year-Monthname-Day-Hour:Minute:Second of the current local time.

    Parameters
    ----------
    message : str
        Text to record.
    log_path : str, optional
        File to append to. Defaults to the notebook-level ``logfile`` setting,
        so the configured path is honoured rather than a duplicate literal.
    """
    if log_path is None:
        log_path = logfile

    # %b (abbreviated month name) is the portable spelling of the
    # platform-specific %h directive and produces the same text.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)

    with open(log_path, "a") as f:
        f.write(timestamp + ',' + message + '\n')

## Running ETL Process


In [None]:
# Record the start of the whole ETL run in the event log.
log("ETL Job Started")

In [None]:
# Extract: read all workbooks, logging when the phase starts and ends.
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
# Preview only: the full frame is large and holds personal names, so avoid
# rendering every row into the saved notebook output.
extracted_data.head()

In [None]:
# Transform: clean the extracted records, logging when the phase starts and ends.
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
# Preview only: the full frame is large and holds personal names, so avoid
# rendering every row into the saved notebook output.
transformed_data.head()

In [None]:
# Load: write the transformed frame to `targetfile`, logging when the phase starts and ends.
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")

In [None]:
# Record the end of the whole ETL run in the event log.
log("ETL Job Ended")