# In [None]:
"""
Title: Sales and Inventory Data ETL Pipeline

Description:
This script demonstrates a range of data engineering skills, including:
- **Database Connection and Query**: Establishes a connection to a PostgreSQL database and performs SQL queries.
- **Data Manipulation and Transformation**: Utilizes pandas to manipulate and clean data.
- **API Integration**: Integrates with Salesforce API, showcasing the ability to work with different systems.
- **Process Automation**: Automates the workflow of data extraction, transformation, and loading (ETL), exporting results to Excel and CSV files.

These capabilities are essential for any data engineering role and make this script a valuable addition to a data engineering portfolio.
"""

import requests
import json
import pandas as pd
import psycopg2
import numpy as np

# Open a new PostgreSQL connection with the pipeline's connection settings.
def connect_db():
    """Return a new psycopg2 connection to the configured database.

    The caller owns the returned connection and must close it.
    """
    settings = {
        'host': 'your-host',
        'database': 'your-database',
        'user': 'your-user',
        'password': 'your-password',
    }
    return psycopg2.connect(**settings)

def query_db(sql, params=None):
    """Execute a SQL statement on a fresh connection and return every row.

    Args:
        sql: SQL text to execute.
        params: Optional sequence or mapping of values for the driver to bind
            to placeholders in ``sql``. Defaults to None, which executes
            ``sql`` as-is, exactly as a call without this argument did before.

    Returns:
        A list containing the rows produced by ``cursor.fetchall()``.

    Raises:
        Whatever the driver raises while connecting, executing or fetching;
        the cursor and connection are closed before the error propagates.
    """
    con = connect_db()
    try:
        cur = con.cursor()
        try:
            cur.execute(sql, params)
            # fetchall() already materialises all rows; list() only guarantees
            # the caller receives a plain, independent list.
            return list(cur.fetchall())
        finally:
            cur.close()
    finally:
        # Always release the connection, including when the query fails.
        con.close()

# Specifier sales extract. The CTE keeps orders sold between 2022-09-09 and
# 2022-09-13 that have no cancellation date and do have a specifier id; the
# outer select joins the specifier's account and record type to pick the
# CPF/CNPJ column and to build the "<nr_pedido>Specifier" code.
_specifier_sql = """
 with consulta as (
 select distinct to_char(dt_venda, 'dd/mm/yyyy') as sale_date,
                 to_char(dt_create, 'dd/mm/yyyy') as delivery_date,
                 id_order,
                 nr_pedido,
                 id_vendedor,
                 id_loja,
                 id_especificador,
                 id_cliente,
                 round(cast(vl_vnp as numeric), 2) + vl_frete as value
   from your_schema.stage_order so
  where dt_venda between '2022-09-09' and '2022-09-13'
    and dt_cancelamento is null and id_especificador is not null
)
select case dr.ds_name
            when 'Especificador' then da.nr_cpf
            when 'Escritório Especificador' then da.cnpj__c
            when 'Cliente PF' then da.nr_cpf
            when 'Cliente PJ' then da.cnpj__c
       end  as cpfcnpj,
       UPPER(da.nm_account) as name,
       lower(da.ds_email)  as email,
       c.sale_date as "date",
       c.delivery_date as "delivery date",
       c.value as "value",
       id_order,
       nr_pedido,
       nr_pedido || 'Specifier' as "code"
  from consulta c
  left join your_schema.dim_account da on da.id = c.id_especificador
  left join your_schema.dim_recordtype dr on dr.id = da.id_recordtype
"""

# Column labels, in the same order as the outer select list above.
_specifier_columns = [
    'cpfcnpj',
    'name',
    'email',
    'date',
    'delivery date',
    'value',
    'id_order',
    'nr_pedido',
    'code',
]

bdespec = pd.DataFrame(query_db(_specifier_sql), columns=_specifier_columns)
bdespec.head()

# Switch the process locale to en_US and have pandas render floats through
# locale.currency when it displays them.
import locale
locale.setlocale(locale.LC_ALL, 'en_US.utf8')
pd.set_option("float_format", locale.currency)

# Log in to Salesforce; `sf` is the client used for the SOQL queries below.
from simple_salesforce import Salesforce
sf = Salesforce(
username='your-username',
password='your-password',
security_token='your-security-token')

# NOTE(review): `s` is not assigned anywhere above this line in this file (its
# first visible assignment is in the later OrderItem query), so running the
# script top to bottom raises NameError here. TODO: confirm which SOQL query
# this step is meant to run and define it before this call.
sf_data = sf.query_all(s)

def clean_crlf(validationData, field):
    """Remove carriage returns and line feeds from string values of records.

    Every string value in each record of ``validationData[field]`` has
    ``'\\r'`` replaced by a space and ``'\\n'`` removed. Non-string values
    (numbers, None, nested mappings, ...) are left untouched.

    Args:
        validationData: Mapping that holds the records under ``field``.
        field: Key of the record list inside ``validationData``.

    Returns:
        A new dict with the single key ``field`` mapped to the record list.
        The records are cleaned in place, so the returned list is the same
        object as ``validationData[field]``.
    """
    records = validationData[field]
    for record in records:
        for key, value in record.items():
            # Only strings can contain CR/LF. An explicit type check replaces
            # the former bare `except: pass`, which also hid unrelated errors.
            if isinstance(value, str):
                # Re-assigning an existing key while iterating is safe: the
                # dict's size does not change.
                record[key] = value.replace('\r', ' ').replace('\n', '')
    return {field: records}

# Strip CR/LF from the Salesforce records, then load them into a DataFrame
# without the per-record 'attributes' metadata column.
sf_data = clean_crlf(sf_data, "records")
sf_df = pd.DataFrame(sf_data["records"]).drop(columns='attributes')
# Remove ';' from every string cell (';' is the separator of the CSV export
# below). to_replace must be a {pattern: replacement} mapping; the previous
# `{';', ''}` was a set literal, which does not describe that substitution.
sf_df = sf_df.replace({';': ''}, regex=True)
# sf_df.to_csv('path_to_save\\order_delta.csv', encoding="utf-8", sep=';',  index=False)

sf_df.head()

# SOQL for order items modified in the last 30 days: fields, object and
# filter are kept in separate variables and concatenated into `s`.
t = "Id, OrderId, Production_Prediction__c"
f = "OrderItem"
w = " WHERE LastModifiedDate >= LAST_N_DAYS:30 "
s = "SELECT " + t + " FROM " + f + w

sf_data = sf.query_all(s)
# Load the records without the per-record 'attributes' metadata column.
sf_df = pd.DataFrame(sf_data['records']).drop(columns='attributes')
# Remove ';' from every string cell (';' is the separator of the CSV export
# below). to_replace must be a {pattern: replacement} mapping; the previous
# `{';', ''}` was a set literal, which does not describe that substitution.
sf_df = sf_df.replace({';': ''}, regex=True)
# sf_df.to_csv('path_to_save\\orderItem_delta.csv', encoding="utf-8",  sep=';', index=False)

sf_df.head()

# Attach the Salesforce order-item fields to each specifier sale. The left
# join keeps sales that have no matching OrderId (their Salesforce columns
# are then missing).
bdespecx = bdespec.merge(sf_df, how='left', left_on='id_order', right_on='OrderId')
bdespecx.head()

# Intermediate export of the merged frame, written before the derived columns
# below are added.
bdespecx.to_excel('path_to_save\\specifiers_test2.xlsx',  index=False)

# Sales whose item is flagged 'IN STOCK' count double; everything else
# (including rows without a Salesforce match) counts once. The comparison is
# an equality test: the previous `>=` compared the label lexicographically,
# so any status sorting after 'IN STOCK' was doubled as well.
# NOTE(review): confirm 'IN STOCK' is the exact stored value (case/spacing).
bdespecx['Multiplier'] = np.where(bdespecx['Production_Prediction__c'] == 'IN STOCK', 2, 1)

bdespecx['Adjusted_Value'] = bdespecx['value'] * bdespecx['Multiplier']
bdespecx.head()

# Strip punctuation from the names: - : ( ) [ ] . * + & /
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which made the old alternation pattern a
# silent no-op. A single character class covers every character that the old
# pattern removed.
bdespecx['name'] = bdespecx['name'].str.replace(r'[-:()\[\].*+&/]', '', regex=True)
bdespecx.head()

# Replace accented letters with their unaccented form in one pass. The table
# holds exactly the substitutions the former chain of .str.replace calls made.
_accent_table = str.maketrans({
    'Ç': 'C',
    'é': 'e',
    'Ã': 'A',
    'Á': 'A',
    'Ê': 'E',
    'É': 'E',
    'Í': 'I',
    'Ú': 'U',
    'À': 'A',
    'à': 'a',
    'á': 'a',
    'ú': 'u',
    'í': 'i',
    'ó': 'o',
    'ã': 'a',
    'õ': 'o',
    'Õ': 'O',
})
bdespecx['name'] = bdespecx['name'].str.translate(_accent_table)
bdespecx.head()

# Keep one row per specifier code. drop_duplicates returns a new frame, so
# the result has to be assigned; previously `bdespec_f` was never created and
# the de-duplicated result was discarded.
bdespec_f = bdespecx.drop_duplicates(subset=['code'])

bdespec_f.to_excel('path_to_save\\specifiers_test1.xlsx',  index=False)

# Customer sales extract: orders sold between 2022-09-09 and 2022-09-13 that
# have no cancellation date. The result is the raw list of rows returned by
# query_db.
# The statement previously ended right after the CTE, which is not a complete
# SQL statement; a main query over the CTE is required for it to execute.
# NOTE(review): `select *` returns the CTE columns as-is — confirm whether a
# join/projection like the specifier query's was intended here.
bdcliente = query_db("""
 with consulta as (
 select distinct to_char(dt_venda, 'dd/mm/yyyy') as sale_date,
                 to_char(dt_create, 'dd/mm/yyyy') as delivery_date,
                 nr_pedido,
                 id_vendedor,
                 id_loja,
                 id_especificador,
                 id_cliente,
                 round(cast(vl_vnp as numeric), 2) + vl_frete as value
   from your_schema.stage_order so
  where dt_venda between '2022-09-09' and '2022-09-13'
    and dt_cancelamento is null
)
select *
  from consulta
""")

# Log in to Salesforce again and fetch the accounts of one record type
# together with their Store_Manager__c field.
from simple_salesforce import Salesforce
sf = Salesforce(
username='your-username',
password='your-password',
security_token='your-security-token')
t = "Id,Name,Store_Manager__c"
f = "Account"
w = " WHERE RecordTypeId = 'your-recordtype-id' "
s = "SELECT " + t + " FROM " + f + w
sf_data = sf.query_all(s)
# Load the records without the per-record 'attributes' metadata column.
sf_df = pd.DataFrame(sf_data['records']).drop(columns='attributes')
# Remove ';' from every string cell (';' is the separator of the CSV export
# below). to_replace must be a {pattern: replacement} mapping; the previous
# `{';', ''}` was a set literal, which does not describe that substitution.
sf_df = sf_df.replace({';': ''}, regex=True)
# sf_df.to_csv('path_to_save\\orderItem_delta.csv',encoding="utf-8",  sep=';', index=False)

sf_df.head()
# Left-join the Salesforce Account rows in `sf_df` onto the manager sales
# frame, matching its 'id_store' column against the Account 'Id'.
# NOTE(review): `bdgerente_venda` is not defined anywhere above this line in
# this file, so this raises NameError when the script runs top to bottom.
# TODO: confirm where that frame is built and that it has an 'id_store' column.
bdgerente2 = bdgerente_venda.merge(sf_df, how='left', left_on='id_store', right_on='Id')
bdgerente2.head()

# Users joined to their contact record: one distinct row per user id with the
# contact's name, CPF and e-mail (left join, so users without a contact are
# kept with empty contact fields).
_manager_user_sql = """
                select distinct du.id as user_id,
                        dc.name as manager_name,
                        dc.cpf__c as manager_cpf,
                        dc.email as manager_email
                from your_schema.dim_users du
                left join your_schema.dim_contact dc on dc.id = du.id_contact
"""

# Column labels, in the same order as the select list above.
_manager_user_columns = [
    'user_id',
    'manager_name',
    'manager_cpf',
    'manager_email',
]

bdgerente_user = query_db(_manager_user_sql)
bdgerente_user2 = pd.DataFrame(bdgerente_user, columns=_manager_user_columns)
bdgerente_user2.head()
# NOTE(review): `bdgerente` is not defined anywhere above this line in this
# file. TODO: confirm where it is built and that it carries the 'value' and
# 'ProductsInStock' columns used below.
# 'Multiplied' holds each row's value doubled.
bdgerente['Multiplied'] = bdgerente['value'].apply(lambda x: x * 2)
bdgerente.head()

# Choose per row between the doubled and the plain value:
#   ProductsInStock < 0 -> 'Multiplied', ProductsInStock > 0 -> 'value'.
# Rows matching neither condition (e.g. ProductsInStock == 0 or missing) get
# np.select's default, since no `default=` is passed.
# NOTE(review): the result is stored in 'Multiplier' although it holds an
# amount rather than a factor — confirm the intended column name and the
# direction of the stock test.
conditions = [(bdgerente['ProductsInStock'] < 0), (bdgerente['ProductsInStock'] > 0)]
values = bdgerente['Multiplied'], bdgerente['value']
bdgerente['Multiplier'] = np.select(conditions, values)
bdgerente.head()

# Export the manager frame as a ';'-separated UTF-8 CSV without the index.
bdgerente.to_csv('path_to_save\\manager_test.csv', encoding="utf-8", sep=';', index=False)
