# ETL Processes

This notebook creates the tables in the Nubank Postgres database and runs the ETL process that loads the data from the CSV files into the Postgres DB.

I created it to run a few tests of the queries used in the ```case_answers``` Jupyter notebook, which is in the same folder.

If you want to use this notebook to run the queries, follow these steps:

1. Install Docker if you don't have it. You can find instructions [here](https://docs.docker.com/desktop/).

2. Run the command ```docker run -p 5432:5432 --name nubank -e POSTGRES_PASSWORD=nubank -d postgres``` to create a Docker container named `nubank` from the postgres image.

3. Run the cells below in this notebook to create the tables in the nubank database.

In [4]:
import os
import glob
import psycopg2

import pandas as pd
from sql_queries import *

In [5]:
# Open a session without naming a database. Autocommit is switched on so that
# every statement issued through `cur` takes effect immediately instead of
# being wrapped in a transaction.
conn = psycopg2.connect("host=127.0.0.1 user=postgres password=nubank")
conn.autocommit = True
cur = conn.cursor()

In [6]:
# Rebuild the nubank database from scratch on the current session.
for admin_statement in (
    "DROP DATABASE IF EXISTS nubank",
    "CREATE DATABASE nubank WITH ENCODING 'utf8' TEMPLATE template0",
):
    cur.execute(admin_statement)

# The initial session is no longer needed once the database exists.
conn.close()

# Rebind conn/cur to the freshly created nubank database; later cells use these.
conn = psycopg2.connect("host=127.0.0.1 dbname=nubank user=postgres password=nubank")
cur = conn.cursor()

In [7]:
#conn = psycopg2.connect("host=127.0.0.1 dbname=nubank user=postgres password=nubank")
#cur = conn.cursor()

In [8]:
# Run everything in drop_table_queries first, then everything in
# create_table_queries, committing after each statement.
for query in list(drop_table_queries) + list(create_table_queries):
    cur.execute(query)
    conn.commit()

In [9]:
#VERIFY THE DB TABLES
# Sanity check: list the ordinary relations (relkind 'r') whose names do not
# start with pg_ or sql_.
cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
existing_tables = cur.fetchall()
print(existing_tables)

[('accounts_table',), ('city_table',), ('customers_table',), ('country_table',), ('d_month_table',), ('d_time_table',), ('d_week_table',), ('d_weekday_table',), ('d_year_table',), ('pix_movements_table',), ('state_table',), ('transfer_ins_table',), ('transfer_outs_table',), ('fact_movements_table',), ('dim_time_table',), ('dim_customers_table',)]


In [10]:
def insert_function(data, query, cursor=None, connection=None):
    """Execute ``query`` once per row of ``data`` and commit once at the end.

    Rows are read with ``DataFrame.itertuples`` rather than ``iterrows``.
    ``iterrows`` packs each row into a single-dtype Series, so an integer
    column that sits next to a float column would reach the database as a
    float. ``itertuples`` keeps every column's own type.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to insert. Values are passed positionally, in column order, as
        the parameters of ``query``.
    query : str
        Parameterised statement handed to ``cursor.execute``.
    cursor : optional
        Object exposing ``execute(query, params)``. Defaults to the
        notebook-level ``cur``.
    connection : optional
        Object exposing ``commit()``. Defaults to the notebook-level ``conn``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    cursor = cur if cursor is None else cursor
    connection = conn if connection is None else connection

    for row in data.itertuples(index=False, name=None):
        cursor.execute(query, list(row))
    # Single commit for the whole frame, as before.
    connection.commit()


# Process table data
In this first part, you'll perform ETL on the tables

### Insert data in accounts_table

In [11]:
# Read the accounts CSV and insert its rows with the statement held in
# accounts_table_insert.
accounts_df = pd.read_csv(filepath_or_buffer="./tables/tables/accounts_table.csv")
insert_function(data=accounts_df, query=accounts_table_insert)

### Insert data in city_table

In [12]:
# Read the city CSV, keep exactly these three columns in this order, and
# insert them with the statement held in city_table_insert.
city_raw = pd.read_csv(filepath_or_buffer="./tables/tables/city_table.csv")
city_df = city_raw.loc[:, ['city_id', 'city', 'state_id']]
insert_function(data=city_df, query=city_table_insert)

### Insert data in customers_table

In [13]:
# Read the customers CSV and insert its rows with the statement held in
# customers_table_insert. No preview is shown here: a mid-cell `.head()` is
# never displayed by Jupyter, and the table carries personal fields that
# should not end up in saved outputs.
customers_df = pd.read_csv("./tables/tables/customers_table.csv")
insert_function(customers_df, customers_table_insert)

### Insert data in country_table

In [14]:
# Read the country CSV, keep the two columns below in this order, and insert
# them with the statement held in country_table_insert.
country_raw = pd.read_csv(filepath_or_buffer="./tables/tables/country_table.csv")
country_df = country_raw.loc[:, ["country_id", "country"]]
insert_function(data=country_df, query=country_table_insert)

### Insert data in d_month_table

In [15]:
# Read the month CSV and insert its rows with the statement held in
# d_month_table_insert.
d_month_table_df = pd.read_csv(filepath_or_buffer="./tables/tables/d_month_table.csv")
insert_function(data=d_month_table_df, query=d_month_table_insert)

### Insert data in d_time_table

In [16]:
# Bulk-load d_time_table through COPY: the frame is re-serialised without
# header or index to a scratch CSV, which is then streamed to the server.
d_time_table_df = pd.read_csv("./tables/tables/d_time_table.csv")
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
d_time_table_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "d_time_table", sep=",")
conn.commit()

### Insert data in d_week_table

In [17]:
# Read the week CSV and insert its rows with the statement held in
# d_week_table_insert.
d_week_table_df = pd.read_csv(filepath_or_buffer="./tables/tables/d_week_table.csv")
insert_function(data=d_week_table_df, query=d_week_table_insert)

### Insert data in d_weekday_table

In [18]:
# Read the weekday CSV and insert its rows with the statement held in
# d_weekday_table_insert.
d_weekday_table_df = pd.read_csv(filepath_or_buffer="./tables/tables/d_weekday_table.csv")
insert_function(data=d_weekday_table_df, query=d_weekday_table_insert)

### Insert data in d_year_table

In [19]:
# Read the year CSV and insert its rows with the statement held in
# d_year_table_insert.
d_year_table_df = pd.read_csv(filepath_or_buffer="./tables/tables/d_year_table.csv")
insert_function(data=d_year_table_df, query=d_year_table_insert)

### Insert data in state_table

In [20]:
# Read the state CSV, keep the three columns below in this order, and insert
# them with the statement held in state_table_insert.
state_raw = pd.read_csv(filepath_or_buffer="./tables/tables/state_table.csv")
state_table_df = state_raw.loc[:, ["state_id", "state", "country_id"]]
insert_function(data=state_table_df, query=state_table_insert)

### Insert data in pix_movements_table

In [21]:
# Bulk-load pix_movements_table through COPY: the frame is re-serialised
# without header or index to a scratch CSV, which is then streamed to the server.
pix_movements_table_df = pd.read_csv("./tables/tables/pix_movements_table.csv")
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
pix_movements_table_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "pix_movements_table", sep=",")
conn.commit()

### Insert data in transfer_ins_table

In [22]:
# Bulk-load transfer_ins_table through COPY: the frame is re-serialised
# without header or index to a scratch CSV, which is then streamed to the server.
transfer_ins_table_df = pd.read_csv("./tables/tables/transfer_ins_table.csv")
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
transfer_ins_table_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "transfer_ins_table", sep=",")
conn.commit()

### Insert data in transfer_outs_table

In [23]:
# Bulk-load transfer_outs_table through COPY: the frame is re-serialised
# without header or index to a scratch CSV, which is then streamed to the server.
transfer_outs_table_df = pd.read_csv("./tables/tables/transfer_outs_table.csv")
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
transfer_outs_table_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "transfer_outs_table", sep=",")
conn.commit()

### Insert data in fact_movements_table

In [24]:

# Look up each movement's account row by account_id. Columns from accounts
# whose names clash with the movement frame receive a per-source suffix.
accounts_by_id = accounts_df.set_index('account_id')
from_pix_df = pix_movements_table_df.join(accounts_by_id, on='account_id', rsuffix="p_")
from_in_df = transfer_ins_table_df.join(accounts_by_id, on='account_id', rsuffix="in_")
from_out_df = transfer_outs_table_df.join(accounts_by_id, on='account_id', rsuffix="out_")

In [25]:
# Column layout shared by the three movement sources once they are aligned.
movement_columns = ['id', 'account_id', 'customer_id', 'in_or_out', 'type',
                    'amount', 'transaction_requested_at', 'transaction_completed_at', 'status']

# PIX rows: normalise the direction labels and tag the source. The replaced
# column is assigned back explicitly, because an inplace replace on
# `frame[col]` acts on a temporary under pandas Copy-on-Write and leaves the
# frame unchanged. The pix_* columns are then renamed to the shared names.
from_pix_df = (
    from_pix_df
    .assign(
        in_or_out=from_pix_df["in_or_out"].replace({"pix_out": "out", "pix_in": "in"}),
        type='pix',
    )
    .loc[:, ['id', 'account_id', 'customer_id', 'in_or_out', 'type',
             'pix_amount', 'pix_requested_at', 'pix_completed_at', 'status']]
    .rename(columns={'pix_amount': 'amount',
                     'pix_requested_at': 'transaction_requested_at',
                     'pix_completed_at': 'transaction_completed_at'})
)

# Transfers: direction and type are constants per source. `assign` returns a
# new frame rather than mutating in place; column order is fixed by the
# selection, so the insert position is irrelevant.
from_in_df = from_in_df.assign(in_or_out='in', type='transfer_in')[movement_columns]
from_out_df = from_out_df.assign(in_or_out='out', type='transfer_out')[movement_columns]

In [26]:
# Stack the three aligned movement frames (PIX first, then transfers in, then
# transfers out). pd.concat is used because DataFrame.append was removed in
# pandas 2.0; each frame's original index labels are kept, as append did.
fact_movements_table = pd.concat([from_pix_df, from_in_df, from_out_df])

In [27]:
# Bulk-load fact_movements_table through COPY from a scratch CSV written
# without header or index.
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
fact_movements_table.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "fact_movements_table", sep=",")
conn.commit()

### Insert data in the dim_time_table



In [28]:
# Resolve each *_id on the time table against its lookup table (year, month,
# week, weekday, in that order), then keep only the columns listed at the end.
dim_time_df = (
    d_time_table_df
    .join(d_year_table_df.set_index('year_id'), on='year_id', rsuffix="y_")
    .join(d_month_table_df.set_index('month_id'), on='month_id', rsuffix="y_")
    .join(d_week_table_df.set_index('week_id'), on='week_id', rsuffix="y_")
    .join(d_weekday_table_df.set_index('weekday_id'), on='weekday_id', rsuffix="y_")
    .loc[:, ['time_id', 'action_timestamp', 'action_week', 'action_month',
             'action_year', 'action_weekday']]
)

In [29]:
# Bulk-load dim_time_table through COPY from a scratch CSV written without
# header or index.
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
dim_time_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "dim_time_table", sep=",")
conn.commit()

### Insert data in the dim_customers_table

In [30]:
# Walk customer -> city -> state -> country through successive lookups, then
# keep only the columns listed at the end.
dim_customer_df = (
    customers_df
    .join(city_df.set_index('city_id'), on='customer_city', rsuffix="city_")
    .join(state_table_df.set_index('state_id'), on='state_id', rsuffix="cost_")
    .join(country_df.set_index('country_id'), on='country_id', rsuffix="cost_")
    .loc[:, ['customer_id', 'first_name', 'last_name', 'city', 'state', 'country', 'cpf']]
)

In [31]:
# Bulk-load dim_customers_table through COPY from a scratch CSV written
# without header or index.
# NOTE(review): fields are split on "," by copy_from, so a name or city that
# itself contains a comma would presumably not load — confirm the data has none.
tmp_df = "./tables/tables/tmp.csv"  # path of the scratch file, not a DataFrame
dim_customer_df.to_csv(tmp_df, index=False, header=False)
# The context manager releases the file handle as soon as COPY has consumed it.
with open(tmp_df, 'r') as f:
    cur.copy_from(f, "dim_customers_table", sep=",")
conn.commit()

### Close connection

In [32]:
# Release the session on the nubank database now that every load has been committed.
conn.close()