# Iniciando o projeto com as importação das dependencias


### FASE 1 - CONFIGURAÇÃO


In [1]:
# imports do projeto

import pandas as pd
import gdown as gd
import os
from pathlib import Path
from tabulate import tabulate

### CONSTANTES

In [2]:
# constanttes
SOURCE_FOLDER = 'source'
DF_FILE_NAME = 'df_fraud_credit'
PARQUET = 'parquet'
CSV = 'csv'

### FUNÇÕES


In [3]:
# função que criar os relatórios de Qualidade de Dados

def montar_estatisticas_dataFrame(originial, tratado):
    total_registros = len(originial)
    total_tratado = len(tratado)
    dados_invalidos = total_registros - total_tratado
    dados_validos = total_registros - dados_invalidos
    percentual_erro = dados_invalidos / total_registros

    # Exibir as métricas
    print("\n📌 **Data Quality Report**\n")
    print(f"📊 Total de registros importados: {total_registros}")
    print(f"✅ Registros válidos: {dados_validos}")
    print(f"❌ Registros com erro: {dados_invalidos}")
    print(f"% Registros com erro: {percentual_erro:.2f}%")

In [4]:
# função para criação de pasta

def criar_pasta(folderName):
    os.makedirs(folderName, exist_ok=True)

In [5]:
# função para montar a string do caminho do arquivo
def makeFolderName(folder, fileName, fileFormat):
    return "./{}/{}.{}".format(folder, fileName, fileFormat)

In [6]:
# função que checa se o arquivo já existe na pasta antes de fazer o download
def fileExistis():
    csvFile = makeFolderName(SOURCE_FOLDER, DF_FILE_NAME, CSV)
    parquetFile = makeFolderName(SOURCE_FOLDER, DF_FILE_NAME, PARQUET)

    file_path = Path(csvFile)
    if not file_path.exists():
        print("Start downloading fileName: {}.csv".format(DF_FILE_NAME))
        url = "https://drive.google.com/uc?export=download&id=1Vumu8jo3P3umuUtBZb6mn7YoVIo4X0ON"
        gd.download(url, csvFile, quiet=False)
        print("Finished download fileName: {}".format(DF_FILE_NAME))
        # conversão do dataFrame do formato csv para parquet para reduzir o tamanho do arquivo
        originalData = pd.read_csv(csvFile)
        print("Start parquetFile converter: {}".format(DF_FILE_NAME))
        originalData.to_parquet(parquetFile, engine="pyarrow", index=False)
        print("Finished parquetFile converter: {}".format(DF_FILE_NAME))
    else:
        # conversão do dataFrame do formato csv para parquet para reduzir o tamanho do arquivo
        print("Start load file {}.csv".format(DF_FILE_NAME))
        originalData = pd.read_csv(csvFile)
        print("Finished load file {}.csv".format(DF_FILE_NAME))
        print("Start parquetFile converter: {}.parquet".format(DF_FILE_NAME))
        originalData.to_parquet(parquetFile, engine="pyarrow", index=False)
        print("Finished parquetFile converter: {}.parquet".format(DF_FILE_NAME))

### FASE 2 - INGESTÃO


In [7]:
# criar a pasta pouso para receber os dados
criar_pasta(SOURCE_FOLDER)

In [8]:
# download dos dados da origem
fileExistis()


Start downloading fileName: df_fraud_credit.csv


Downloading...
From (original): https://drive.google.com/uc?export=download&id=1Vumu8jo3P3umuUtBZb6mn7YoVIo4X0ON
From (redirected): https://drive.google.com/uc?export=download&id=1Vumu8jo3P3umuUtBZb6mn7YoVIo4X0ON&confirm=t&uuid=f0fb2e73-b039-4224-87de-7dce421eb63d
To: c:\PROJETOS\NOTEBOOK\pipeline\source\df_fraud_credit.csv
100%|██████████| 1.55G/1.55G [03:34<00:00, 7.23MB/s]


Finished download fileName: df_fraud_credit
Start parquetFile converter: df_fraud_credit
Finished parquetFile converter: df_fraud_credit


In [9]:
# carregando o dataFrame a partir do arquivo no formato parquet
parquetFile = makeFolderName(SOURCE_FOLDER, DF_FILE_NAME, PARQUET)
initialData = pd.read_parquet(parquetFile, engine="pyarrow")

### FASE 3 - TRATAMENTO

In [10]:
# realizar filtro das ocorrências onde as regiões aparencem != 0
zeroFilter = initialData.location_region != "0"
filterData = initialData.loc[zeroFilter]

# realizar filtro das ocorrências de risk_score != none
noneRiskScoreFilter = filterData.risk_score != "none"
filterData = filterData.loc[noneRiskScoreFilter]

# realizar filtro das ocorrencias com amount != none
noneAmountFilter = filterData.amount != "none"
filterData = filterData.loc[noneAmountFilter]

### FASE 4 - CONVERSÃO

In [13]:
# transformação da coluna timestamp em dateTime
filterData["timestamp"] = pd.to_datetime(filterData["timestamp"], unit="s")

In [14]:
# transformar risk_score em numerico
filterData["risk_score"] = pd.to_numeric(filterData["risk_score"].round(3))

In [15]:
# converter todos os dados da coluna amount para o formato Float64
filterData["amount"] = filterData["amount"].values.astype("float64")

### FASE 5 - AGREGAÇÃO


In [16]:
riskScoreTable = (
    filterData.groupby("location_region")["risk_score"]
    .mean()
    .round(3)
    .reset_index()
    .sort_values(by="risk_score", ascending=False)
)

In [17]:
salesData = filterData[filterData["transaction_type"] == "sale"]

In [18]:
recentTransactions = salesData.sort_values(
    by=["receiving_address", "timestamp"], ascending=[True, False]
)
recentTransactions = recentTransactions.drop_duplicates(subset=["receiving_address"], keep="first")


In [19]:
topRecentTransactions = recentTransactions.nlargest(3, "amount")[
    ["receiving_address", "amount", "timestamp"]
]

### FASE 6 - REPORTS

In [20]:
print("\n📌 **Region by Average Risk **\n")
print(tabulate(riskScoreTable, headers="keys", tablefmt="pretty", showindex=False))


📌 **Region by Average Risk **

+-----------------+------------+
| location_region | risk_score |
+-----------------+------------+
|  North America  |   45.155   |
|  South America  |   45.139   |
|      Asia       |   44.995   |
|     Africa      |   44.902   |
|     Europe      |   44.599   |
+-----------------+------------+


In [21]:
print("\n📌 **Top 3 'Receiving Address' with the highest 'Amount'**\n")
print(tabulate(topRecentTransactions, headers="keys", tablefmt="pretty", showindex=False))


📌 **Top 3 'Receiving Address' with the highest 'Amount'**

+--------------------------------------------+---------+---------------------+
|             receiving_address              | amount  |      timestamp      |
+--------------------------------------------+---------+---------------------+
| 0x841342e50c508ec4ffdef9b5208719c1dbed7968 | 76568.0 | 2024-01-02 03:53:01 |
| 0xe8aacdea4f2d7658e711de611bad8e3b5d6b2c7b | 76563.0 | 2024-01-01 02:49:56 |
| 0x231dd8e2959e878a59a26ebdbf6f7d122403f350 | 76559.0 | 2024-01-02 06:31:09 |
+--------------------------------------------+---------+---------------------+


In [23]:
montar_estatisticas_dataFrame(initialData, filterData)


📌 **Data Quality Report**

📊 Total de registros importados: 9291894
✅ Registros válidos: 9142488
❌ Registros com erro: 149406
% Registros com erro: 0.02%
