In [19]:
import os
from pyspark.sql import SparkSession
from generate_df import table_to_df
from IPython.display import display, HTML
from tabulate import tabulate
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_extract, col, substring, instr, length
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

In [2]:
# Definir o caminho do Spark corretamente
os.environ['SPARK_HOME'] = '/home/daiane/spark-3.5.1-bin-hadoop3/'

# Definir o caminho do Java corretamente
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.17.0-openjdk-amd64/'

# Iniciar uma sessão Spark
spark = SparkSession.builder \
    .appName("Exemplo Spark") \
    .getOrCreate()

# Testar a sessão Spark
spark

24/06/16 15:51:30 WARN Utils: Your hostname, victor-Lenovo-ideapad-330-15IKB resolves to a loopback address: 127.0.1.1; using 192.168.0.125 instead (on interface wlp2s0)
24/06/16 15:51:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 15:51:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Dataframes das tabelas 


In [3]:
df_agencias = table_to_df('finance_raw_data', 'agencias', spark)
df_bancos = table_to_df('finance_raw_data', 'bancos', spark)
df_cooperativas_credito = table_to_df('finance_raw_data', 'cooperativas_credito', spark)
df_sociedades = table_to_df('finance_raw_data', 'sociedades', spark)
df_address_adm_consorcio = table_to_df('finance_raw_data', 'administradoras_consorcio', spark)

#colunas para a dim_endereco: id, cnpj, address_type, registration_date, date_end, street,
#complement, number,  neighborhood, city, postalcode, state

colunas = ["cnpj", "data", "endereco","complemento", "bairro",  "municipio","cep", "uf"]


df_address_adm_consorcio_selected = df_address_adm_consorcio.select(colunas).withColumn("address_type", lit("sede"))
df_bancos_selected = df_bancos.select(colunas).withColumn("address_type", lit("sede"))
df_cooperativas_credito_selected = df_cooperativas_credito.select(colunas).withColumn("address_type", lit("sede"))
df_sociedades_selected = df_sociedades.select(colunas).withColumn("address_type", lit("sede"))
df_agencias_selected = df_agencias.select(colunas).withColumn("address_type", lit("agencia"))

dataframes_selected = [df_address_adm_consorcio_selected, df_bancos_selected, df_cooperativas_credito_selected, df_sociedades_selected, df_agencias_selected]

dataframes_selected = [
    df.withColumn("number", F.lit(None).cast("integer"))
      .withColumn("date_end", F.lit(None).cast("date"))
    for df in dataframes_selected
]


def unionAll(dataframes):
    return reduce(DataFrame.unionAll, dataframes)

df_enderecos = unionAll(dataframes_selected)

#colunas_ordem = ["cnpj",  "address_type", "data", "date_end", "endereco","complemento", "number", "bairro",  "municipio","cep", "uf"]

#df_enderecos = df_enderecos.select(colunas_ordem)

df_enderecos.persist()

df_enderecos.count()



24/06/16 15:52:27 WARN TaskSetManager: Stage 0 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/16 15:52:31 WARN TaskSetManager: Stage 1 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

67453

In [4]:
df_address_adm_consorcio_sample = df_address_adm_consorcio.createOrReplaceTempView("df_address_adm_consorcio_sample")

result_df = spark.sql("SELECT * FROM df_address_adm_consorcio_sample LIMIT 10")

rows = result_df.collect()

columns = result_df.columns

table = tabulate(rows, headers=columns, tablefmt='html')

#display(HTML(table))

### Tratamento do tipo de logradouro


In [5]:
tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]

#amostra_df = df_enderecos.sample(False, 0.5)

#df = amostra_df.filter(df_enderecos["endereco"].contains("PC. "))

df_enderecos_corrigido = df_enderecos.withColumn(
    "endereco",
    F.when(
        F.col("endereco").rlike(r"^R\. ") | F.col("endereco").rlike(r"^R "),
        F.regexp_replace(F.col("endereco"), r"^R\.? ", "RUA ")
    ).when(
        F.col("endereco").rlike(r"^AV\. ") | F.col("endereco").rlike(r"^AV "),
        F.regexp_replace(F.col("endereco"), r"^AV\.? ", "AVENIDA ")
    ).when(
        F.col("endereco").rlike(r"^TV\. ") | F.col("endereco").rlike(r"^TV "),
        F.regexp_replace(F.col("endereco"), r"^TV\.? ", "TRAVESSA ")
    ).when(
        F.col("endereco").rlike(r"^PC "),
        F.regexp_replace(F.col("endereco"), r"^PC ", "PRACA ")
    ).otherwise(F.col("endereco"))
)

df_enderecos_corrigido.persist()

df_enderecos_corrigido_1 = df_enderecos_corrigido.limit(50)

table = tabulate(df_enderecos_corrigido_1.collect(), headers=df_enderecos_corrigido_1.columns, tablefmt='html')
#table = tabulate(amostra_df.collect(), headers=amostra_df.columns, tablefmt='html')


display(HTML(table))

24/06/16 15:55:39 WARN CacheManager: Asked to cache already cached data.


cnpj,data,endereco,complemento,bairro,municipio,cep,uf,address_type,number,date_end
92459213,2024-05-23 19:44:42.617722,BR/158 KM 60 N. 1.500,,CERRITO,SANTA MARIA,97060090,RS,sede,,
45441789,2024-05-23 19:44:42.617722,AVENIDA SENADOR ROBERTO SIMONSEN 304,,SANTO ANTONIO,SÃO CAETANO DO SUL,9530401,SP,sede,,
9111444,2024-05-23 19:44:42.617722,"RUA HORTÊNCIA HELENA DE AMORIM BRITO, NO 13.008",SALA 07B SALA 08B SALA 09B SALA 10B,JARDIM AMERICA,CABEDELO,58102660,PB,sede,,
13170428,2024-05-23 19:44:42.617722,"AVENIDA JOÃO RIBEIRO, 652",,SANTO ANTONIO,ARACAJU,49065000,SE,sede,,
6046109,2024-05-23 19:44:42.617722,"RUA RIO GRANDE DO SUL, 448",,LOURDES,GOVERNADOR VALADARES,35030580,MG,sede,,
33868654,2024-05-23 19:44:42.617722,AVENIDA ALMIRANTE BARROSO 63,17 ANDAR,CENTRO,RIO DE JANEIRO,20031913,RJ,sede,,
6044551,2024-05-23 19:44:42.617722,RUA PEDRELINA DE MACEDO E SILVA 100,SALA 03,CENTRO,DOURADINA,87485000,PR,sede,,
33350620,2024-05-23 19:44:42.617722,SBS-QRA 2-LT.19-BL.A-SL.701/708-ED.CASA DE S.PAULO,,ASA SUL,BRASÍLIA,70078900,DF,sede,,
13548938,2024-05-23 19:44:42.617722,"RUA BAEPENDI, 162",CASA,ONDINA,SALVADOR,40170070,BA,sede,,
8570707,2024-05-23 19:44:42.617722,"AVENIDA PRUDENTE DE MORAIS, 1610",,TIROL,NATAL,59020510,RN,sede,,


### Tratamento do número do logradouro

Identificando o número na coluna "endereco" e copiando-o pra coluna "number"


In [8]:
amostra_df = df_enderecos_corrigido.sample(False, 0.3)

regex = r"(\d+(?:\.\d+)?)"

tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]

df_endereco_numero_tratado = df_enderecos_corrigido.withColumn("number",
    F.when(
        F.col("endereco").like("%BR %") |
        F.col("endereco").like("%BR/%") |
        F.col("endereco").like("%RODOVIA%") |
        F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+") |
        F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$") |
        (F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%")),
        ""
    ).when(
        F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"),
        F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)
    ).when(
        F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
        F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)
    ).otherwise(F.regexp_extract(F.col("endereco"), regex, 0))
)

df_endereco_numero_tratado.persist()

#quadra = amostra.filter(df_enderecos["endereco"].contains("BR"))

#rows = df_endereco_numero_tratado.collect()

#columns = df_endereco_numero_tratado.columns

#table = tabulate(rows, headers=columns, tablefmt='html')

#display(HTML(table))

# Contar registros não vazios (não nulos e não vazios)
non_empty_count = df_endereco_numero_tratado.filter((col("number").isNotNull()) & (col("number") != "")).count()

# Contar registros vazios (nulos ou vazios)
empty_count = df_endereco_numero_tratado.filter((col("number").isNull()) | (col("number") == "")).count()

total_count = df_endereco_numero_tratado.count()

non_empty_percentage = (non_empty_count / total_count) * 100
empty_percentage = (empty_count / total_count) * 100


print(f"Registros não vazios: {non_empty_count} ({non_empty_percentage:.2f}%)")
print(f"Registros vazios: {empty_count} ({empty_percentage:.2f}%)")

24/06/16 16:36:21 WARN TaskSetManager: Stage 27 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/16 16:36:23 WARN TaskSetManager: Stage 28 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/16 16:36:23 WARN TaskSetManager: Stage 31 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/16 16:36:24 WARN TaskSetManager: Stage 34 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.


Registros não vazios: 47598 (70.56%)
Registros vazios: 19855 (29.44%)


In [30]:
from pyspark.sql.functions import col, expr, length, substring


df = df_endereco_numero_tratado.withColumn("apos_numero", expr("substring(endereco, locate(number, endereco) + length(number) + 1, length(endereco))"))

# Cria uma view temporária contendo apenas a coluna temporária "apos_numero"
df.createOrReplaceTempView("view_temporariaa")

df_col_temp.persist()

# Exibe o DataFrame resultante
#df_col_temp.show(truncate=False)

# Exibe a view temporária
spark.sql("SELECT apos_numero FROM view_temporariaa").show(truncate=False)


spark.sql(" SELECT apos_numero, COUNT(*) as freq from view_temporariaa group by apos_numero order by freq desc ").show()


24/06/16 18:24:19 WARN CacheManager: Asked to cache already cached data.


+-----------------------------------------+
|apos_numero                              |
+-----------------------------------------+
|R/158 KM 60  N. 1.500                    |
|                                         |
|                                         |
|                                         |
|                                         |
|                                         |
|                                         |
|LT.19-BL.A-SL.701/708-ED.CASA DE S.PAULO |
|                                         |
|                                         |
|                                         |
| 6.- ANDAR, SALA 602                     |
|                                         |
| 1º ANDAR                                |
|                                         |
|  3  ANDAR                               |
|                                         |
|UCLEO CIDADE DE DEUS  S N                |
|ODOVIA FERNAO DIAS, KM 2, BR 381, N. 2111|
|                               

24/06/16 18:24:20 WARN TaskSetManager: Stage 50 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.

+--------------------+-----+
|         apos_numero| freq|
+--------------------+-----+
|                    |40526|
|VENIDA BRIGADEIRO...|  541|
|     VENIDA PAULISTA|  325|
|VENIDA PRESIDENTE...|  269|
|                   A|  255|
|VENIDA DAS NAÇÕES...|  145|
|    RAIA DE BOTAFOGO|  145|
|VENIDA TANCREDO N...|  139|
|                   -|  121|
| UA JOAQUIM FLORIANO|  120|
| VENIDA DOM PEDRO II|  111|
|   UA XV DE NOVEMBRO|  109|
|       LAMEDA SANTOS|  101|
|UA LEOPOLDO COUTO...|   93|
|VENIDA PRESIDENTE...|   91|
|    LAMEDA RIO NEGRO|   77|
|   VENIDA DA SAUDADE|   76|
|    UA DA ASSEMBLEIA|   76|
|VENIDA BRIGADEIRO...|   73|
|  UA ALVES GUIMARÃES|   71|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [26]:
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from tabulate import tabulate
from IPython.display import display, HTML


enderecos = [
    {"endereco": "RUA C -139, QUADRA 337, LOTE 03"},
    {"endereco": "RODOVIA PRESIDENTE DUTRA N. 2660 (PARTE)"},
    {"endereco": "RUA LIBERO BADARÓ, 158"},
    {"endereco": "AVENIDA CAPITAO ACACIO, 363 - PARTE"},
    {"endereco": "AV BENJAMIN CONSTANT 1058 5 ANDAR"},
    {"endereco": "AVENIDA MATO GROSSO 690N"},
    {"endereco": "RODOVIA PE 35 KM 3"},
    {"endereco": "AV. XV DE NOVEMBRO, 500 - SALA 02 - 1. ANDAR"},
    {"endereco": "RODOVIA MG-22, KM 105"},
    {"endereco": "PRAÇA CAPITÃO VICENTE DIAS, 70 - SOBRE LOJA"},
    {"endereco": "SCS QUADRA 4 BLOCO A N 230 4 ANDAR"},
    {"endereco": "RUA RUI BARBOSA 1360 1 ANDAR"},
    {"endereco": "AVENIDA JORNALISTA ROBERTO MARINHO 85 3 ANDAR"},
    {"endereco": "AV. ANHANGUERA,Nº3.559, 1º ANDAR"},
    {"endereco": "RUA HORTÊNCIA HELENA DE AMORIM BRITO, NO 13.008"},
    {"endereco": "RUA SENADOR MANOEL BARATA N 909 3 ANDAR"},
    {"endereco": "RUA B N°20 89"}
]

# Convertendo a lista de dicionários em uma lista de linhas do Spark
rows = [Row(**endereco) for endereco in enderecos]

# Criando o DataFrame Spark
df_enderecos_teste = spark.createDataFrame(rows)

regex = r"(\d+(?:\.\d+)?)"



amostra_teste = df_enderecos_teste.withColumn("number",
    F.when(F.col("endereco").like("%BR %"), "") \
    .when(F.col("endereco").like("%BR/%"), "") \
    .when(F.col("endereco").like("%RODOVIA%"), "") \
    .when(F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%"),
          "") \
    .when(F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"), 
          F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)) \
    .when(F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
          F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)) \
    .otherwise(F.regexp_extract(F.col("endereco"), regex, 0))).limit(60)

#quadra = amostra.filter(df_enderecos["endereco"].contains("BR"))

rows_ = amostra_teste.collect()

columns_ = amostra_teste.columns

table_ = tabulate(rows_, headers=columns_, tablefmt='html')

display(HTML(table_))

endereco,number
"RUA C -139, QUADRA 337, LOTE 03",3.0
RODOVIA PRESIDENTE DUTRA N. 2660 (PARTE),
"RUA LIBERO BADARÓ, 158",158.0
"AVENIDA CAPITAO ACACIO, 363 - PARTE",363.0
AV BENJAMIN CONSTANT 1058 5 ANDAR,
AVENIDA MATO GROSSO 690N,690.0
RODOVIA PE 35 KM 3,
"AV. XV DE NOVEMBRO, 500 - SALA 02 - 1. ANDAR",500.0
"RODOVIA MG-22, KM 105",
"PRAÇA CAPITÃO VICENTE DIAS, 70 - SOBRE LOJA",70.0
