In [1]:
import os
from pyspark.sql import SparkSession
from generate_df import table_to_df
from IPython.display import display, HTML
from tabulate import tabulate
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_extract, col, substring, instr, length
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

In [2]:
# Definir o caminho do Spark corretamente
os.environ['SPARK_HOME'] = '/home/daiane/spark-3.5.1-bin-hadoop3/'

# Definir o caminho do Java corretamente
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.17.0-openjdk-amd64/'

# Iniciar uma sessão Spark
spark = SparkSession.builder \
    .appName("Exemplo Spark") \
    .getOrCreate()

# Testar a sessão Spark
spark

24/06/17 20:33:32 WARN Utils: Your hostname, victor-Lenovo-ideapad-330-15IKB resolves to a loopback address: 127.0.1.1; using 192.168.1.74 instead (on interface wlp2s0)
24/06/17 20:33:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/17 20:33:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/17 20:33:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Dataframes das tabelas 


In [3]:
df_agencias = table_to_df('finance_raw_data', 'agencias', spark)
df_bancos = table_to_df('finance_raw_data', 'bancos', spark)
df_cooperativas_credito = table_to_df('finance_raw_data', 'cooperativas_credito', spark)
df_sociedades = table_to_df('finance_raw_data', 'sociedades', spark)
df_address_adm_consorcio = table_to_df('finance_raw_data', 'administradoras_consorcio', spark)

#colunas para a dim_endereco: id, cnpj, address_type, registration_date, date_end, street,
#complement, number,  neighborhood, city, postalcode, state

colunas = ["cnpj", "data", "endereco","complemento", "bairro",  "municipio","cep", "uf"]


df_address_adm_consorcio_selected = df_address_adm_consorcio.select(colunas).withColumn("address_type", lit("sede"))
df_bancos_selected = df_bancos.select(colunas).withColumn("address_type", lit("sede"))
df_cooperativas_credito_selected = df_cooperativas_credito.select(colunas).withColumn("address_type", lit("sede"))
df_sociedades_selected = df_sociedades.select(colunas).withColumn("address_type", lit("sede"))
df_agencias_selected = df_agencias.select(colunas).withColumn("address_type", lit("agencia"))

dataframes_selected = [df_address_adm_consorcio_selected, df_bancos_selected, df_cooperativas_credito_selected, df_sociedades_selected, df_agencias_selected]

dataframes_selected = [
    df.withColumn("number", F.lit(None).cast("integer"))
      .withColumn("date_end", F.lit(None).cast("date"))
    for df in dataframes_selected
]

def unionAll(dataframes):
    return reduce(DataFrame.unionAll, dataframes)

df_enderecos = unionAll(dataframes_selected)

#colunas_ordem = ["cnpj",  "address_type", "data", "date_end", "endereco","complemento", "number", "bairro",  "municipio","cep", "uf"]

#df_enderecos = df_enderecos.select(colunas_ordem)

# df_enderecos.persist()

# df_enderecos.count()



-

### Tratamento do tipo de logradouro


In [4]:
tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]

#amostra_df = df_enderecos.sample(False, 0.5)

#df = amostra_df.filter(df_enderecos["endereco"].contains("PC. "))

df_tipo_corrigido = df_enderecos.withColumn(
    "endereco",
    F.when(
        F.col("endereco").rlike(r"^R\. ") | F.col("endereco").rlike(r"^R "),
        F.regexp_replace(F.col("endereco"), r"^R\.? ", "RUA ")
    ).when(
        F.col("endereco").rlike(r"^AV\. ") | F.col("endereco").rlike(r"^AV "),
        F.regexp_replace(F.col("endereco"), r"^AV\.? ", "AVENIDA ")
    ).when(
        F.col("endereco").rlike(r"^TV\. ") | F.col("endereco").rlike(r"^TV "),
        F.regexp_replace(F.col("endereco"), r"^TV\.? ", "TRAVESSA ")
    ).when(
        F.col("endereco").rlike(r"^PC "),
        F.regexp_replace(F.col("endereco"), r"^PC ", "PRACA ")
    ).otherwise(F.col("endereco"))
)

df_tipo_corrigido.persist()

# df_tipo_corrigido_1 = df_tipo_corrigido.limit(50)

# table = tabulate(df_tipo_corrigido_1.collect(), headers=df_enderecos_corrigido_1.columns, tablefmt='html')
# #table = tabulate(amostra_df.collect(), headers=amostra_df.columns, tablefmt='html')


# display(HTML(table))

DataFrame[cnpj: string, data: string, endereco: string, complemento: string, bairro: string, municipio: string, cep: string, uf: string, address_type: string, number: int, date_end: date]

### Tratamento do número do logradouro

Identificando o número na coluna "endereco" e copiando-o pra coluna "number"


In [8]:
amostra_df = df_tipo_corrigido.sample(False, 0.1)

regex = r"(\d+(?:\.\d+)?)"

tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]



df_numero_tratado = amostra_df.withColumn("number",
    F.when(
        (F.col("endereco").like("%BR %")) |
        (F.col("endereco").like("%BR/%")) |
        (F.col("endereco").like("%RODOVIA%")) |
        (F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+")) |
        (F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$")) |
        ((col("endereco").like("%QUADRA%")) & (~col("endereco").like("%LOTE%"))) |
        (col("endereco").rlike(r'\d+-\d+')),
        ""
    ).otherwise(
        F.when(
            (col("endereco").like("%QUADRA%")) & (col("endereco").like("%LOTE%")),
            regexp_extract(col("endereco"), r".LOTE (\d+)", 1)
        ).when(
            (col("endereco").like("%N %")) | (col("endereco").like("%N.")) | (col("endereco").like("%Nº")),
            regexp_extract(col("endereco"), r"N[ .º]?(\d+)", 1)
        ).otherwise(
            regexp_extract(col("endereco"), regex, 0)
        )
    )
).withColumn("endereco",
    F.when(
        (col("number") != "") &
        ~(col("endereco").rlike(r'\d+-\d+')),  # Adicionando esta condição para não substituir se for padrão 'dígito-dígito'
        F.regexp_replace(col("endereco"), col("number"), "EXTRAIRAPARTIRDAQUI")
    ).otherwise(col("endereco"))
)


# df_endereco_numero_tratado = df_enderecos_corrigido.withColumn("number",
#     F.when(
#         F.col("endereco").like("%BR %") |
#         F.col("endereco").like("%BR/%") |
#         F.col("endereco").like("%RODOVIA%") |
#         F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+") |
#         F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$") |
#         (F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%")),
#         ""
#     ).when(
#         F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"),
#         F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)
#     ).when(
#         F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
#         F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)
#     ).otherwise(F.regexp_extract(F.col("endereco"), regex, 0))
# )

# df_numero_tratado.persist()


# df_numero_tratado_ = df_numero_tratado.filter(df_enderecos["endereco"].contains("EXTRAIRAPARTIRDAQUI-"))

rows = df_numero_tratado.collect()

columns = df_numero_tratado.columns

table = tabulate(rows, headers=columns, tablefmt='html')

display(HTML(table))


# non_empty_count = df_numero_tratado.filter((col("number").isNotNull()) & (col("number") != "")).count()

# empty_count = df_numero_tratado.filter((col("number").isNull()) | (col("number") == "")).count()

# total_count = df_numero_tratado.count()

# non_empty_percentage = (non_empty_count / total_count) * 100
# empty_percentage = (empty_count / total_count) * 100


# print(f"Registros não vazios: {non_empty_count} ({non_empty_percentage:.2f}%)")
# print(f"Registros vazios: {empty_count} ({empty_percentage:.2f}%)")

24/06/17 20:50:06 WARN TaskSetManager: Stage 0 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/17 20:50:10 WARN TaskSetManager: Stage 1 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

cnpj,data,endereco,complemento,bairro,municipio,cep,uf,address_type,number,date_end
85322204,2024-05-23 19:44:42.617722,RUA JOAQUIM VAZ EXTRAIRAPARTIRDAQUI,,PRAIA COMPRIDA,SÃO JOSÉ,88102650,SC,sede,1615.0,
5551841,2024-05-23 19:44:42.617722,RUA SENADOR MANOEL BARATA N 909 3 ANDAR,ENTRE AV PRESIDENTE VARGAS E FREI GIL,CAMPINA,BELÉM,66010147,PA,sede,,
91338228,2024-05-23 19:44:42.617722,"AVENIDA DORIVAL CANDIDO LUZ DE OLIVEIRA, EXTRAIRAPARTIRDAQUI",,CENTRO,GRAVATAÍ,94030001,RS,sede,501.0,
3832228,2024-05-23 19:44:42.617722,RUA 7 DE SETEMBRO 821,,CENTRO,BARBACENA,36200078,MG,sede,,
6043050,2024-05-23 19:44:42.617722,SAUN QUADRA 5 BLOCO B TORRE SUL 1 ANDAR,ED BANCO DO BRASIL,ASA NORTE,BRASÍLIA,70040250,DF,sede,,
95783262,2024-05-23 19:44:42.617722,RUA RUI BARBOSA EXTRAIRAPARTIRDAQUI 1 ANDAR,,CENTRO,TURVO,88930000,SC,sede,1360.0,
91107581,2024-05-23 19:44:42.617722,"RST EXTRAIRAPARTIRDAQUI, KM 121, N- 650",5° ANDAR,SANTA CATARINA,FARROUPILHA,95180000,RS,sede,453.0,
50533876,2024-05-23 19:44:42.617722,AVENIDA PAULISTA,17- ANDAR,BELA VISTA,SÃO PAULO,01310100,SP,sede,,
87636643,2024-05-23 19:44:42.617722,AVENIDA PEDRO PINTO DE SOUZA EXTRAIRAPARTIRDAQUI,CENTRO,CENTRO,ERECHIM,99700096,RS,sede,299.0,
42421776,2024-05-23 19:44:42.617722,PRACA ALFREDO EGYDIO DE SOUZA ARANHA EXTRAIRAPARTIRDAQUI,TORRE OLAVO SETUBAL 7 ANDAR PARTE A,PARQUE JABAQUARA,SÃO PAULO,04344902,SP,sede,100.0,


In [14]:
pattern = "EXTRAIRAPARTIRDAQUI(.*)"

df_apos_numero = df_numero_tratado.withColumn("apos_numero", regexp_extract("endereco", pattern, 1))


df_apos_numero.createOrReplaceTempView("view_temporariaa")

# spark.sql("SELECT apos_numero FROM view_temporariaa").show(truncate=False)
df = spark.sql(" SELECT apos_numero, COUNT(*) as freq from view_temporariaa group by apos_numero order by freq desc ").show()

# df = spark.sql("select endereco, apos_numero from view_temporariaa where apos_numero = '-EXTRAIRAPARTIRDAQUI'").show()

table = tabulate(df.collect(), headers=df.columns, tablefmt='html')

display(HTML(table))


# spark.sql(" SELECT apos_numero, COUNT(*) as freq from view_temporariaa group by apos_numero order by freq desc ").show()


24/06/17 21:00:39 WARN TaskSetManager: Stage 9 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+----+
|         apos_numero|freq|
+--------------------+----+
|                    |6187|
|                   A|  16|
|                   ,|  12|
|                   -|  11|
|         , 26. ANDAR|  10|
|                  -A|   9|
|   3   5  E 6  AN...|   8|
|          , 2° ANDAR|   8|
|             - PARTE|   7|
|                   N|   7|
|        , QUADRA 147|   7|
|                   E|   7|
|            1  ANDAR|   6|
|               PARTE|   6|
|            5  ANDAR|   6|
|                   .|   6|
|            - CENTRO|   6|
|          , SALA 801|   6|
|         , 1- ANDAR,|   6|
|   - 4 ANDAR - CJ 41|   6|
+--------------------+----+
only showing top 20 rows



AttributeError: 'NoneType' object has no attribute 'collect'

In [26]:
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from tabulate import tabulate
from IPython.display import display, HTML


enderecos = [
    {"endereco": "RUA C -139, QUADRA 337, LOTE 03", "number": },
    {"endereco": "RODOVIA PRESIDENTE DUTRA N. 2660 (PARTE)"},
    {"endereco": "RUA LIBERO BADARÓ, 158"},
    {"endereco": "AVENIDA CAPITAO ACACIO, 363 - PARTE"},
    {"endereco": "AV BENJAMIN CONSTANT 1058 5 ANDAR"},
    {"endereco": "AVENIDA MATO GROSSO 690N"},
    {"endereco": "RODOVIA PE 35 KM 3"},
    {"endereco": "AV. XV DE NOVEMBRO, 500 - SALA 02 - 1. ANDAR"},
    {"endereco": "RODOVIA MG-22, KM 105"},
    {"endereco": "PRAÇA CAPITÃO VICENTE DIAS, 70 - SOBRE LOJA"},
    {"endereco": "SCS QUADRA 4 BLOCO A N 230 4 ANDAR"},
    {"endereco": "RUA RUI BARBOSA 1360 1 ANDAR"},
    {"endereco": "AVENIDA JORNALISTA ROBERTO MARINHO 85 3 ANDAR"},
    {"endereco": "AV. ANHANGUERA,Nº3.559, 1º ANDAR"},
    {"endereco": "RUA HORTÊNCIA HELENA DE AMORIM BRITO, NO 13.008"},
    {"endereco": "RUA SENADOR MANOEL BARATA N 909 3 ANDAR"},
    {"endereco": "RUA B N°20 89"}
]

rows = [Row(**endereco) for endereco in enderecos]

df_enderecos_teste = spark.createDataFrame(rows)

regex = r"(\d+(?:\.\d+)?)"



amostra_teste = df_enderecos_teste.withColumn("number",
    F.when(F.col("endereco").like("%BR %"), "") \
    .when(F.col("endereco").like("%BR/%"), "") \
    .when(F.col("endereco").like("%RODOVIA%"), "") \
    .when(F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%"),
          "") \
    .when(F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"), 
          F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)) \
    .when(F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
          F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)) \
    .otherwise(F.regexp_extract(F.col("endereco"), regex, 0))).limit(60)

#quadra = amostra.filter(df_enderecos["endereco"].contains("BR"))

rows_ = amostra_teste.collect()

columns_ = amostra_teste.columns

table_ = tabulate(rows_, headers=columns_, tablefmt='html')

display(HTML(table_))

endereco,number
"RUA C -139, QUADRA 337, LOTE 03",3.0
RODOVIA PRESIDENTE DUTRA N. 2660 (PARTE),
"RUA LIBERO BADARÓ, 158",158.0
"AVENIDA CAPITAO ACACIO, 363 - PARTE",363.0
AV BENJAMIN CONSTANT 1058 5 ANDAR,
AVENIDA MATO GROSSO 690N,690.0
RODOVIA PE 35 KM 3,
"AV. XV DE NOVEMBRO, 500 - SALA 02 - 1. ANDAR",500.0
"RODOVIA MG-22, KM 105",
"PRAÇA CAPITÃO VICENTE DIAS, 70 - SOBRE LOJA",70.0


In [None]:
df_endereco_numero_tratado = df_enderecos_corrigido.withColumn("number",
    F.when(
        F.col("endereco").like("%BR %") |
        F.col("endereco").like("%BR/%") |
        F.col("endereco").like("%RODOVIA%") |
        F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+") |
        F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$") |
        (F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%")),
        ""
    ).when(
        F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"),
        F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)
    ).when(
        F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
        F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)
    ).otherwise(F.regexp_extract(F.col("endereco"), regex, 0))
).withColumn("endereco", 
    F.when(F.col("number") != "", 
           F.regexp_replace(F.col("endereco"), F.col("number"), "EXTRAIRAPARTIRDAQUI")
    ).otherwise(F.col("endereco"))
)