In [33]:
import os
from pyspark.sql import SparkSession
from generate_df import table_to_df
from IPython.display import display, HTML
from tabulate import tabulate
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_extract, col, substring, instr, length
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from unidecode import unidecode

In [3]:
# Definir o caminho do Spark corretamente
os.environ['SPARK_HOME'] = '/home/daiane/spark-3.5.1-bin-hadoop3/'

# Definir o caminho do Java corretamente
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.17.0-openjdk-amd64/'

# Iniciar uma sessão Spark
spark = SparkSession.builder \
    .appName("Exemplo Spark") \
    .getOrCreate()

# Testar a sessão Spark
spark

24/06/18 10:59:34 WARN Utils: Your hostname, victor-Lenovo-ideapad-330-15IKB resolves to a loopback address: 127.0.1.1; using 10.0.0.126 instead (on interface wlp2s0)
24/06/18 10:59:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/18 10:59:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Dataframes das tabelas 


In [4]:
df_agencias = table_to_df('finance_raw_data', 'agencias', spark)
df_bancos = table_to_df('finance_raw_data', 'bancos', spark)
df_cooperativas_credito = table_to_df('finance_raw_data', 'cooperativas_credito', spark)
df_sociedades = table_to_df('finance_raw_data', 'sociedades', spark)
df_address_adm_consorcio = table_to_df('finance_raw_data', 'administradoras_consorcio', spark)

#colunas para a dim_endereco: id, cnpj, address_type, registration_date, date_end, street,
#complement, number,  neighborhood, city, postalcode, state

colunas = ["cnpj", "data", "endereco","complemento", "bairro",  "municipio","cep", "uf"]


df_address_adm_consorcio_selected = df_address_adm_consorcio.select(colunas).withColumn("address_type", lit("sede"))
df_bancos_selected = df_bancos.select(colunas).withColumn("address_type", lit("sede"))
df_cooperativas_credito_selected = df_cooperativas_credito.select(colunas).withColumn("address_type", lit("sede"))
df_sociedades_selected = df_sociedades.select(colunas).withColumn("address_type", lit("sede"))
df_agencias_selected = df_agencias.select(colunas).withColumn("address_type", lit("agencia"))

dataframes_selected = [df_address_adm_consorcio_selected, df_bancos_selected, df_cooperativas_credito_selected, df_sociedades_selected, df_agencias_selected]

dataframes_selected = [
    df.withColumn("number", F.lit(None).cast("integer"))
      .withColumn("date_end", F.lit(None).cast("date"))
    for df in dataframes_selected
]

def unionAll(dataframes):
    return reduce(DataFrame.unionAll, dataframes)

df_enderecos = unionAll(dataframes_selected)

#colunas_ordem = ["cnpj",  "address_type", "data", "date_end", "endereco","complemento", "number", "bairro",  "municipio","cep", "uf"]

#df_enderecos = df_enderecos.select(colunas_ordem)

df_enderecos.persist()

# df_enderecos.count()



DataFrame[cnpj: string, data: string, endereco: string, complemento: string, bairro: string, municipio: string, cep: string, uf: string, address_type: string, number: int, date_end: date]

-

### Tratamento do tipo de logradouro


In [5]:
tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]

#amostra_df = df_enderecos.sample(False, 0.5)

#df = amostra_df.filter(df_enderecos["endereco"].contains("PC. "))

df_tipo_corrigido = df_enderecos.withColumn(
    "endereco",
    F.when(
        F.col("endereco").rlike(r"^R\. ") | F.col("endereco").rlike(r"^R "),
        F.regexp_replace(F.col("endereco"), r"^R\.? ", "RUA ")
    ).when(
        F.col("endereco").rlike(r"^AV\. ") | F.col("endereco").rlike(r"^AV "),
        F.regexp_replace(F.col("endereco"), r"^AV\.? ", "AVENIDA ")
    ).when(
        F.col("endereco").rlike(r"^TV\. ") | F.col("endereco").rlike(r"^TV "),
        F.regexp_replace(F.col("endereco"), r"^TV\.? ", "TRAVESSA ")
    ).when(
        F.col("endereco").rlike(r"^PC "),
        F.regexp_replace(F.col("endereco"), r"^PC ", "PRACA ")
    ).otherwise(F.col("endereco"))
)

df_tipo_corrigido.persist()

# df_tipo_corrigido_1 = df_tipo_corrigido.limit(50)

# table = tabulate(df_tipo_corrigido_1.collect(), headers=df_enderecos_corrigido_1.columns, tablefmt='html')
# #table = tabulate(amostra_df.collect(), headers=amostra_df.columns, tablefmt='html')


# display(HTML(table))

DataFrame[cnpj: string, data: string, endereco: string, complemento: string, bairro: string, municipio: string, cep: string, uf: string, address_type: string, number: int, date_end: date]

### Tratamento do número do logradouro

Identificando o número na coluna "endereco" e copiando-o pra coluna "number"


In [6]:
amostra_df = df_tipo_corrigido.sample(False, 0.1)

regex = r"(\d+(?:\.\d+)?)"

tipos_logradouros = ["AREA", "ACESSO", "ACAMPAMENTO", "AEROPORTO", "ALAMEDA", "AVENIDA", "BLOCO",
                     "CANAL", "CONDOMINIO", "DISTRITO", "ESTRADA", "RUA", "VIA", "TRAVESSA"]



df_numero_tratado = amostra_df.withColumn("number",
    F.when(
        (F.col("endereco").like("%BR %")) |
        (F.col("endereco").like("%BR/%")) |
        (F.col("endereco").like("%RODOVIA%")) |
        (F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+")) |
        (F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$")) |
        ((col("endereco").like("%QUADRA%")) & (~col("endereco").like("%LOTE%"))) |
        (col("endereco").rlike(r'\d+-\d+')),
        ""
    ).otherwise(
        F.when(
            (col("endereco").like("%QUADRA%")) & (col("endereco").like("%LOTE%")),
            regexp_extract(col("endereco"), r".LOTE (\d+)", 1)
        ).when(
            (col("endereco").like("%N %")) | (col("endereco").like("%N.")) | (col("endereco").like("%Nº")),
            regexp_extract(col("endereco"), r"N[ .º]?(\d+)", 1)
        ).otherwise(
            regexp_extract(col("endereco"), regex, 0)
        )
    )
).withColumn("endereco",
    F.when(
        (col("number") != "") &
        ~(col("endereco").rlike(r'\d+-\d+')),  # Adicionando esta condição para não substituir se for padrão 'dígito-dígito'
        F.regexp_replace(col("endereco"), col("number"), "EXTRAIRAPARTIRDAQUI")
    ).otherwise(col("endereco"))
)


# df_endereco_numero_tratado = df_enderecos_corrigido.withColumn("number",
#     F.when(
#         F.col("endereco").like("%BR %") |
#         F.col("endereco").like("%BR/%") |
#         F.col("endereco").like("%RODOVIA%") |
#         F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+") |
#         F.col("endereco").rlike(r"^\b(" + "|".join(tipos_logradouros) + r")\b \d+[A-Z]*$") |
#         (F.col("endereco").like("%QUADRA%") & ~F.col("endereco").like("%LOTE%")),
#         ""
#     ).when(
#         F.col("endereco").like("%QUADRA%") & F.col("endereco").like("%LOTE%"),
#         F.regexp_extract(F.col("endereco"), r".*LOTE (\d+)", 1)
#     ).when(
#         F.col("endereco").like("%N %") | F.col("endereco").like("%N.") | F.col("endereco").like("%Nº"),
#         F.regexp_extract(F.col("endereco"), r"N[ .º]*?(\d+)", 1)
#     ).otherwise(F.regexp_extract(F.col("endereco"), regex, 0))
# )

# df_numero_tratado.persist()


# df_numero_tratado_ = df_numero_tratado.filter(df_enderecos["endereco"].contains("EXTRAIRAPARTIRDAQUI-"))

rows = df_numero_tratado.collect()

columns = df_numero_tratado.columns

table = tabulate(rows, headers=columns, tablefmt='html')

display(HTML(table))


# non_empty_count = df_numero_tratado.filter((col("number").isNotNull()) & (col("number") != "")).count()

# empty_count = df_numero_tratado.filter((col("number").isNull()) | (col("number") == "")).count()

# total_count = df_numero_tratado.count()

# non_empty_percentage = (non_empty_count / total_count) * 100
# empty_percentage = (empty_count / total_count) * 100


# print(f"Registros não vazios: {non_empty_count} ({non_empty_percentage:.2f}%)")
# print(f"Registros vazios: {empty_count} ({empty_percentage:.2f}%)")

24/06/18 11:00:57 WARN TaskSetManager: Stage 0 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/18 11:01:00 WARN TaskSetManager: Stage 1 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
24/06/18 11:01:02 WARN TaskSetManager: Stage 2 contains a task of very large size (1496 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

cnpj,data,endereco,complemento,bairro,municipio,cep,uf,address_type,number,date_end
13548938,2024-05-23 19:44:42.617722,"RUA BAEPENDI, EXTRAIRAPARTIRDAQUI",CASA,ONDINA,SALVADOR,40170070,BA,sede,162.0,
8570707,2024-05-23 19:44:42.617722,"AVENIDA PRUDENTE DE MORAIS, EXTRAIRAPARTIRDAQUI",,TIROL,NATAL,59020510,RN,sede,1610.0,
68318773,2024-05-23 19:44:42.617722,"AVENIDA FERNANDO CORREA DA COSTA, EXTRAIRAPARTIRDAQUI",SOBRE LOJA,JARDIM KENNEDY,CUIABÁ,78065000,MT,sede,1944.0,
14723388,2024-05-23 19:44:42.617722,AVENIDA HIGIENOPOLIS EXTRAIRAPARTIRDAQUI,,PARQUE GUANABARA,LONDRINA,86050000,PR,sede,2400.0,
4058605,2024-05-23 19:44:42.617722,AVENIDA FRANCISCO LACERDA DE AGUIAR EXTRAIRAPARTIRDAQUI,2 ANDAR SALA 09,GILBERTO MACHADO,CACHOEIRO DE ITAPEMIRIM,29303300,ES,sede,96.0,
3832228,2024-05-23 19:44:42.617722,RUA 7 DE SETEMBRO 821,,CENTRO,BARBACENA,36200078,MG,sede,,
83553883,2024-05-23 19:44:42.617722,RUA XV DE NOVEMBRO EXTRAIRAPARTIRDAQUI,SALA 202,CENTRO,RIO DO SUL,89160000,SC,sede,45.0,
76515071,2024-05-23 19:44:42.617722,RUA ROCKFELLER EXTRAIRAPARTIRDAQUI (CP 6604,,REBOUCAS,CURITIBA,80230130,PR,sede,1118.0,
91341925,2024-05-23 19:44:42.617722,RUA 28 DE SETEMBRO 1779,SEDE,CENTRO,SANTA CRUZ DO SUL,96810530,RS,sede,,
58113812,2024-05-23 19:44:42.617722,ALAMEDA EUROPA EXTRAIRAPARTIRDAQUI,,TAMBORE,SANTANA DE PARNAÍBA,06543325,SP,sede,150.0,


In [28]:
pattern = "EXTRAIRAPARTIRDAQUI(.*)"

df_pos_numero = df_numero_tratado.withColumn("apos_numero", regexp_extract("endereco", pattern, 1))

# Primeiro, limpar os caracteres especiais do início da string
df_pos_numero_tratado = df_pos_numero.withColumn("apos_numero", F.regexp_replace(F.col("apos_numero"), r'^[^a-zA-Z0-9]+', ''))

# Adicionar " - " ao final se a string não ficar vazia após a limpeza
df_pos_numero_tratado = df_pos_numero_tratado.withColumn(
    "apos_numero",
    F.when(F.col("apos_numero") != '', F.concat(F.col("apos_numero"), F.lit(' - '))).otherwise('')
)

df_pos_numero_tratado = df_pos_numero_tratado.withColumn("complemento",
                                                       F.concat(F.col("apos_numero"), F.col("complemento")))

df_endereco_sem_numero = df_numero_tratado.withColumn("endereco",
                                                      F.regexp_replace(col("endereco"), pattern, ""))    

caractere_especial = r'[^\w\s]$'

df_endereco_sem_especial = df_endereco_sem_numero.withColumn("endereco", 
                        F.regexp_replace(F.trim(col("endereco")), caractere_especial, ""))                                                                       
                                                                                   
df_pos_numero_tratado = df_endereco_sem_especial.drop("apos_numero")

# df_pos_numero_tratado.persist()

df_pos_numero_tratado.createOrReplaceTempView("view_temporariaa")

spark.sql("SELECT endereco, number, complemento FROM view_temporariaa").show(50, truncate=False)

# df = spark.sql(" SELECT apos_numero, COUNT(*) as freq from view_temporariaa group by apos_numero order by freq desc ").show(50)

# df = spark.sql("select * from view_temporariaa ")

# table = tabulate(df_pos_numero_tratado.collect(), headers=df_pos_numero_tratado.columns, tablefmt='html')

# display(HTML(table))

# df_pos_numero_tratado.printSchema()



+----------------------------------------+------+-----------------------------------+
|endereco                                |number|complemento                        |
+----------------------------------------+------+-----------------------------------+
|RUA BAEPENDI                            |162   |CASA                               |
|AVENIDA PRUDENTE DE MORAIS              |1610  |NULL                               |
|AVENIDA FERNANDO CORREA DA COSTA        |1944  |SOBRE LOJA                         |
|AVENIDA HIGIENOPOLIS                    |2400  |NULL                               |
|AVENIDA FRANCISCO LACERDA DE AGUIAR     |96    |2  ANDAR  SALA 09                  |
|RUA 7 DE SETEMBRO 821                   |      |NULL                               |
|RUA XV DE NOVEMBRO                      |45    |SALA 202                           |
|RUA ROCKFELLER                          |1118  |NULL                               |
|RUA 28 DE SETEMBRO  1779                |      |SEDE 

#### Removendo acentos


In [32]:
!pip install unidecode

Defaulting to user installation because normal site-packages is not writeable
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [38]:
def clean_accent_(texto):
    return unidecode(texto) if texto else None

clean_accent = F.udf(clean_accent_, StringType())

df_sem_acento = df_pos_numero_tratado.withColumn("endereco", F.upper(clean_accent(col("endereco"))))\
                                     .withColumn("complemento", F.upper(clean_accent(col("complemento"))))\
                                     .withColumn("uf", F.upper(clean_accent(col("uf"))))\
                                    .withColumn("municipio", F.upper(clean_accent(col("municipio"))))

df_sem_acento.show(20, truncate=False)

[Stage 41:>                                                         (0 + 1) / 1]

+--------+--------------------------+------------------------------------+-------------------------+-----------------+-----------------------+--------+---+------------+------+--------+
|cnpj    |data                      |endereco                            |complemento              |bairro           |municipio              |cep     |uf |address_type|number|date_end|
+--------+--------------------------+------------------------------------+-------------------------+-----------------+-----------------------+--------+---+------------+------+--------+
|13548938|2024-05-23 19:44:42.617722|RUA BAEPENDI                        |CASA                     |ONDINA           |SALVADOR               |40170070|BA |sede        |162   |NULL    |
|08570707|2024-05-23 19:44:42.617722|AVENIDA PRUDENTE DE MORAIS          |NULL                     |TIROL            |NATAL                  |59020510|RN |sede        |1610  |NULL    |
|68318773|2024-05-23 19:44:42.617722|AVENIDA FERNANDO CORREA DA COSTA    |S

                                                                                