In [1]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# File and directory names used by this notebook
parquet_dir_name = 'output/embeddings_nome_30m'        # Parquet output directory
csv_pj = 'dataset/empresas_estabelecimentos_30m.csv'   # input CSV

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
# NOTE(review): the warning logged under this cell says an existing session was
# reused and that only runtime SQL configurations take effect in that case, so
# the spark.driver.memory setting below may not have been applied — confirm.
spark = (
    SparkSession.builder
    .appName("gera_embedding_nome")
    .config("spark.driver.memory", "128g")
    .getOrCreate()
)

23/12/06 10:15:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Schema for the "empresa" JSON document: every field is a nullable string.
json_schema_em = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "porteEmpresa",
        "capitalSocial",
        "cpfResponsavel",
        "nomeEmpresarial",
        "naturezaJuridica",
        "qualificacaoResponsavel",
    )
])

# Schema for the "estabelecimento" JSON document: every field is a nullable string.
json_schema_es = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uf",
        "cep",
        "ddd1",
        "ddd2",
        "pais",
        "email",
        "bairro",
        "numero",
        "municipio",
        "telefone1",
        "telefone2",
        "cnaeFiscal",
        "logradouro",
        "complemento",
        "dataCadastro",
        "nomeFantasia",
        "cidadeExterior",
        "tipoLogradouro",
        "cnaesSecundarias",
        "situacaoEspecial",
        "situacaoCadastral",
        "dataSituacaoEspecial",
        "dataSituacaoCadastral",
        "motivoSituacaoCadastral",
        "identificadorMatrizFilial",
    )
])

# Schema used to read the CSV: four nullable string columns. The two
# te_dados_* columns hold JSON text that is parsed with from_json afterwards.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "nu_cnpj_raiz",
        "te_dados_em",
        "id_estabelecimento",
        "te_dados_es",
    )
])

# Read the CSV with the explicit schema defined above. The file has a header
# row, is comma-delimited, may contain multi-line records, and uses a double
# quote as the escape character.
csv_reader = (
    spark.read
    .option("escape", "\"")
    .option("multiline", "true")
    .option("delimiter", ",")
)
pj = csv_reader.csv(csv_pj, header=True, schema=schema)

# Parse both JSON text columns into structs, keep only the rows whose
# id_estabelecimento starts with '0001', and expose nu_cnpj_raiz as "id".
# NOTE(review): '0001' presumably identifies the head-office establishment — confirm.
pj = (
    pj.withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em))
    .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es))
    # Spark's substring() position is 1-based, so the first four characters are
    # (pos=1, len=4). The previous pos=0 selected the same characters but
    # relied on undocumented leniency.
    .filter(substring(col("id_estabelecimento"), 1, 4) == '0001')
    .withColumnRenamed("nu_cnpj_raiz", "id")
)

# Sort by id (ascending) and keep a single row per id.
# NOTE(review): the sort key is the same as the dedup key, so the ordering does
# not determine WHICH of several rows sharing an id is kept — confirm that any
# of them is acceptable, or add a tie-breaking column.
ordered_by_id = pj.orderBy("id", ascending=True)
pj = ordered_by_id.dropDuplicates(['id'])

In [5]:
# Build a two-column DataFrame (id, nomeFantasia) from the parsed
# "estabelecimento" struct, dropping rows whose nomeFantasia is the empty string.
nome_fantasia = col('te_dados_es.nomeFantasia')
nomes_pj = (
    pj.filter(nome_fantasia != '')
    .select(col('id'), nome_fantasia.alias('nomeFantasia'))
)

#### Tratamento Nome Fantasia

In [6]:
from sentence_transformers import SentenceTransformer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

# Helper that turns a text into an embedding using a SentenceTransformer-style model
def generate_embeddings_with_model(text, model):
    """Encode ``text`` with ``model`` and return the embedding as a plain list.

    Args:
        text: The string to embed, or ``None``.
        model: Any object exposing ``encode(text)`` whose result has a
            ``tolist()`` method (e.g. a ``SentenceTransformer``).

    Returns:
        ``model.encode(text).tolist()``, or ``None`` when ``text`` is ``None``.
    """
    # A missing value must not be handed to the encoder; propagate it as a
    # missing embedding instead of failing the whole job.
    if text is None:
        return None
    return model.encode(text).tolist()

# Instantiate the SentenceTransformer model used for the embeddings.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# Spark UDF that applies generate_embeddings_with_model with the model above
# and declares its result as an array of floats.
# NOTE(review): the lambda closes over `model`, so the model object presumably
# has to be serialized and shipped with the UDF, and encode() runs once per
# row. For large inputs a batched pandas UDF that loads the model on the
# worker may be considerably faster — evaluate if pyarrow is available.
embedding_return_type = ArrayType(FloatType())
generate_embeddings_udf = udf(
    lambda text: generate_embeddings_with_model(text, model),
    embedding_return_type,
)

In [7]:
# lista = generate_embeddings_with_model("Essa é outra casa do chico", model=model)
# print(lista)

In [8]:
# Add an 'embeddings' column computed from nomeFantasia via the UDF.
embedding_column = generate_embeddings_udf(col('nomeFantasia'))
nomes_pj = nomes_pj.withColumn('embeddings', embedding_column)

In [9]:
# Persist the DataFrame as Parquet for later querying, replacing any
# existing output in the target directory.
nomes_pj.write.parquet(parquet_dir_name, mode='overwrite')

23/12/06 10:15:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
# Stop the Spark session used by this notebook.
spark.stop()