In [1]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, from_json, substring, when, length, split, lit
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# File and directory names used by the notebook.
# Output directory where the parquet dataset with the name embeddings is written.
parquet_dir_name = 'output/embeddings_nome_1m'
# Input CSV with the companies/establishments sample.
csv_pj = 'dataset/empresas_estabelecimentos_1m.csv'

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
# When a session already exists, Spark warns that only runtime SQL configurations
# take effect, so the driver-memory setting below is only honoured on a fresh kernel.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .appName("gera_embedding_nome")
    .getOrCreate()
)

23/12/05 13:43:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Schema for the company ("empresa") JSON payload: every field is a nullable string.
json_schema_em = StructType([
    StructField(campo, StringType(), True)
    for campo in (
        "porteEmpresa",
        "capitalSocial",
        "cpfResponsavel",
        "nomeEmpresarial",
        "naturezaJuridica",
        "qualificacaoResponsavel",
    )
])

# Schema for the establishment ("estabelecimento") JSON payload: every field is a
# nullable string, declared in the same order as before.
json_schema_es = StructType([
    StructField(campo, StringType(), True)
    for campo in (
        "uf",
        "cep",
        "ddd1",
        "ddd2",
        "pais",
        "email",
        "bairro",
        "numero",
        "municipio",
        "telefone1",
        "telefone2",
        "cnaeFiscal",
        "logradouro",
        "complemento",
        "dataCadastro",
        "nomeFantasia",
        "cidadeExterior",
        "tipoLogradouro",
        "cnaesSecundarias",
        "situacaoEspecial",
        "situacaoCadastral",
        "dataSituacaoEspecial",
        "dataSituacaoCadastral",
        "motivoSituacaoCadastral",
        "identificadorMatrizFilial",
    )
])

# Schema used to read the CSV: four nullable string columns. The two te_dados_*
# columns carry raw JSON text that is parsed after loading.
schema = StructType([
    StructField(coluna, StringType(), True)
    for coluna in ("nu_cnpj_raiz", "te_dados_em", "id_estabelecimento", "te_dados_es")
])

# Lendo o CSV com o esquema definido
pj = spark.read \
    .option("delimiter", ",") \
    .option("multiline", "true") \
    .option("escape", "\"") \
    .csv(csv_pj, schema=schema, header=True)

# Keep only establishments whose id starts with '0001'
# (presumably the head office of each CNPJ root -- confirm against the data dictionary).
prefixo_0001 = substring(col("id_estabelecimento"), 0, 4) == '0001'

# Parse both JSON columns into structs, apply the filter, expose the CNPJ root as
# "id", then sort by id and keep a single row per id.
# NOTE(review): the sort before dropDuplicates probably does not control which
# duplicate row survives -- confirm if that matters.
pj = (
    pj
    .withColumn("te_dados_em", from_json(col("te_dados_em"), json_schema_em))
    .withColumn("te_dados_es", from_json(col("te_dados_es"), json_schema_es))
    .filter(prefixo_0001)
    .withColumnRenamed("nu_cnpj_raiz", "id")
    .orderBy(col("id").asc())
    .dropDuplicates(['id'])
)

#### Tratamento Nome Fantasia

In [5]:
# Trade name ("nomeFantasia") of every company, before any filtering, so the share
# of empty names can actually be measured.
nomes_todos = pj.select(col('id'),
                        col('te_dados_es.nomeFantasia').alias('nomeFantasia'))

# Rows with a usable trade name (the comparison also drops nulls).
# Cached because this frame is evaluated more than once: here for the collect and
# again later when its rows are zipped with the embeddings. Reusing the cached
# result avoids re-parsing the CSV and helps keep the row order consistent.
nomes_pj = nomes_todos.filter(col('nomeFantasia') != '').cache()

# Plain Python list of names, in row order, to feed the embedding model.
nomes = nomes_pj.rdd.map(lambda row: row.nomeFantasia).collect()

# Share of empty-string names, measured on the UNFILTERED frame. Measuring it on
# nomes_pj (already filtered with != '') always yields 0.0.
total_registros = nomes_todos.count()
valores_vazios = nomes_todos.where(col('nomeFantasia') == '').count()
# Guard against an empty input so the cell does not fail with ZeroDivisionError.
frequencia_vazios = valores_vazios / total_registros if total_registros else 0.0
print("Frequência de campos vazios (strings vazias): ", frequencia_vazios)


23/12/05 13:43:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Frequência de campos vazios (strings vazias):  0.0


                                                                                

In [6]:
# Step 1 - precompute the embeddings for every collected trade name.
from sentence_transformers import SentenceTransformer

# Pretrained multilingual sentence-embedding model, referenced by its hub name.
nome_modelo = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(nome_modelo)

# Encode the whole list in one call; the result has one row per entry in `nomes`.
embeddings = model.encode(nomes)
print(embeddings)

[[-0.03331316  0.17094629 -0.02804424 ... -0.20079358 -0.13701853
   0.08032994]
 [ 0.02203857  0.17953454 -0.17352705 ... -0.10183495  0.01377693
   0.13287692]
 [ 0.16314214 -0.0413441   0.02480316 ... -0.21233012 -0.01675784
   0.09499262]
 ...
 [-0.11648611 -0.24241914  0.14828391 ...  0.01394305  0.2153851
   0.09290294]
 [-0.04606455  0.20873569 -0.04624428 ... -0.0976361  -0.06093702
  -0.04194095]
 [ 0.03695896  0.09149241  0.03300897 ... -0.01022805  0.05449221
   0.17492387]]


In [7]:
# Number of embedding vectors produced (length of the first axis).
num_linhas = len(embeddings)
print(num_linhas)

582074


In [8]:
# Build an RDD of (id, nomeFantasia, embedding) tuples by pairing each row's
# position in nomes_pj with the embedding at the same index.
#
# The embedding matrix is shipped to the workers as an explicit broadcast variable
# instead of being captured directly in the lambda, so the large array is not
# pickled again into the closure of every job that touches this RDD.
#
# NOTE(review): the pairing assumes nomes_pj yields its rows in the same order as
# when `nomes` was collected for encoding -- confirm (e.g. by caching nomes_pj).
embeddings_bc = spark.sparkContext.broadcast(embeddings)
rdd = nomes_pj.rdd.zipWithIndex().map(
    lambda x: (x[0]['id'], x[0]['nomeFantasia'], embeddings_bc.value[x[1]].tolist())
)

                                                                                

In [9]:
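# TODO: example similarity query. cosine_similarity expects numeric 2-D arrays,
# so the query strings must be encoded with the same model first:
# from sklearn.metrics.pairwise import cosine_similarity
# distance_matrix = cosine_similarity(model.encode(["empresa a", "empresa b"]), embeddings)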

In [10]:
# Convert the RDD of plain tuples into an RDD of named Rows.
def _tupla_para_row(registro):
    """Turn one (id, nomeFantasia, embedding) tuple into a Row with those field names."""
    identificador, nome, vetor = registro
    return Row(id=identificador, nomeFantasia=nome, embedding=vetor)


rows_rdd = rdd.map(_tupla_para_row)

In [11]:
# Materialise the Row RDD as a DataFrame; column types are inferred from the data.
colunas = ['id', 'nomeFantasia', 'embedding']
df_with_embeddings = spark.createDataFrame(rows_rdd, schema=colunas)

                                                                                

In [12]:
# Preview the first 20 rows, truncating long cell values (the defaults, spelled out).
df_with_embeddings.show(n=20, truncate=True)

                                                                                

+--------+--------------------+--------------------+
|      id|        nomeFantasia|           embedding|
+--------+--------------------+--------------------+
|03230095|              VENDAO|[-0.0333131551742...|
|03230099|     INTENSIVA SAUDE|[0.02203856967389...|
|03230101|ASSOCIACAO SANTA ...|[0.16314214468002...|
|03230103|RESTAURANTE DA BA...|[0.18409758806228...|
|03230105|              SILFER|[-0.0655299648642...|
|03230109|BAR E MERCEARIA DOZE|[0.29833957552909...|
|03230110|PRAMOVEIS DECORACOES|[-7.4515491724014...|
|03230112|ESCAVATER LOCACAO...|[0.07552856206893...|
|03230114|KIBOM SELF - SERVICE|[-0.2181864529848...|
|03230115|  ETICA PROFISSIONAL|[0.28472706675529...|
|03230116|CICLOPECAS O GETULIO|[-0.0597080402076...|
|03230119|     GENESIS SYSTEMS|[-0.4516184329986...|
|03230122|ESTRUTURA METALIC...|[0.30958387255668...|
|03230124|D & D CONSULTORIA...|[-0.0089319590479...|
|03230126|         STORO MODAS|[-0.0249068960547...|
|03230129|CONDOMNIO COMERCI...|[-0.06105111166

In [13]:
# Persist the DataFrame as parquet for later querying, replacing any previous output.
escritor = df_with_embeddings.write
escritor.parquet(parquet_dir_name, mode='overwrite')

                                                                                

In [14]:
# Stop the Spark session now that the embeddings have been written.
spark.stop()