## Importar pacotes e configurar SPARK

In [1]:
import os

# Spark / Python locations and submit arguments used by the PySpark shell.
# NOTE(review): these are machine-specific absolute paths; adjust per host.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'
os.environ["SPARK_HOME"] = "/home/pacha/spark"
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"

# Run PySpark's shell bootstrap script in this namespace (it reports
# "SparkSession available as 'spark'"). The script path is derived from
# SPARK_HOME so the two locations cannot drift apart, and the `with` block
# closes the file handle once the script text has been read.
with open(os.path.join(os.environ["SPARK_HOME"], 'python', 'pyspark', 'shell.py')) as shell_script:
    exec(shell_script.read())

# importar pacotes
from pyspark.sql import functions as SF
import pyspark.sql.types as ST
from pyspark.sql.functions import udf

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.8 (default, Jan 14 2019 11:02:34)
SparkSession available as 'spark'.


## Importar e tratar dados

In [2]:
# Load the tab-separated, UTF-8 source file with a header row; schema
# inference is switched off.
df = (
    spark.read
    .options(sep='\t', encoding='utf-8', header=True, inferSchema=False)
    .csv('BASE_JUR001.csv')
)

In [15]:
# Restore the leading zeros that are missing from some CNPJ values.
@SF.udf('string')
def fillCNPJ(value):
    """Left-pad a CNPJ string with zeros to exactly 14 characters.

    Args:
        value: the CNPJ as a string, or None for a null cell.

    Returns:
        A 14-character string: `value` padded on the left with '0'.
        None is returned unchanged so null cells stay null instead of
        raising TypeError during string concatenation.
    """
    if value is None:
        return None
    # Keep the last 14 characters, matching the previous slicing logic.
    # NOTE(review): inputs longer than 14 characters lose their leading
    # characters here — confirm whether such values can occur.
    return value.rjust(14, '0')[-14:]

# Normalize CNPJ on the loaded frame itself: `cursor` is only created further
# down (from `df`), so padding must happen here, before de-duplication, so that
# padded and unpadded spellings of the same CNPJ collapse into a single row.
df = df.withColumn('CNPJ', fillCNPJ('CNPJ'))

In [4]:
# Build the working frame `cursor`: one row per distinct CNPJ.
cursor = df.drop_duplicates(subset=['CNPJ'])

In [17]:
# Preview the first 10 REF_DATE values of the working frame.
cursor.select('REF_DATE').show(10)

+--------------------+
|            REF_DATE|
+--------------------+
|2018-08-28T10:01:...|
|2018-10-26T14:12:...|
|2018-08-24T06:02:...|
|2018-08-28T13:05:...|
|2018-08-28T13:53:...|
|2018-08-28T14:50:...|
|2018-10-31T17:56:...|
|2018-08-28T15:31:...|
|2018-08-28T15:41:...|
|2018-09-07T06:33:...|
+--------------------+
only showing top 10 rows



In [19]:
# Parse REF_DATE into a timestamp column named 'data'.
# NOTE(review): the column is referenced through `df` rather than `cursor` — confirm this is intended.
# NOTE(review): the sample output shows times 3 hours earlier than the raw
# strings (10:01 -> 07:01); presumably a time-zone conversion — confirm.
cursor = cursor.withColumn('data',SF.to_timestamp(df['REF_DATE']))

+-------------------+
|               data|
+-------------------+
|2018-08-28 07:01:57|
|2018-10-26 11:12:35|
|2018-08-24 03:02:06|
|2018-08-28 10:05:03|
|2018-08-28 10:53:36|
+-------------------+
only showing top 5 rows



### 1 e 2. Número de emails/Telefones

In [5]:
# Summary statistics (count, mean, stddev, min, max) for the identifier and
# contact columns of the full, non-deduplicated frame.
df.select('CNPJ','EMAIL', 'TELEFONE_1', 'TELEFONE_2').describe().show()

+-------+--------------------+--------------------+--------------+--------------+
|summary|                CNPJ|               EMAIL|    TELEFONE_1|    TELEFONE_2|
+-------+--------------------+--------------------+--------------+--------------+
|  count|              214685|              131687|        174193|         48828|
|   mean|2.137532510591555...|                 0.0|          null|          null|
| stddev|1.900184742990202...|                 0.0|          null|          null|
|    min|      00000703000110|'jane@pizza1.com.br'|      () 3621-|  () 1299-6380|
|    max|      98714462000175|  zzgomes@bol.com.br|(99) 9999-9999|(99) 9999-9999|
+-------+--------------------+--------------------+--------------+--------------+



In [6]:
# Count non-null e-mail and phone entries per CNPJ.
# Every aggregate is aliased explicitly: with a dict-based agg() the order of
# the result columns is not guaranteed to follow the dict, so renaming them
# positionally with toDF() can attach the wrong label to a count.
dfg = (
    cursor.groupBy('CNPJ').agg(
        SF.count('EMAIL').alias('email_count'),
        SF.count('TELEFONE_1').alias('telefone1_count'),
        SF.count('TELEFONE_2').alias('telefone2_count'),
    )
)
dfg.show(10)

+--------------+-----------+---------------+---------------+
|          CNPJ|email_count|telefone1_count|telefone2_count|
+--------------+-----------+---------------+---------------+
|00182140000128|          1|              0|              1|
|00231887000129|          1|              0|              0|
|00256837000104|          1|              0|              0|
|00501368000133|          0|              0|              0|
|00560191000146|          1|              1|              1|
|00642972000180|          0|              0|              0|
|00663069000103|          0|              0|              0|
|00691020000156|          0|              0|              0|
|00701583000188|          1|              0|              1|
|00844020000149|          1|              0|              1|
+--------------+-----------+---------------+---------------+
only showing top 10 rows



In [7]:
# Total phone entries per CNPJ: sum of the two per-column phone counts.
dfg = dfg.withColumn('telefone_num', dfg['telefone1_count'] + dfg['telefone2_count'])
dfg.show(10)

+--------------+-----------+---------------+---------------+------------+
|          CNPJ|email_count|telefone1_count|telefone2_count|telefone_num|
+--------------+-----------+---------------+---------------+------------+
|00182140000128|          1|              0|              1|           1|
|00231887000129|          1|              0|              0|           0|
|00256837000104|          1|              0|              0|           0|
|00501368000133|          0|              0|              0|           0|
|00560191000146|          1|              1|              1|           2|
|00642972000180|          0|              0|              0|           0|
|00663069000103|          0|              0|              0|           0|
|00691020000156|          0|              0|              0|           0|
|00701583000188|          1|              0|              1|           1|
|00844020000149|          1|              0|              1|           1|
+--------------+-----------+----------

### 3. Número de Filiais

In [8]:
# UDF that extracts a fixed-length prefix from a string column.
@SF.udf(ST.StringType())
def selectNumString(stringui, num):
    """Return the first `num` characters of `stringui`.

    Args:
        stringui: the source string, or None for a null cell.
        num: number of leading characters to keep.

    Returns:
        The prefix `stringui[:num]`, or None when `stringui` is None
        (null cells stay null instead of raising TypeError on slicing).
    """
    if stringui is None:
        return None
    return stringui[:num]

# Add 'grupo_empresarial': the first 8 characters of the CNPJ.
# The column is resolved by name on `cursor` itself rather than through the
# parent frame `df`, so the prefix always comes from cursor's current CNPJ
# values even after that column has been rewritten.
cursor = cursor.withColumn('grupo_empresarial', selectNumString(SF.col('CNPJ'), SF.lit(8)))

In [9]:
# Number of rows (CNPJs) in each business group.
grupodf = (
    cursor.groupBy('grupo_empresarial')
    .count()
    .withColumnRenamed('count', 'numero_de_filiais')
)
grupodf.show(10)

+-----------------+-----------------+
|grupo_empresarial|numero_de_filiais|
+-----------------+-----------------+
|         10906560|                1|
|         10827622|                1|
|         05306058|                1|
|         30645523|                1|
|         24481801|                1|
|         15913798|                1|
|         28882940|                1|
|         24820304|                1|
|         19581877|                1|
|         81519308|                1|
+-----------------+-----------------+
only showing top 10 rows



### 4. Concentração de Empresas por CEP

In [10]:
# Derive CEP prefix columns: the first character and the first five characters.
cursor = (
    cursor
    .withColumn('cep_grupo1', selectNumString(SF.col('CEP'), SF.lit(1)))
    .withColumn('cep_grupo5', selectNumString(SF.col('CEP'), SF.lit(5)))
)

In [11]:
# Preview the CEP next to its 1- and 5-character prefix columns.
cursor.select('CEP','cep_grupo1', 'cep_grupo5').show(5)

+--------+----------+----------+
|     CEP|cep_grupo1|cep_grupo5|
+--------+----------+----------+
|60175375|         6|     60175|
|44001535|         4|     44001|
|14409267|         1|     14409|
|13212390|         1|     13212|
|02036010|         0|     02036|
+--------+----------+----------+
only showing top 5 rows



In [12]:
# Number of rows (CNPJs) sharing each 5-character CEP prefix.
CEPdf = (
    cursor.groupBy('cep_grupo5')
    .count()
    .withColumnRenamed('count', 'numero_de_empresas_cep5')
)
CEPdf.show(10)

+----------+-----------------------+
|cep_grupo5|numero_de_empresas_cep5|
+----------+-----------------------+
|     88140|                      6|
|     88311|                     10|
|     13610|                     21|
|     05422|                     73|
|     80820|                      5|
|     26112|                      9|
|     18130|                     21|
|     91910|                     29|
|     60822|                     21|
|     02053|                      7|
+----------+-----------------------+
only showing top 10 rows



## Combinar bases

In [13]:
# Attach the aggregated features to the working frame: group size and CEP
# concentration via left joins, contact counts via an inner join on CNPJ.
cursor = (
    cursor
    .join(grupodf, on='grupo_empresarial', how='left')
    .join(CEPdf, on='cep_grupo5', how='left')
    .join(dfg, on='CNPJ', how='inner')
)

In [14]:
# Preview the combined frame: identifier, business group and the derived count features.
cursor.select('CNPJ', 'grupo_empresarial','email_count','telefone_num','numero_de_filiais', 'numero_de_empresas_cep5').show(10)

+--------------+-----------------+-----------+------------+-----------------+-----------------------+
|          CNPJ|grupo_empresarial|email_count|telefone_num|numero_de_filiais|numero_de_empresas_cep5|
+--------------+-----------------+-----------+------------+-----------------+-----------------------+
|00182140000128|         00182140|          1|           1|                1|                     34|
|00231887000129|         00231887|          1|           0|                1|                     21|
|00256837000104|         00256837|          1|           0|                1|                     17|
|00501368000133|         00501368|          0|           0|                1|                     64|
|00560191000146|         00560191|          1|           2|                1|                     11|
|00642972000180|         00642972|          0|           0|                1|                     42|
|00663069000103|         00663069|          0|           0|                1|     