## Análise de dados: Comércio eletrônico brasileiro

Este projeto é composto por um conjunto de dados públicos de comércio eletrônico brasileiro, disponibilizados pelo site Olist, são registros que compõem todo o processo de venda de um produto, da compra, pagamento, entrega e avaliação, além de dados de geolocalização, produtos e vendedores. Estas informações serão tratadas e analisadas, de modo a responder questões de negócio.


### Importação de bibliotecas

In [64]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc


### Criação e iniciação de uma sessão Spark

In [2]:
spark = SparkSession.builder.appName('PySpark - Olist').getOrCreate()
spark


### Criação dos datasets a partir da leitura dos arquivos *.csv


In [3]:
df_orders = spark.read.csv('dados\olist_orders_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_customers = spark.read.csv('dados\olist_customers_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_geolocation = spark.read.csv('dados\olist_geolocation_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_order_items = spark.read.csv('dados\olist_order_items_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_order_payments = spark.read.csv('dados\olist_order_payments_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_order_reviews = spark.read.csv('dados\olist_order_reviews_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_products = spark.read.csv('dados\olist_products_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)
df_sellers = spark.read.csv('dados\olist_sellers_dataset.csv', sep=',', header=True, encoding='utf-8', inferSchema=True)

### Verificando os tipos das colunas

In [4]:
df_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [5]:
df_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [6]:
df_geolocation.printSchema()

root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)



In [7]:
df_order_items.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [8]:
df_order_payments.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [9]:
df_order_reviews.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)



In [10]:
df_products.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)



In [11]:
df_sellers.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: integer (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



### Verificando a existência de registros nulos

In [61]:
def check_nulls(dataframe, name):
    print('\n', name.upper(), '-'* (100 - len(name)))
    for coluna in dataframe.columns:
        qty = dataframe.filter(dataframe[coluna].isNull()).count()
        if qty >= 1:
            print('', coluna, qty)


In [62]:
check_nulls(df_orders, 'df_orders')
check_nulls(df_customers, 'df_customers')
check_nulls(df_geolocation, 'df_geolocation')
check_nulls(df_order_items, 'df_order_items')
check_nulls(df_order_payments, 'df_order_payments')
check_nulls(df_order_reviews, 'df_order_reviews')
check_nulls(df_products, 'df_products')
check_nulls(df_sellers, 'df_sellers')



 DF_ORDERS -------------------------------------------------------------------------------------------
 order_approved_at 160
 order_delivered_carrier_date 1783
 order_delivered_customer_date 2965

 DF_CUSTOMERS ----------------------------------------------------------------------------------------

 DF_GEOLOCATION --------------------------------------------------------------------------------------

 DF_ORDER_ITEMS --------------------------------------------------------------------------------------

 DF_ORDER_PAYMENTS -----------------------------------------------------------------------------------

 DF_ORDER_REVIEWS ------------------------------------------------------------------------------------
 review_id 1
 order_id 2236
 review_score 2380
 review_comment_title 92157
 review_comment_message 63079
 review_creation_date 8764
 review_answer_timestamp 8785

 DF_PRODUCTS -----------------------------------------------------------------------------------------
 product_categor

Foram identificados valores nulos em 3 dataframes, df_orders, df_order_reviews e df_products, entretando no caso de **df_orders** os dados representam operações de venda, logo possui vários estágios podendo ser uma venda concluída, cancelada, processamento ou mesmo em trânsito, ou seja, dependendo do estágio algumas colunas podem ficarem vazias (nulas), em **df_order_reviews** há campos com reviews dos compradores sobre suas compras, não é obrigatório um cliente escrever um review e em **df_products** há produtos com nome e descrição ausentes, porém constam em pedidos de clientes.

### Verificando a existência de registros duplicados

In [67]:
dup = df_customers.groupBy('customer_unique_id') \
        .agg(count('*').alias('qty')) \
        .where(col('qty') > 1) \
        .orderBy(desc('qty'))
        
dup.show(5, truncate=False)

+--------------------------------+---+
|customer_unique_id              |qty|
+--------------------------------+---+
|8d50f5eadf50201ccdcedfb9e2ac8455|17 |
|3e43e6105506432c953e165fb2acf44c|9  |
|ca77025e7201e3b30c44b472ff346268|7  |
|1b6c7548a2a1f9037c1fd3ddfed95f33|7  |
|6469f99c1f9dfae7733b25662e7f1782|7  |
+--------------------------------+---+
only showing top 5 rows



### Criando views temporárias para uso do Spark SQL

In [14]:
df_orders.createOrReplaceTempView('orders')
df_customers.createOrReplaceTempView('customers')
df_geolocation.createOrReplaceTempView('geolocation')
df_order_items.createOrReplaceTempView('order_items')
df_order_payments.createOrReplaceTempView('order_payments')
df_order_reviews.createOrReplaceTempView('order_reviews')
df_products.createOrReplaceTempView('products')
df_sellers.createOrReplaceTempView('sellers')


In [25]:
spark.sql('''
      SELECT * FROM customers 
      WHERE customer_unique_id='8d50f5eadf50201ccdcedfb9e2ac8455';
''').show(truncate=False)

+--------------------------------+--------------------------------+------------------------+-------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city|customer_state|
+--------------------------------+--------------------------------+------------------------+-------------+--------------+
|1bd3585471932167ab72a84955ebefea|8d50f5eadf50201ccdcedfb9e2ac8455|4045                    |sao paulo    |SP            |
|a8fabc805e9a10a3c93ae5bff642b86b|8d50f5eadf50201ccdcedfb9e2ac8455|4045                    |sao paulo    |SP            |
|897b7f72042714efaa64ac306ba0cafc|8d50f5eadf50201ccdcedfb9e2ac8455|4045                    |sao paulo    |SP            |
|b2b13de0770e06de50080fea77c459e6|8d50f5eadf50201ccdcedfb9e2ac8455|4045                    |sao paulo    |SP            |
|42dbc1ad9d560637c9c4c1533746f86d|8d50f5eadf50201ccdcedfb9e2ac8455|4045                    |sao paulo    |SP            |
|dfb941d6f7b02f57a44c3b7