In [None]:
import findspark

In [None]:
# Initialise findspark before any pyspark import; presumably this locates the
# local Spark installation and exposes pyspark on sys.path - confirm for the
# target environment.
findspark.init()

In [None]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import regexp_replace

# Create (or reuse) the Spark session for this notebook, then derive the
# SparkContext and the SQLContext that the query cells use.
spark = (
    SparkSession.builder
    .appName('roxproject')
    .getOrCreate()
)
sc = spark.sparkContext
sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Load the Person.Person export: ';'-separated, first line is the header,
# column types inferred from the data.
df_person = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Person.Person.csv')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name 'Person'.
df_person.createOrReplaceTempView('Person')

In [None]:
# Load the Production.Product export: ';'-separated, first line is the
# header, column types inferred from the data.
df_product = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Production.Product.csv')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name 'Product'.
df_product.createOrReplaceTempView('Product')

In [None]:
# Load the Sales.Customer export: ';'-separated, first line is the header,
# column types inferred from the data.
df_costumer = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Sales.Customer.csv')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name 'Customer'.
df_costumer.createOrReplaceTempView('Customer')

In [None]:
# Load the Sales.SalesOrderDetail export: ';'-separated, first line is the
# header, column types inferred from the data.
df_orderdetail = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Sales.SalesOrderDetail.csv')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name 'SalesOrderDetail'.
df_orderdetail.createOrReplaceTempView('SalesOrderDetail')

In [None]:
# Load the Sales.SalesOrderHeader export: ';'-separated, first line is the
# header, column types inferred from the data.
df_orderheader = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Sales.SalesOrderHeader.csv')
)

In [None]:
# TotalDue uses a decimal comma in the export, so swap ',' for '.' before
# converting the column to a numeric type.
# The value is cast to an exact decimal instead of "float": Spark's float is
# 32-bit single precision, which cannot hold these amounts exactly (the last
# decimal place drifts) and makes comparisons and sums on money unreliable.
# NOTE(review): decimal(19,4) assumes at most four decimal places in the
# source data - confirm against the CSV.
df_orderheader = df_orderheader.withColumn(
    'TotalDue',
    regexp_replace('TotalDue', ',', '.').cast('decimal(19,4)'),
)

In [None]:
# Expose the DataFrame (with the converted TotalDue column) to Spark SQL
# under the name 'SalesOrderHeader'.
df_orderheader.createOrReplaceTempView('SalesOrderHeader')

In [None]:
# Load the Sales.SpecialOfferProduct export: ';'-separated, first line is the
# header, column types inferred from the data.
df_specialoffer = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv('data/Sales.SpecialOfferProduct.csv')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name 'SpecialOfferProduct'.
df_specialoffer.createOrReplaceTempView('SpecialOfferProduct')

### Análise de Dados

1. Escreva uma query que retorna a quantidade de linhas na tabela Sales.SalesOrderDetail pelo campo SalesOrderID, desde que tenham pelo menos três linhas de detalhes.
    

In [None]:
# Task 1: the inner query counts detail lines per SalesOrderID; the outer
# query counts how many orders have more than two lines (i.e. at least three).
# NOTE(review): this yields a single total, not one row per SalesOrderID -
# confirm that this is the intended reading of the task.
query = """
    SELECT COUNT(SalesOrderID) AS OrderQty
    FROM (
        SELECT SalesOrderID, COUNT(SalesOrderID) AS Qty
        FROM SalesOrderDetail
        GROUP BY SalesOrderID
    )
    WHERE Qty > 2
"""

# Print up to 100 rows without truncating cell contents.
orders_with_three_lines = sqlContext.sql(query)
orders_with_three_lines.show(n=100, truncate=False)

+--------+
|OrderQty|
+--------+
|12757   |
+--------+



2. Escreva uma query que ligue as tabelas Sales.SalesOrderDetail, Sales.SpecialOfferProduct e Production.Product e retorne os 3 produtos (Name) mais vendidos (pela soma de OrderQty), agrupados pelo número de dias para manufatura (DaysToManufacture).

In [None]:
# Task 2: join Product, SalesOrderDetail and SpecialOfferProduct, sum OrderQty
# per (Name, DaysToManufacture) and keep the three largest totals.
# NOTE(review): LIMIT 3 applies to the whole result, so this is the top 3
# overall rather than the top 3 within each DaysToManufacture value - confirm
# which one the task expects.
query = """
    SELECT Name, SUM(OrderQty) AS OrderQtyTotal, DaysToManufacture
    FROM Product AS P
    JOIN SalesOrderDetail AS D ON D.ProductID = P.ProductID
    JOIN SpecialOfferProduct AS S ON S.ProductID = P.ProductID 
        AND S.SpecialOfferID = D.SpecialOfferID
    GROUP BY Name, DaysToManufacture
    ORDER BY OrderQtyTotal DESC
    LIMIT 3
"""

# Print up to 100 rows without truncating cell contents.
top_products = sqlContext.sql(query)
top_products.show(n=100, truncate=False)

+----------------------+-------------+-----------------+
|Name                  |OrderQtyTotal|DaysToManufacture|
+----------------------+-------------+-----------------+
|AWC Logo Cap          |8311         |0                |
|Water Bottle - 30 oz. |6815         |0                |
|Sport-100 Helmet, Blue|6743         |0                |
+----------------------+-------------+-----------------+



3. Escreva uma query ligando as tabelas Person.Person, Sales.Customer e Sales.SalesOrderHeader de forma a obter uma lista de nomes de clientes e uma contagem de pedidos efetuados.

In [None]:
# Task 3: list each customer's display name with the number of orders placed.
# The display name is "First [Middle] Last"; the export stores missing middle
# names as the literal string 'NULL', which NULLIF turns into a real NULL so
# that CONCAT_WS skips it.
# Rows are grouped by C.CustomerID (plus the name parts) rather than by the
# concatenated name alone, so two different customers who share the same full
# name are reported separately instead of having their orders added together.
query = """
    SELECT CONCAT_WS(' ', FirstName, NULLIF(MiddleName, 'NULL'), LastName) AS Name,
        COUNT(SalesOrderID) AS OrderQty
    FROM Person AS P
    JOIN Customer AS C ON C.PersonID = P.BusinessEntityID
    JOIN SalesOrderHeader AS S ON C.CustomerID = S.CustomerID
    GROUP BY C.CustomerID, FirstName, MiddleName, LastName
    ORDER BY OrderQty DESC
"""

# Print the first 20 rows without truncating cell contents.
sqlContext.sql(query).show(20, False)

+------------------+--------+
|Name              |OrderQty|
+------------------+--------+
|Dalton Perez      |28      |
|Mason D Roberts   |28      |
|Jennifer Simmons  |27      |
|Samantha Jenkins  |27      |
|Hailey I Patterson|27      |
|Henry B Garcia    |27      |
|Jason L Griffin   |27      |
|Ashley Henderson  |27      |
|Fernando Barnes   |27      |
|Charles P Jackson |27      |
|Daniel Davis      |27      |
|Nancy E Chapman   |27      |
|Ryan M Thompson   |27      |
|April L Shan      |25      |
|Samantha Russell  |17      |
|Gina E Martin     |17      |
|Luke L Lal        |17      |
|Luis D Diaz       |17      |
|Chloe Campbell    |17      |
|Ana Perry         |17      |
+------------------+--------+
only showing top 20 rows



4. Escreva uma query usando as tabelas Sales.SalesOrderHeader, Sales.SalesOrderDetail e Production.Product, de forma a obter a soma total de produtos (OrderQty) por ProductID e OrderDate.
    

In [None]:
# Task 4: join Product, SalesOrderDetail and SalesOrderHeader and sum OrderQty
# for every (ProductID, OrderDate) pair, largest totals first.
query = """
    SELECT P.ProductID, OrderDate, SUM(OrderQty) AS OrderQtyTotal
    FROM Product AS P
    JOIN SalesOrderDetail AS D ON D.ProductID = P.ProductID
    JOIN SalesOrderHeader AS S ON D.SalesOrderID = S.SalesOrderID
    GROUP BY P.ProductID, OrderDate
    ORDER BY OrderQtyTotal DESC
"""

# Print the first 20 rows without truncating cell contents.
qty_by_product_and_date = sqlContext.sql(query)
qty_by_product_and_date.show(n=20, truncate=False)

+---------+-------------------+-------------+
|ProductID|OrderDate          |OrderQtyTotal|
+---------+-------------------+-------------+
|864      |2013-06-30 00:00:00|498          |
|864      |2013-07-31 00:00:00|465          |
|884      |2013-06-30 00:00:00|444          |
|867      |2013-06-30 00:00:00|427          |
|864      |2014-03-31 00:00:00|424          |
|884      |2013-07-31 00:00:00|420          |
|712      |2013-06-30 00:00:00|415          |
|863      |2012-06-30 00:00:00|409          |
|715      |2013-06-30 00:00:00|406          |
|876      |2013-07-31 00:00:00|397          |
|864      |2014-05-01 00:00:00|383          |
|864      |2013-09-30 00:00:00|383          |
|864      |2013-10-30 00:00:00|380          |
|869      |2013-07-31 00:00:00|374          |
|712      |2013-07-31 00:00:00|363          |
|876      |2013-06-30 00:00:00|363          |
|863      |2013-03-30 00:00:00|358          |
|863      |2012-05-30 00:00:00|357          |
|867      |2013-07-31 00:00:00|356

5. Escreva uma query mostrando os campos SalesOrderID, OrderDate e TotalDue da tabela Sales.SalesOrderHeader. Obtenha apenas as linhas onde a ordem tenha sido feita durante o mês de setembro/2011 e o total devido esteja acima de 1.000. Ordene pelo total devido decrescente.

In [None]:
# Task 5: orders from SalesOrderHeader with OrderDate in the half-open range
# [2011-09-01, 2011-10-01) and TotalDue above 1000, highest TotalDue first.
query = """
    SELECT SalesOrderID, OrderDate, TotalDue
    FROM SalesOrderHeader
    WHERE OrderDate >= '2011-09-01 00:00:00'
    AND OrderDate < '2011-10-01 00:00:00'
    AND TotalDue > 1000
    ORDER BY TotalDue DESC
"""

# Print the first 20 rows without truncating cell contents.
september_orders = sqlContext.sql(query)
september_orders.show(n=20, truncate=False)

+------------+-------------------+---------+
|SalesOrderID|OrderDate          |TotalDue |
+------------+-------------------+---------+
|44381       |2011-09-11 00:00:00|3953.9883|
|44440       |2011-09-22 00:00:00|3953.9883|
|44385       |2011-09-11 00:00:00|3953.9883|
|44439       |2011-09-22 00:00:00|3953.9883|
|44386       |2011-09-11 00:00:00|3953.9883|
|44327       |2011-09-02 00:00:00|3953.9883|
|44388       |2011-09-11 00:00:00|3953.9883|
|44329       |2011-09-02 00:00:00|3953.9883|
|44389       |2011-09-11 00:00:00|3953.9883|
|44331       |2011-09-03 00:00:00|3953.9883|
|44390       |2011-09-11 00:00:00|3953.9883|
|44334       |2011-09-04 00:00:00|3953.9883|
|44391       |2011-09-11 00:00:00|3953.9883|
|44339       |2011-09-04 00:00:00|3953.9883|
|44392       |2011-09-11 00:00:00|3953.9883|
|44343       |2011-09-05 00:00:00|3953.9883|
|44394       |2011-09-12 00:00:00|3953.9883|
|44345       |2011-09-06 00:00:00|3953.9883|
|44395       |2011-09-12 00:00:00|3953.9883|
|44348    