## IMPORTAÇÃO DE BIBLIOTECAS

In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [2]:
from pyspark.sql import SQLContext
sqlContext = SQLContext.getOrCreate(sc)

In [3]:
from pyspark.sql import HiveContext
hc = HiveContext.getOrCreate(sc)
from pyspark.sql.types import *

## IMPORTAÇÃO DE DADOS

In [4]:
# Exibe tabelas, caso exista
sqlContext.sql("show tables").show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| default|       carros|      false|
| default|       cities|      false|
| default|        lines|      false|
| default|       placas|      false|
| default|station_lines|      false|
| default|     stations|      false|
+--------+-------------+-----------+



In [5]:
# Exclui tabela carros, caso exista
hc.sql("drop table carros")

DataFrame[]

In [6]:
# Importa os dados e trata os caracteres

lines = sc.textFile("C:/Dados/E2/registrosPlacas.txt").map(lambda l: l.replace('\t',',').replace(' km/h',''))
print(lines.top(3))

['ZXH5193,9993,Maringá,Mon Jan 03 02:53:39 BRST 2000,110', 'ZXH5193,9992,Maringá,Mon Jan 03 02:53:36 BRST 2000,105', 'ZXH5193,9991,Maringá,Mon Jan 03 02:53:33 BRST 2000,107']


In [7]:
# Separa os dados em arrays

parts = lines.map(lambda l: l.split(','))
print(parts.top(3))

[['ZXH5193', '9993', 'Maringá', 'Mon Jan 03 02:53:39 BRST 2000', '110'], ['ZXH5193', '9992', 'Maringá', 'Mon Jan 03 02:53:36 BRST 2000', '105'], ['ZXH5193', '9991', 'Maringá', 'Mon Jan 03 02:53:33 BRST 2000', '107']]


In [8]:
# Transforma RDD em Data Frame com o seguinte cabeçalho

df = parts.toDF(["placa","local","municipio","data","velocidade"])

In [9]:
# Exibe tabelas

sqlContext.sql("show tables").show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| default|       cities|      false|
| default|        lines|      false|
| default|       placas|      false|
| default|station_lines|      false|
| default|     stations|      false|
+--------+-------------+-----------+



In [10]:
# Exporta o Data Frame para tabela

df.write.saveAsTable("carros")

In [11]:
# Imprime tabela com os 5 primeiros resultados

hc.sql("select * from carros limit 5").show()

+-------+-----+---------+--------------------+----------+
|  placa|local|municipio|                data|velocidade|
+-------+-----+---------+--------------------+----------+
|BQX7899| 0010| Curitiba|Mon Jan 03 01:34:...|        56|
|MJG5107| 9539| Umuarama|Wed Jan 05 08:47:...|        27|
|QZQ0584| 9852| Curitiba|Mon Jan 03 08:32:...|        90|
|JSQ9319| 6288| Londrina|Wed Jan 05 07:28:...|        39|
|WIZ0836| 2592| Londrina|Thu Jan 06 10:49:...|        66|
+-------+-----+---------+--------------------+----------+



In [12]:
# Imprime placa com mais ocorências

hc.sql("select placa, count(*) as cnt \
        from carros \
        group by placa \
        order by cnt desc \
        limit 1").show()

+-------+----+
|  placa| cnt|
+-------+----+
|UFC2015|7000|
+-------+----+



In [13]:
# Imprime placa com maior velocidade

hc.sql("select placa, max(velocidade) as max \
        from carros \
        group by placa \
        order by max desc \
        limit 1").show()

+-------+---+
|  placa|max|
+-------+---+
|IHD9565| 99|
+-------+---+



In [14]:
# Imprime média de velocidade

hc.sql("select placa, avg(velocidade) as avg \
        from carros \
        group by placa \
        order by avg desc \
        limit 1").show()

+-------+------------------+
|  placa|               avg|
+-------+------------------+
|EZB0648|108.66371428571429|
+-------+------------------+

