In [13]:
import findspark
findspark.init()

from pyspark import HiveContext

from pyspark.sql import SparkSession
# Build (or reuse) the Hive-enabled Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("myApp")
    .enableHiveSupport()
    .getOrCreate()
)

# Legacy Hive entry point, constructed from the session's SparkContext.
hiveContext = HiveContext(spark.sparkContext)

In [14]:
# List the databases registered in Hive.
# The query goes through the Hive-enabled SparkSession directly: HiveContext is
# deprecated in Spark 3.x in favour of SparkSession with enableHiveSupport().
spark.sql("SHOW DATABASES").show()

+---------+
|namespace|
+---------+
|  default|
|    sales|
+---------+



In [15]:
# Load the complete reviews file; the first line supplies the column names.
reviews_path = "../amazon_reviews_us_PC_v1_00.tsv"
df = spark.read.csv(reviews_path, header=True, sep=r'\t')

# Expose the raw frame to Spark SQL as "temp", then inspect the columns.
# No schema is supplied, so every column is read as a string.
df.createOrReplaceTempView("temp")
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



# Criando a tabela de avaliações

In [16]:
# Reviews table: keep only the columns that identify a review and its rating
#   customer_id, review_id, product_id, star_rating
reviews_query = "SELECT customer_id, review_id, product_id, star_rating FROM temp"
reviewsTable = spark.sql(reviews_query)

# First version of the reviews table, before any transformation is applied.
reviewsTable.show()

+-----------+--------------+----------+-----------+
|customer_id|     review_id|product_id|star_rating|
+-----------+--------------+----------+-----------+
|   22873041|R3ARRMDEGED8RD|B00KJWQIIC|          5|
|   30088427| RQ28TSA020Y6J|B013ALA9LA|          5|
|   20329786| RUXJRZCT6953M|B00PML2GQ8|          1|
|   14215710| R7EO0UO6BPB71|B001NS0OZ4|          1|
|   38264512|R39NJY2YJ1JFSV|B00AQMTND2|          5|
|   30548466|R31SR7REWNX7CF|B00KX4TORI|          5|
|     589298| RVBP8I1R0CTZ8|B00P17WEMY|          3|
|   49329488|R1QF6RS1PDLU18|B00TR05L9Y|          4|
|   50728290|R23AICGEDAJQL1|B0098Y77OG|          1|
|   37802374|R2EY3N4K9W19UP|B00IFYEYXC|          5|
|   52027882| RC9AW4HKJ016M|B0091ITP0S|          1|
|   41770239|R2ALWJE9N6ZBXD|B008I21EA2|          1|
|   42560427|R2G5FPA4OX37GV|B00MRB7SBO|          5|
|   46345923|R1IKTSEVXSIMOD|B00LLER2CS|          5|
|   41751192|R2YA6G6SRFEWF6|B00B0CQCCC|          1|
|   21176481| RS9H1N9I3Z1IA|B00GU8W5AE|          5|
|   10674058

# Calculando a média e a mediana de estrelas e contando o número de avaliações por produto

In [17]:
# Product statistics, one row per product_id:
#   star_median   - approximate median of the star ratings
#   star_average  - mean of the star ratings
#   total_reviews - number of reviews counted for the product
# (product_title is joined onto these statistics in a later cell.)
#
# Reviews are first reduced to distinct (customer_id, product_id, star_rating)
# combinations -- the same key this notebook uses to drop duplicated reviews --
# so a repeated review is not counted twice in the count, mean or median.
#
# star_rating is loaded from the TSV as a string, so it is cast explicitly to
# DOUBLE instead of relying on implicit coercion inside the aggregates.
productInfos = spark.sql("""
    SELECT product_id,
           percentile_approx(CAST(star_rating AS DOUBLE), 0.5) AS star_median,
           avg(CAST(star_rating AS DOUBLE)) AS star_average,
           count(product_id) AS total_reviews
    FROM (SELECT DISTINCT customer_id, product_id, star_rating FROM temp) AS unique_reviews
    GROUP BY product_id
""")
productInfos.createOrReplaceTempView("products")

# Statistics only: product titles are not part of this frame yet.
productInfos.show()

+----------+-----------+------------------+-------------+
|product_id|star_median|      star_average|total_reviews|
+----------+-----------+------------------+-------------+
|9875987018|        4.0|               4.5|            2|
|9966285946|        5.0|               5.0|            1|
|9966694242|        5.0|               4.6|           10|
|9967222247|        5.0|               5.0|            3|
|9985538803|        5.0|               5.0|            1|
|9985725344|        5.0|               5.0|            1|
|9989476071|        5.0|               5.0|            1|
|9990950369|        2.0|               2.0|            1|
|B00000J3SV|        1.0|               3.0|            2|
|B00000JBK6|        5.0| 4.153846153846154|           13|
|B00002S73F|        5.0|               3.7|           10|
|B00004VV4B|        5.0|               5.0|            1|
|B00004YNSK|        5.0|               4.5|            8|
|B00004Z7BU|        5.0|               5.0|            1|
|B00005045V|  

# Removendo linhas duplicadas

In [19]:
# Build the product table with exactly one title per product_id.
# During testing, product B0049SIJ7K was found to be duplicated.
# Grouping by product_id and taking min(product_title) yields a single row per
# product and always picks the same title, so the result is reproducible across
# runs (dropDuplicates on product_id alone keeps an arbitrary title).
productNames = spark.sql(
    "SELECT product_id, min(product_title) AS product_title "
    "FROM temp GROUP BY product_id"
)

# Attach the per-product statistics; the inner join keeps only products
# present in both frames.
productTable = productNames.join(productInfos, 'product_id')
productTable.show()

# Re-register the "products" view so it now points at the complete table.
productTable.createOrReplaceTempView("products")

+----------+--------------------+-----------+------------------+-------------+
|product_id|       product_title|star_median|      star_average|total_reviews|
+----------+--------------------+-----------+------------------+-------------+
|9875987018|Professional Ultr...|        4.0|               4.5|            2|
|9966285946|Professional King...|        5.0|               5.0|            1|
|9966694242|Professional King...|        5.0|               4.6|           10|
|9967222247|Professional King...|        5.0|               5.0|            3|
|9985538803|Samsung Galaxy St...|        5.0|               5.0|            1|
|9985725344|Professional King...|        5.0|               5.0|            1|
|9989476071|Professional King...|        5.0|               5.0|            1|
|9990950369|Samsung SGH-i780 ...|        2.0|               2.0|            1|
|B00000J3SV|Intel ICS2USB Cre...|        1.0|               3.0|            2|
|B00000JBK6|    ALTEC ACS SERIES|        5.0| 4.1538

In [22]:
# Drop duplicated reviews (1764 duplicates were found during testing).
# Two rows count as the same review when customer, product and rating all match;
# review_id is not part of the key, so Spark keeps one arbitrary review_id per key.
duplicate_key = ["customer_id", "product_id", "star_rating"]
reviewsTable = reviewsTable.dropDuplicates(duplicate_key)
reviewsTable.show()

+-----------+--------------+----------+-----------+
|customer_id|     review_id|product_id|star_rating|
+-----------+--------------+----------+-----------+
|   10001434|R15PWB2RWMDYY0|B0028QQC0Q|          1|
|   10002051|R2FV71G5DAQK2V|B0069ASUBQ|          3|
|   10002342|R2X44EUDV89CD0|B00CXAFG72|          5|
|   10002911|R2I3W2ZQ7WDNR1|B00HKEI3EY|          5|
|   10003679|R3PPUM8GC0P4VV|B00A1AK6EE|          4|
|   10003897| RU1J6AWUQDH8A|B000SR120M|          5|
|   10004084|R355XMTXY1GOQF|B001SH2AVQ|          5|
|   10004974|R236IN0TMKW8F9|B007PTCFFW|          1|
|   10006218|R3OJNQVUXB39PW|B00A1EDRR8|          5|
|   10006371|R1M16Q84UAFF6L|B00WBCBPDQ|          4|
|   10006579|R1VUBMDCR5R9ZS|B000GP844S|          5|
|   10007900|R1Q864KE8JGKUZ|B002VJJMSO|          5|
|   10008642| RKIHY06FGFBFU|B007C0Y8PY|          5|
|   10008821|R3RBBU21WYULB1|B0002K6RK0|          5|
|   10010828|R3BQA6G9M6GNYV|B000GHXTBO|          5|
|   10011267|R14OH1P4LDYMF0|B00HAHFQL4|          5|
|    1001155

# Salvando os dataframes no Hive

In [27]:
# Persist both frames as tables in the "sales" database, overwriting any
# existing table of the same name.
reviewsTable.write.saveAsTable("sales.user_reviews", mode="overwrite")
productTable.write.saveAsTable("sales.products", mode="overwrite")