In [16]:
from pyspark import SparkConf
from pyspark.context import SparkContext

# getOrCreate() takes a SparkConf, not a master URL string, so the local master
# is set through a configuration object. If a SparkContext is already active it
# is returned as-is and this configuration is not applied.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

In [113]:
from pyspark.sql import SQLContext

# SQL entry point built on top of the SparkContext created above; used below
# to read the crawler JSON files into a DataFrame.
sqlContext = SQLContext(sparkContext=sc)

In [114]:
# Load every crawler result file matching the glob. The "multiline" option lets
# a single JSON document span several physical lines.
lines = (
    sqlContext.read
    .option("multiline", "true")
    .json('hdfs://localhost:8020/user/danielqueiroz/input/resultado_crawler_*.json')
)

In [115]:
# Print the schema Spark inferred from the raw JSON files.
lines.printSchema()

root
 |-- foo: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- page_category: string (nullable = true)
 |    |    |-- page_category_text: string (nullable = true)
 |    |    |-- page_url: string (nullable = true)
 |    |    |-- question_answers: struct (nullable = true)
 |    |    |    |-- question_answer_0: string (nullable = true)
 |    |    |    |-- question_answer_10: string (nullable = true)
 |    |    |    |-- question_answer_11: string (nullable = true)
 |    |    |    |-- question_answer_12: string (nullable = true)
 |    |    |    |-- question_answer_13: string (nullable = true)
 |    |    |    |-- question_answer_14: string (nullable = true)
 |    |    |    |-- question_answer_15: string (nullable = true)
 |    |    |    |-- question_answer_16: string (nullable = true)
 |    |    |    |-- question_answer_17: string (nullable = true)
 |    |    |    |-- question_answer_18: string (nullable = true)
 |    |    |    |-- question_answer_19: s

In [116]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Produce one row per element of the top-level `foo` array.
# `explode` and `col` are accessed through the `F` namespace: neither import
# above brings the bare names into scope, so using them unqualified fails on a
# fresh kernel.
lines_transf_001 = lines.select(F.explode("foo").alias("foo"))

# NOTE(review): the three withColumn calls below add top-level columns whose
# names contain a literal dot. The select that follows still applies getItem(0),
# so it presumably reads the original array fields inside the `foo` struct and
# not these exploded columns. If so, the explodes only affect which rows are
# produced -- confirm whether they are still needed.
lines_transf_002 = (
    lines_transf_001
    .withColumn("foo.question_datetime", F.explode("foo.question_datetime"))
    .withColumn("foo.question_title", F.explode("foo.question_title"))
    .withColumn("foo.question_comments_count", F.explode("foo.question_comments_count"))
    .select(
        F.col("foo.page_category").alias("page_category"),
        F.col("foo.page_category_text").alias("page_category_text"),
        # First element of each array-valued field.
        F.col("foo.question_datetime").getItem(0).alias("question_datetime"),
        F.col("foo.question_title").getItem(0).alias("question_title"),
        # Cast first, then alias, so the output column name is set explicitly
        # rather than derived from the expression under the cast.
        F.col("foo.question_views_count").cast(IntegerType()).alias("question_views_count"),
        F.col("foo.question_comments_count").getItem(0).cast(IntegerType()).alias("question_comments_count"),
        F.col("foo.question_text").alias("question_text"),
        # Expand every field of the question_answers struct into its own column.
        F.col("foo.question_answers.*"),
    )
)

In [117]:
# Print the flattened schema produced by the select above.
lines_transf_002.printSchema()

root
 |-- page_category: string (nullable = true)
 |-- page_category_text: string (nullable = true)
 |-- question_datetime: timestamp (nullable = true)
 |-- question_title: string (nullable = true)
 |-- question_views_count: integer (nullable = true)
 |-- question_comments_count: integer (nullable = true)
 |-- question_text: string (nullable = true)
 |-- question_answer_0: string (nullable = true)
 |-- question_answer_10: string (nullable = true)
 |-- question_answer_11: string (nullable = true)
 |-- question_answer_12: string (nullable = true)
 |-- question_answer_13: string (nullable = true)
 |-- question_answer_14: string (nullable = true)
 |-- question_answer_15: string (nullable = true)
 |-- question_answer_16: string (nullable = true)
 |-- question_answer_17: string (nullable = true)
 |-- question_answer_18: string (nullable = true)
 |-- question_answer_19: string (nullable = true)
 |-- question_answer_2: string (nullable = true)
 |-- question_answer_20: string (nullable = true)


In [121]:
# Check the number of partitions of the underlying RDD

lines_transf_002.rdd.getNumPartitions()


1

In [122]:
# Add the [partition_key] column used to partition the data by year and month:
# question_datetime is formatted with the 'yyyyMM' pattern and cast to an integer.
# `col` is reached through `F` because the bare name is not imported in this notebook.

lines_transf_003 = lines_transf_002.withColumn(
    "partition_key",
    F.date_format(F.col("question_datetime"), 'yyyyMM').cast(IntegerType()),
)



In [123]:
# Print the schema of the DataFrame built in the previous cell.
lines_transf_003.printSchema()

root
 |-- page_category: string (nullable = true)
 |-- page_category_text: string (nullable = true)
 |-- question_datetime: timestamp (nullable = true)
 |-- question_title: string (nullable = true)
 |-- question_views_count: integer (nullable = true)
 |-- question_comments_count: integer (nullable = true)
 |-- question_text: string (nullable = true)
 |-- question_answer_0: string (nullable = true)
 |-- question_answer_10: string (nullable = true)
 |-- question_answer_11: string (nullable = true)
 |-- question_answer_12: string (nullable = true)
 |-- question_answer_13: string (nullable = true)
 |-- question_answer_14: string (nullable = true)
 |-- question_answer_15: string (nullable = true)
 |-- question_answer_16: string (nullable = true)
 |-- question_answer_17: string (nullable = true)
 |-- question_answer_18: string (nullable = true)
 |-- question_answer_19: string (nullable = true)
 |-- question_answer_2: string (nullable = true)
 |-- question_answer_20: string (nullable = true)


In [124]:
# Persist the result as Parquet, one directory per partition_key value.
# Existing output at the target path is overwritten, and each output file is
# capped at 100 records via the maxRecordsPerFile option.
(
    lines_transf_003.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy('partition_key')
    .option("maxRecordsPerFile", 100)
    .save('hdfs://localhost:8020/user/danielqueiroz/output/resultado_crawler')
)