In [53]:
from pyspark.sql import SparkSession
spark = SparkSession\
    .builder\
    .appName("ETL Scrapy")\
    .config('spark.sql.repl.eagerEval.enabled', "true")\
    .config('spark.sql.execution.arrow.enabled', "true")\
    .config('spark.sql.execution.arrow.fallback.enabled', "true")\
    .getOrCreate()

In [54]:
from pyspark.context import SparkContext
sc = SparkContext.getOrCreate("local[*]")

In [55]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [56]:
lines = sqlContext.read\
    .option("multiline", "true")\
    .json('hdfs://localhost:8020/user/danielqueiroz/input/resultado_crawler_*.json')

In [57]:
lines.printSchema()

root
 |-- foo: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- page_category: string (nullable = true)
 |    |    |-- page_category_text: string (nullable = true)
 |    |    |-- page_url: string (nullable = true)
 |    |    |-- question_answers: struct (nullable = true)
 |    |    |    |-- question_answer_0: string (nullable = true)
 |    |    |    |-- question_answer_10: string (nullable = true)
 |    |    |    |-- question_answer_11: string (nullable = true)
 |    |    |    |-- question_answer_12: string (nullable = true)
 |    |    |    |-- question_answer_13: string (nullable = true)
 |    |    |    |-- question_answer_14: string (nullable = true)
 |    |    |    |-- question_answer_15: string (nullable = true)
 |    |    |    |-- question_answer_16: string (nullable = true)
 |    |    |    |-- question_answer_17: string (nullable = true)
 |    |    |    |-- question_answer_18: string (nullable = true)
 |    |    |    |-- question_answer_19: s

In [63]:
# converte o json com colunas do tipo array, em um flatten json
# renomeia as colunas
# conversão de tipos

from pyspark.sql.types import *
from pyspark.sql import functions as F

lines_transf_001 = lines.select(F.explode("foo").alias("foo"))

lines_transf_002 = lines_transf_001.withColumn("foo.question_datetime", F.explode("foo.question_datetime"))\
    .withColumn("foo.question_title", F.explode("foo.question_title"))\
    .withColumn("foo.question_comments_count", F.explode("foo.question_comments_count"))\
    .select(
        F.col("foo.page_category").alias("page_category"), 
        F.col("foo.page_category_text").alias("page_category_text"), 
        F.col("foo.question_datetime").getItem(0).alias("question_datetime"),
        F.col("foo.question_title").getItem(0).alias("question_title"),
        F.col("foo.question_views_count").alias("question_views_count").cast(IntegerType()),
        F.col("foo.question_comments_count").getItem(0).alias("question_comments_count").cast(IntegerType()),
        F.col("foo.question_text").alias("question_text"),
        F.col("foo.question_answers.*")
    )

In [64]:
lines_transf_002.printSchema()

root
 |-- page_category: string (nullable = true)
 |-- page_category_text: string (nullable = true)
 |-- question_datetime: timestamp (nullable = true)
 |-- question_title: string (nullable = true)
 |-- question_views_count: integer (nullable = true)
 |-- question_comments_count: integer (nullable = true)
 |-- question_text: string (nullable = true)
 |-- question_answer_0: string (nullable = true)
 |-- question_answer_10: string (nullable = true)
 |-- question_answer_11: string (nullable = true)
 |-- question_answer_12: string (nullable = true)
 |-- question_answer_13: string (nullable = true)
 |-- question_answer_14: string (nullable = true)
 |-- question_answer_15: string (nullable = true)
 |-- question_answer_16: string (nullable = true)
 |-- question_answer_17: string (nullable = true)
 |-- question_answer_18: string (nullable = true)
 |-- question_answer_19: string (nullable = true)
 |-- question_answer_2: string (nullable = true)
 |-- question_answer_20: string (nullable = true)


In [65]:
# função com o objetivo de verificar se existem valores null nas principais colunas

def check_null_values(dataframe):
    return dataframe.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) \
                                    for c in lines_transf_002.select(
                                       'page_category',
                                       'page_category_text',
                                       'question_datetime',
                                       'question_title',
                                       'question_text',
                                       'question_views_count',
                                       'question_comments_count').columns]).toPandas().T

In [66]:
# existem valores null?

check_null_values(lines_transf_002)

Unnamed: 0,0
page_category,0
page_category_text,7
question_datetime,0
question_title,0
question_text,0
question_views_count,99
question_comments_count,0


In [67]:
# substitui os valores null das colunas identificadas no passo anterior

lines_transf_003 = lines_transf_002.fillna({'question_views_count': 0, 'page_category_text': 'Não Informado'})

In [68]:
# existem valores null?

check_null_values(lines_transf_003)

Unnamed: 0,0
page_category,0
page_category_text,0
question_datetime,0
question_title,0
question_text,0
question_views_count,0
question_comments_count,0


In [121]:
# verifica o numero de partições

lines_transf_003.rdd.getNumPartitions()


1

In [122]:
# inclusão da coluna [partition_key] que será utilizada para particionar os dados por ano e mês (YYYYmm)

lines_transf_004 = lines_transf_002.withColumn("partition_key", 
                                               F.date_format(col("question_datetime"), 'yyyyMM').cast(IntegerType()))



In [123]:
lines_transf_004.printSchema()

root
 |-- page_category: string (nullable = true)
 |-- page_category_text: string (nullable = true)
 |-- question_datetime: timestamp (nullable = true)
 |-- question_title: string (nullable = true)
 |-- question_views_count: integer (nullable = true)
 |-- question_comments_count: integer (nullable = true)
 |-- question_text: string (nullable = true)
 |-- question_answer_0: string (nullable = true)
 |-- question_answer_10: string (nullable = true)
 |-- question_answer_11: string (nullable = true)
 |-- question_answer_12: string (nullable = true)
 |-- question_answer_13: string (nullable = true)
 |-- question_answer_14: string (nullable = true)
 |-- question_answer_15: string (nullable = true)
 |-- question_answer_16: string (nullable = true)
 |-- question_answer_17: string (nullable = true)
 |-- question_answer_18: string (nullable = true)
 |-- question_answer_19: string (nullable = true)
 |-- question_answer_2: string (nullable = true)
 |-- question_answer_20: string (nullable = true)


In [124]:
lines_transf_004.write\
    .mode("overwrite")\
    .format("parquet")\
    .partitionBy('partition_key')\
    .option("maxRecordsPerFile", 100)\
    .save('hdfs://localhost:8020/user/danielqueiroz/output/resultado_crawler')