In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, col, lit, when


# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("jsonUp")
    .getOrCreate()
)

# Load the raw records; no schema is passed, so Spark infers it from the file.
df = spark.read.json('json0.json')

# Preview the first five rows to sanity-check the load.
df.show(5)

+-----+--------------------+-------------------+-------+-----------------+--------------------+--------------------+----------+--------------------+------------+--------------------+-----+------------+----------+
|adult|       backdrop_path|          genre_ids|     id|original_language|      original_title|            overview|popularity|         poster_path|release_date|               title|video|vote_average|vote_count|
+-----+--------------------+-------------------+-------+-----------------+--------------------+--------------------+----------+--------------------+------------+--------------------+-----+------------+----------+
|false|/8rpDcsfLJypbO6vR...|      [878, 12, 28]|  76600|               en|Avatar: The Way o...|Set more than a d...|   37.6161|/t6HIqrRAclMCA60N...|  2022-12-14|Avatar: The Way o...|false|       7.611|     12503|
|false|/8btfz81bOJ2lC7cu...|       [16, 14, 12]| 980477|               zh|      哪吒之魔童闹海|Following the Tri...|   35.3386|/5lUmWTGkEcYnXuji...|  2025-

In [2]:
# Look at a few backdrop_path values before discarding the column.
df.select(col("backdrop_path")).show(5)

# The backdrop column is not needed for the rest of the notebook.
df = df.drop("backdrop_path")

+--------------------+
|       backdrop_path|
+--------------------+
|/8rpDcsfLJypbO6vR...|
|/8btfz81bOJ2lC7cu...|
|/k6EOrckWFuz7I4z4...|
|/p5ozvmdgsmbWe0H8...|
|/cHkhb5A4gQRK6zs6...|
+--------------------+
only showing top 5 rows



In [3]:
# Print the column names, types and nullability of the current frame.
df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- genre_ids: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: boolean (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: long (nullable = true)



In [4]:
# Inspect the genre_ids arrays (one array of numeric ids per row).
df.select("genre_ids").show(5)

# Apply the same logic that was used for the CSV data.

+-------------------+
|          genre_ids|
+-------------------+
|      [878, 12, 28]|
|       [16, 14, 12]|
|      [12, 28, 878]|
|[16, 12, 35, 10751]|
|       [28, 53, 80]|
+-------------------+
only showing top 5 rows



In [5]:
# Genre ids this notebook keeps; rows containing neither id are filtered out.
idAventura = 12  # labelled "Adventure" below
idAcao = 28  # labelled "Action" below

# Note: the steps inside the try block are lazy Spark transformations, so the
# success message only means the query plan was built (e.g. the referenced
# columns exist). The data itself is processed by the show() call at the end.
try:
    # Build each membership test once. array_contains yields NULL for a NULL
    # array and when() treats a NULL condition as "not matched", so no
    # explicit isNotNull() guard on genre_ids is required.
    has_action = array_contains(col("genre_ids"), idAcao)
    has_adventure = array_contains(col("genre_ids"), idAventura)

    # Branch order matters: the combined case has to be tested first,
    # otherwise the single-genre branches would capture those rows.
    df = df.withColumn(
        "genero_new",
        when(has_action & has_adventure, lit("Action and Adventure"))
        .when(has_adventure, lit("Adventure"))
        .when(has_action, lit("Action"))
        .otherwise(lit(None)),
    )

    # Keep only rows that received a label, then replace the id array
    # with the label column under its final name.
    df = (
        df.filter(col("genero_new").isNotNull())
        .drop("genre_ids")
        .withColumnRenamed("genero_new", "genero")
    )

    print("Gêneros filtrados!")
except Exception:
    # Report the failure in the cell output, then let the original
    # exception propagate so the traceback is not lost.
    print("Falha ao filtrar gêneros!")
    raise

df.select("genero").show(5)


Gêneros filtrados!
+--------------------+
|              genero|
+--------------------+
|Action and Adventure|
|           Adventure|
|Action and Adventure|
|           Adventure|
|              Action|
+--------------------+
only showing top 5 rows



In [6]:
# Persist the filtered frame as Parquet. Append mode adds the rows to any
# existing "teste.parquet" dataset instead of replacing it.
# NOTE(review): re-running this cell writes the same rows again — confirm
# that appending (rather than overwriting) is intended.
df.write.parquet("teste.parquet", mode="append")