In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this chapter.
# NOTE: no driver memory limit is configured on the builder below.
spark = (
    SparkSession.builder
    .appName("Capítulo 5")
    .getOrCreate()
)

# Keep the session's SparkContext in the variable 'sc'
sc = spark.sparkContext

In [2]:
# Load both DataFrames (first row is the header, column types are inferred)
clientes_df = spark.read.csv(
    "arquivos/clientes.csv",
    header=True,
    inferSchema=True,
)
enderecos_df = spark.read.csv(
    "arquivos/enderecos.csv",
    header=True,
    inferSchema=True,
)

# Inner join: only customers whose address id exists in the addresses table.
# Joining on an expression keeps the "id_endereco" column from both sides.
inner_join_df = clientes_df.join(
    enderecos_df,
    on=clientes_df["id_endereco"] == enderecos_df["id_endereco"],
    how="inner",
)
inner_join_df.show()

# Left outer join: every customer, with address columns where a match exists
left_outer_join_df = clientes_df.join(
    enderecos_df,
    on=clientes_df["id_endereco"] == enderecos_df["id_endereco"],
    how="left_outer",
)
left_outer_join_df.show()

# Cross join: every customer paired with every address
cross_join_df = clientes_df.crossJoin(enderecos_df)
cross_join_df.show()


+----------+-----------+-----------+----------+-------------------+--------------------+---------------+---------------+-----------+------+--------------------+--------------------+------+-----------+--------------------+------+------------+-------------------+----------------+------+--------------------+---------+
|id_cliente|       nome|id_endereco| sobrenome|    data_nascimento|               email|       telefone|        celular|        cpf|genero|        data_criacao|    data_atualizacao|status|id_endereco|                 rua|numero| complemento|             bairro|          cidade|estado|                pais|      cep|
+----------+-----------+-----------+----------+-------------------+--------------------+---------------+---------------+-----------+------+--------------------+--------------------+------+-----------+--------------------+------+------------+-------------------+----------------+------+--------------------+---------+
|         1|     Esther|        163|  Teixeira|20

In [3]:
# Join on the shared "id_endereco" key, then keep a few columns from each side
join_df = (
    clientes_df
    .join(enderecos_df, on="id_endereco")
    .select(
        clientes_df["id_cliente"],
        clientes_df["nome"],
        clientes_df["sobrenome"],
        enderecos_df["rua"],
        enderecos_df["cidade"],
    )
)
join_df.show()

+----------+-----------+----------+--------------------+----------------+
|id_cliente|       nome| sobrenome|                 rua|          cidade|
+----------+-----------+----------+--------------------+----------------+
|         1|     Esther|  Teixeira|Trecho Fernanda d...|         Ribeiro|
|         2|    Mariane|   Barbosa|Sítio Stephany Ar...|         Correia|
|         3|Ana Vitória|     Moura|Passarela Anthony...|  da Mota Alegre|
|         4|    Leandro|     Viana|      Favela Barbosa|   Duarte Grande|
|         5|     Pietro|    Santos|   Recanto Fernandes|         Costela|
|         6|    Clarice|    da Paz|      Sítio de Gomes|da Mata de Silva|
|         7|Ana Vitória|  da Cunha|   Recanto Fernandes|         Costela|
|         8| Ana Sophia|   da Mota|    Núcleo de Farias|        Silveira|
|         9|   Leonardo|    Fogaça|Vale Marcela Moreira|   Farias Alegre|
|        10|    Anthony|   Cardoso| Estrada de Teixeira|           Costa|
|        11|    Natália|    Mendes|   

In [7]:
# Load both DataFrames (first row is the header, column types are inferred)
clientes_df = spark.read.csv(
    "arquivos/clientes.csv",
    header=True,
    inferSchema=True,
)
enderecos_df = spark.read.csv(
    "arquivos/enderecos.csv",
    header=True,
    inferSchema=True,
)

# Inner join on an explicit equality condition (all columns of both tables)
inner_join_df = clientes_df.join(
    enderecos_df,
    on=clientes_df["id_endereco"] == enderecos_df["id_endereco"],
)
inner_join_df.show()

# Inner join on the column name, keeping only a handful of columns
inner_join_df = (
    clientes_df
    .join(enderecos_df, on="id_endereco")
    .select("id_cliente", "nome", "sobrenome", "rua", "cidade", "estado")
)
inner_join_df.show()

# Inner join carrying every column of both tables ...
inner_join_df = clientes_df.join(
    enderecos_df,
    on=clientes_df["id_endereco"] == enderecos_df["id_endereco"],
)
# ... and only afterwards pick the columns to display
(
    inner_join_df
    .select("id_cliente", "nome", "sobrenome", "rua", "cidade")
    .show()
)


+----------+-----------+-----------+----------+-------------------+--------------------+---------------+---------------+-----------+------+--------------------+--------------------+------+-----------+--------------------+------+------------+-------------------+----------------+------+--------------------+---------+
|id_cliente|       nome|id_endereco| sobrenome|    data_nascimento|               email|       telefone|        celular|        cpf|genero|        data_criacao|    data_atualizacao|status|id_endereco|                 rua|numero| complemento|             bairro|          cidade|estado|                pais|      cep|
+----------+-----------+-----------+----------+-------------------+--------------------+---------------+---------------+-----------+------+--------------------+--------------------+------+-----------+--------------------+------+------------+-------------------+----------------+------+--------------------+---------+
|         1|     Esther|        163|  Teixeira|20

In [19]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
from datetime import datetime

# Plain Python function used as a UDF to compute a customer's age
def calculate_age(birthdate):
    """Return the age in completed years for a birth date given as text.

    Args:
        birthdate: String starting with a ``YYYY-MM-DD`` date; anything after
            the first space (e.g. a time-of-day part) is ignored. ``None`` or
            a blank string means the birth date is missing.

    Returns:
        The age as an ``int``, measured against the current local date, or
        ``None`` when the birth date is missing.

    Raises:
        ValueError: if the date part is not in ``YYYY-MM-DD`` format.
    """
    # Spark hands null column values to a UDF as None; return None for
    # missing input instead of failing on `.split`.
    if birthdate is None or not birthdate.strip():
        return None

    today = datetime.now()
    born = datetime.strptime(birthdate.split(" ")[0], "%Y-%m-%d")

    # Subtract one year when this year's birthday has not happened yet.
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - (1 if birthday_still_ahead else 0)

# Wrap the Python function as a Spark UDF that returns an integer column
age_udf = udf(calculate_age, IntegerType())

# Add an "idade" (age) column, passing the birth date to the UDF as text
clientes_df = clientes_df.withColumn(
    "idade",
    age_udf(col("data_nascimento").cast("string")),
)

# Aggregation: average age per gender
(
    clientes_df
    .groupBy("genero")
    .agg(F.avg("idade").alias("idade_media"))
    .show()
)

+------+-----------------+
|genero|      idade_media|
+------+-----------------+
|     F|54.11494252873563|
|     M|54.23893805309734|
+------+-----------------+



## Exemplos de uso de funções

In [25]:
# Load the Netflix engagement CSV (header row, inferred column types)
netflix_df = spark.read.csv(
    "arquivos/netflix/Netflix Engagement.csv",
    header=True,
    inferSchema=True,
)

In [27]:
# Show the "Title" column without truncating long values
(
    netflix_df
    .select(F.col("Title"))
    .show(truncate=False)
)

+-----------------------------------------------------------------------+
|Title                                                                  |
+-----------------------------------------------------------------------+
|The Night Agent: Season 1                                              |
|Ginny & Georgia: Season 2                                              |
|The Glory: Season 1 // 더 글로리: 시즌 1                               |
|Wednesday: Season 1                                                    |
|Queen Charlotte: A Bridgerton Story                                    |
|You: Season 4                                                          |
|La Reina del Sur: Season 3                                             |
|Outer Banks: Season 3                                                  |
|Ginny & Georgia: Season 1                                              |
|FUBAR: Season 1                                                        |
|Manifest: Season 4                         

In [None]:
# Illustrative snippet: `df` and "nome_da_coluna" are placeholders.

# Selecting a column through F.col()
df.select(F.col("nome_da_coluna"))

# Selecting the same column by passing its name directly as a string
df.select("nome_da_coluna")

In [29]:
# Add a constant "Source" column with F.lit and show it next to the title
(
    netflix_df
    .withColumn("Source", F.lit("Netflix"))
    .select("Title", "Source")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+-------+
|Title                                                                  |Source |
+-----------------------------------------------------------------------+-------+
|The Night Agent: Season 1                                              |Netflix|
|Ginny & Georgia: Season 2                                              |Netflix|
|The Glory: Season 1 // 더 글로리: 시즌 1                               |Netflix|
|Wednesday: Season 1                                                    |Netflix|
|Queen Charlotte: A Bridgerton Story                                    |Netflix|
|You: Season 4                                                          |Netflix|
|La Reina del Sur: Season 3                                             |Netflix|
|Outer Banks: Season 3                                                  |Netflix|
|Ginny & Georgia: Season 1                                              |Netflix|
|FUBAR: Season 1      

In [30]:
# Average of the "Rating" column over the whole DataFrame
(
    netflix_df
    .select(F.avg("Rating").alias("Average Rating"))
    .show()
)

+------------------+
|    Average Rating|
+------------------+
|6.5090563915060775|
+------------------+



In [31]:
# Sum of the "Hours Viewed" column over the whole DataFrame
(
    netflix_df
    .select(F.sum("Hours Viewed").alias("Total Hours Viewed"))
    .show()
)

+------------------+
|Total Hours Viewed|
+------------------+
|       93863600000|
+------------------+



In [32]:
# Highest and lowest values found in the "Rating" column
(
    netflix_df
    .select(
        F.max("Rating").alias("Max Rating"),
        F.min("Rating").alias("Min Rating"),
    )
    .show()
)

+----------+----------+
|Max Rating|Min Rating|
+----------+----------+
|      10.0|       1.2|
+----------+----------+



In [33]:
# Count of rows with a non-null "Title"
(
    netflix_df
    .select(F.count("Title").alias("Total Titles"))
    .show()
)

+------------+
|Total Titles|
+------------+
|       18334|
+------------+



In [38]:
# Label each title "Yes" when its rating is 8 or more, "No" otherwise.
# Rows with a null rating do not satisfy the condition and get "No".
(
    netflix_df
    .withColumn(
        "Highly Rated",
        F.when(F.col("Rating") >= 8, "Yes").otherwise("No"),
    )
    .select("Title", "Rating", "Highly Rated")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+------+------------+
|Title                                                                  |Rating|Highly Rated|
+-----------------------------------------------------------------------+------+------------+
|The Night Agent: Season 1                                              |6.0   |No          |
|Ginny & Georgia: Season 2                                              |5.7   |No          |
|The Glory: Season 1 // 더 글로리: 시즌 1                               |8.4   |Yes         |
|Wednesday: Season 1                                                    |null  |No          |
|Queen Charlotte: A Bridgerton Story                                    |7.4   |No          |
|You: Season 4                                                          |6.6   |No          |
|La Reina del Sur: Season 3                                             |7.9   |No          |
|Outer Banks: Season 3                                            

In [41]:
# Join title and genre into one text column, separated by " - "
(
    netflix_df
    .withColumn("Title and Genre", F.concat_ws(" - ", "Title", "Genre"))
    .select("Title and Genre")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------+
|Title and Genre                                                                                         |
+--------------------------------------------------------------------------------------------------------+
|The Night Agent: Season 1 - ['Biography', 'Drama', 'History']                                           |
|Ginny & Georgia: Season 2 - ['Comedy', 'Drama', 'Romance']                                              |
|The Glory: Season 1 // 더 글로리: 시즌 1 - ['Short']                                                    |
|Wednesday: Season 1 - ['Talk-Show']                                                                     |
|Queen Charlotte: A Bridgerton Story - ['Drama', 'History', 'Romance']                                   |
|You: Season 4 - ['Comedy', 'Romance']                                                                   |
|La Reina del Sur: Season 3 - ['Action', 'C

In [43]:
# Render the release date as day/month/year text
(
    netflix_df
    .withColumn(
        "Formatted Release Date",
        F.date_format("Release Date", "dd/MM/yyyy"),
    )
    .select("Release Date", "Formatted Release Date")
    .show()
)

+-------------------+----------------------+
|       Release Date|Formatted Release Date|
+-------------------+----------------------+
|2023-03-23 00:00:00|            23/03/2023|
|2023-01-05 00:00:00|            05/01/2023|
|2022-12-30 00:00:00|            30/12/2022|
|2022-11-23 00:00:00|            23/11/2022|
|2023-05-04 00:00:00|            04/05/2023|
|2023-02-09 00:00:00|            09/02/2023|
|2022-12-30 00:00:00|            30/12/2022|
|2023-02-23 00:00:00|            23/02/2023|
|2021-02-24 00:00:00|            24/02/2021|
|2023-05-25 00:00:00|            25/05/2023|
|2022-11-04 00:00:00|            04/11/2022|
|2023-01-01 00:00:00|            01/01/2023|
|2022-12-02 00:00:00|            02/12/2022|
|2023-05-12 00:00:00|            12/05/2023|
|2023-01-24 00:00:00|            24/01/2023|
|2023-01-14 00:00:00|            14/01/2023|
|2023-03-24 00:00:00|            24/03/2023|
|2023-04-06 00:00:00|            06/04/2023|
|2023-04-20 00:00:00|            20/04/2023|
|2023-03-1

In [44]:
# Convert the release date to a timestamp column using the given pattern
(
    netflix_df
    .withColumn(
        "Release Timestamp",
        F.to_timestamp("Release Date", "yyyy-MM-dd"),
    )
    .select("Release Date", "Release Timestamp")
    .show()
)

+-------------------+-------------------+
|       Release Date|  Release Timestamp|
+-------------------+-------------------+
|2023-03-23 00:00:00|2023-03-23 00:00:00|
|2023-01-05 00:00:00|2023-01-05 00:00:00|
|2022-12-30 00:00:00|2022-12-30 00:00:00|
|2022-11-23 00:00:00|2022-11-23 00:00:00|
|2023-05-04 00:00:00|2023-05-04 00:00:00|
|2023-02-09 00:00:00|2023-02-09 00:00:00|
|2022-12-30 00:00:00|2022-12-30 00:00:00|
|2023-02-23 00:00:00|2023-02-23 00:00:00|
|2021-02-24 00:00:00|2021-02-24 00:00:00|
|2023-05-25 00:00:00|2023-05-25 00:00:00|
|2022-11-04 00:00:00|2022-11-04 00:00:00|
|2023-01-01 00:00:00|2023-01-01 00:00:00|
|2022-12-02 00:00:00|2022-12-02 00:00:00|
|2023-05-12 00:00:00|2023-05-12 00:00:00|
|2023-01-24 00:00:00|2023-01-24 00:00:00|
|2023-01-14 00:00:00|2023-01-14 00:00:00|
|2023-03-24 00:00:00|2023-03-24 00:00:00|
|2023-04-06 00:00:00|2023-04-06 00:00:00|
|2023-04-20 00:00:00|2023-04-20 00:00:00|
|2023-03-10 00:00:00|2023-03-10 00:00:00|
+-------------------+-------------

In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
from datetime import datetime

# Get (or reuse) the SparkSession
spark = (
    SparkSession.builder
    .appName("Netflix Analysis")
    .getOrCreate()
)

# Load the Netflix engagement data (header row, inferred column types)
netflix_df = spark.read.csv(
    "arquivos/netflix/Netflix Engagement.csv",
    header=True,
    inferSchema=True,
)

# Function that computes how many years have passed since the release date
def years_since(release_date):
    """Return the difference between the current year and the release year.

    Only the calendar years are compared; month and day are ignored.

    Args:
        release_date: ``None``, a ``datetime.date`` / ``datetime.datetime``
            object, or a string starting with a ``YYYY-MM-DD`` date (anything
            after the first space, such as a time-of-day part, is ignored).

    Returns:
        ``datetime.now().year`` minus the release year as an ``int``, or
        ``None`` when ``release_date`` is ``None``.

    Raises:
        ValueError: if a string value does not start with a ``YYYY-MM-DD`` date.
    """
    # Function-scope import so the cell's import list does not need to change.
    from datetime import date

    if release_date is None:
        # Null column values arrive as None
        return None

    if isinstance(release_date, date):
        # datetime is a subclass of date, so this covers both timestamp and
        # date columns (the notebook later converts "Release Date" to a date).
        release_year = release_date.year
    else:
        # Text input: keep only the date part before parsing
        date_part = release_date.split(" ")[0]
        release_year = datetime.strptime(date_part, "%Y-%m-%d").year

    return datetime.now().year - release_year

# Wrap the Python function as a Spark UDF that returns an integer column
years_since_udf = udf(years_since, IntegerType())

# Apply the UDF to the release date and show the result next to the title
netflix_df = netflix_df.withColumn(
    "Years Since Release",
    years_since_udf(col("Release Date")),
)
(
    netflix_df
    .select("Title", "Release Date", "Years Since Release")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+-------------------+-------------------+
|Title                                                                  |Release Date       |Years Since Release|
+-----------------------------------------------------------------------+-------------------+-------------------+
|The Night Agent: Season 1                                              |2023-03-23 00:00:00|0                  |
|Ginny & Georgia: Season 2                                              |2023-01-05 00:00:00|0                  |
|The Glory: Season 1 // 더 글로리: 시즌 1                               |2022-12-30 00:00:00|1                  |
|Wednesday: Season 1                                                    |2022-11-23 00:00:00|1                  |
|Queen Charlotte: A Bridgerton Story                                    |2023-05-04 00:00:00|0                  |
|You: Season 4                                                          |2023-02-09 00:00:00|0

In [54]:
from pyspark.sql.functions import from_json, explode, col
from pyspark.sql.types import ArrayType, StringType

# Show the original "Genre" text next to each title
netflix_df.select("Title", "Genre").show(truncate=False)

# "Genre" is text shaped like a Python list (e.g. "['Drama', 'History']");
# parse it into an array-of-strings column
json_schema = ArrayType(StringType())
netflix_df = netflix_df.withColumn(
    "Genre Array",
    from_json("Genre", json_schema),
)

# Explode the array so that each element becomes its own row
(
    netflix_df
    .withColumn("Single Genre", explode("Genre Array"))
    .select("Title", "Single Genre")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+---------------------------------+
|Title                                                                  |Genre                            |
+-----------------------------------------------------------------------+---------------------------------+
|The Night Agent: Season 1                                              |['Biography', 'Drama', 'History']|
|Ginny & Georgia: Season 2                                              |['Comedy', 'Drama', 'Romance']   |
|The Glory: Season 1 // 더 글로리: 시즌 1                               |['Short']                        |
|Wednesday: Season 1                                                    |['Talk-Show']                    |
|Queen Charlotte: A Bridgerton Story                                    |['Drama', 'History', 'Romance']  |
|You: Season 4                                                          |['Comedy', 'Romance']            |
|La Reina del Sur: Season 3       

In [61]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode

# Get (or reuse) a SparkSession
spark = (
    SparkSession.builder
    .appName("SplitExample")
    .getOrCreate()
)

# One-row example DataFrame with a single "frase" column
data = [("eu viajo, muito, pelo, mundo",)]
df = spark.createDataFrame(data, schema=["frase"])

df.show(truncate=False)

# Split the sentence into an array, using ", " as the separator
df = df.withColumn(
    "palavras",
    split(df["frase"], ", "),
)

df.show(truncate=False)

# Explode the array so that each piece lands on its own row
df = df.withColumn(
    "palavra",
    explode("palavras"),
)

# Show the exploded pieces
df.select("palavra").show(truncate=False)

+----------------------------+
|frase                       |
+----------------------------+
|eu viajo, muito, pelo, mundo|
+----------------------------+

+----------------------------+------------------------------+
|frase                       |palavras                      |
+----------------------------+------------------------------+
|eu viajo, muito, pelo, mundo|[eu viajo, muito, pelo, mundo]|
+----------------------------+------------------------------+

+--------+
|palavra |
+--------+
|eu viajo|
|muito   |
|pelo    |
|mundo   |
+--------+



In [64]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get (or reuse) the SparkSession
spark = (
    SparkSession.builder
    .appName("Netflix Analysis")
    .getOrCreate()
)

# Load the Netflix engagement data (header row, inferred column types)
netflix_df = spark.read.csv(
    "arquivos/netflix/Netflix Engagement.csv",
    header=True,
    inferSchema=True,
)

# Bucket rows into fixed 30-day windows of "Release Date"
# (30-day buckets, not calendar months)
window_spec = F.window(timeColumn="Release Date", windowDuration="30 days")

# Total hours viewed inside each window
monthly_views_df = (
    netflix_df
    .groupBy(window_spec)
    .agg(F.sum("Hours Viewed").alias("Total Hours Viewed"))
)

# Sort the windows chronologically
monthly_views_ordered_df = monthly_views_df.orderBy("window")

# Show the ordered totals
(
    monthly_views_ordered_df
    .select("window", "Total Hours Viewed")
    .show(truncate=False)
)


+------------------------------------------+------------------+
|window                                    |Total Hours Viewed|
+------------------------------------------+------------------+
|{2010-04-01 00:00:00, 2010-05-01 00:00:00}|35900000          |
|{2010-08-29 00:00:00, 2010-09-28 00:00:00}|33200000          |
|{2011-02-25 00:00:00, 2011-03-27 00:00:00}|12300000          |
|{2011-08-24 00:00:00, 2011-09-23 00:00:00}|94700000          |
|{2012-01-21 00:00:00, 2012-02-20 00:00:00}|3500000           |
|{2013-01-15 00:00:00, 2013-02-14 00:00:00}|20000000          |
|{2013-02-14 00:00:00, 2013-03-16 00:00:00}|20100000          |
|{2013-05-15 00:00:00, 2013-06-14 00:00:00}|16200000          |
|{2013-06-14 00:00:00, 2013-07-14 00:00:00}|48800000          |
|{2013-07-14 00:00:00, 2013-08-13 00:00:00}|24700000          |
|{2013-08-13 00:00:00, 2013-09-12 00:00:00}|2400000           |
|{2013-09-12 00:00:00, 2013-10-12 00:00:00}|900000            |
|{2013-10-12 00:00:00, 2013-11-11 00:00:

In [None]:
# Reference signature of F.window, shown for illustration only: the argument
# names are placeholders and are not defined in the visible part of this notebook.
F.window(timeColumn, windowDuration, slideDuration=None, startTime=None)

In [None]:
# Import Spark's aggregate `sum` under an alias: a bare `sum("views")` would
# call Python's builtin sum and fail with a TypeError.
from pyspark.sql.functions import window, col, sum as spark_sum

views_df = ... # placeholder: a DataFrame with a timestamp column named 'view_time'

# Total views per 1-hour window of 'view_time'
(
    views_df
    .groupBy(window(col("view_time"), "1 hour"))
    .agg(spark_sum("views").alias("total_views"))
    .show()
)

In [66]:
# Cast "Hours Viewed" to an integer column and show it next to the title
netflix_cast = netflix_df.withColumn(
    "Hours Viewed",
    netflix_df["Hours Viewed"].cast("integer"),
)
(
    netflix_cast
    .select("Title", "Hours Viewed")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+------------+
|Title                                                                  |Hours Viewed|
+-----------------------------------------------------------------------+------------+
|The Night Agent: Season 1                                              |812100000   |
|Ginny & Georgia: Season 2                                              |665100000   |
|The Glory: Season 1 // 더 글로리: 시즌 1                               |622800000   |
|Wednesday: Season 1                                                    |507700000   |
|Queen Charlotte: A Bridgerton Story                                    |503000000   |
|You: Season 4                                                          |440600000   |
|La Reina del Sur: Season 3                                             |429600000   |
|Outer Banks: Season 3                                                  |402500000   |
|Ginny & Georgia: Season 1                       

In [72]:
from pyspark.sql.functions import concat, regexp_replace, lit

# Build "<Title> - <Genre>" with concat and a literal separator
netflix_df = netflix_df.withColumn(
    "Usando concat",
    concat(netflix_df["Title"], lit(" - "), netflix_df["Genre"]),
)
netflix_df.select("Usando concat").show(truncate=False)


# Strip square brackets and single quotes from the concatenated text
netflix_df = netflix_df.withColumn(
    "Usando regexp_replace",
    regexp_replace("Usando concat", "[\\[\\]']", ""),
)
netflix_df.select("Usando regexp_replace").show(truncate=False)

+--------------------------------------------------------------------------------------------------------+
|Usando concat                                                                                           |
+--------------------------------------------------------------------------------------------------------+
|The Night Agent: Season 1 - ['Biography', 'Drama', 'History']                                           |
|Ginny & Georgia: Season 2 - ['Comedy', 'Drama', 'Romance']                                              |
|The Glory: Season 1 // 더 글로리: 시즌 1 - ['Short']                                                    |
|Wednesday: Season 1 - ['Talk-Show']                                                                     |
|Queen Charlotte: A Bridgerton Story - ['Drama', 'History', 'Romance']                                   |
|You: Season 4 - ['Comedy', 'Romance']                                                                   |
|La Reina del Sur: Season 3 - ['Action', 'C

In [77]:
from pyspark.sql.functions import to_date, datediff, current_date, date_add

# Each step reads the column produced by the previous reassignment,
# so the three statements must stay in this order.

# 1) Replace "Release Date" with a date-typed version of itself
netflix_df = netflix_df.withColumn(
    "Release Date",
    to_date(netflix_df["Release Date"], "yyyy-MM-dd"),
)
# 2) Days elapsed between the release date and today
netflix_df = netflix_df.withColumn(
    "Days Since Release",
    datediff(current_date(), netflix_df["Release Date"]),
)
# 3) The date 30 days after the release
netflix_df = netflix_df.withColumn(
    "30 Days Later",
    date_add(netflix_df["Release Date"], 30),
)
(
    netflix_df
    .select("Release Date", "Days Since Release", "30 Days Later")
    .show(truncate=False)
)

+------------+------------------+-------------+
|Release Date|Days Since Release|30 Days Later|
+------------+------------------+-------------+
|2023-03-23  |276               |2023-04-22   |
|2023-01-05  |353               |2023-02-04   |
|2022-12-30  |359               |2023-01-29   |
|2022-11-23  |396               |2022-12-23   |
|2023-05-04  |234               |2023-06-03   |
|2023-02-09  |318               |2023-03-11   |
|2022-12-30  |359               |2023-01-29   |
|2023-02-23  |304               |2023-03-25   |
|2021-02-24  |1033              |2021-03-26   |
|2023-05-25  |213               |2023-06-24   |
|2022-11-04  |415               |2022-12-04   |
|2023-01-01  |357               |2023-01-31   |
|2022-12-02  |387               |2023-01-01   |
|2023-05-12  |226               |2023-06-11   |
|2023-01-24  |334               |2023-02-23   |
|2023-01-14  |344               |2023-02-13   |
|2023-03-24  |275               |2023-04-23   |
|2023-04-06  |262               |2023-05

In [79]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or reuse) the SparkSession
spark = (
    SparkSession.builder
    .appName("Netflix Analysis")
    .getOrCreate()
)

# Load the Netflix engagement data (header row, inferred column types)
netflix_df = spark.read.csv(
    "arquivos/netflix/Netflix Engagement.csv",
    header=True,
    inferSchema=True,
)

# "Viewership Score" = hours viewed multiplied by rating; sort highest first
netflix_df = netflix_df.withColumn(
    "Viewership Score",
    col("Hours Viewed") * col("Rating"),
)
ordered_df = netflix_df.orderBy(col("Viewership Score").desc())

# Show only the columns that explain the score
(
    ordered_df
    .select("Title", "Hours Viewed", "Rating", "Viewership Score")
    .show(truncate=False)
)

+-----------------------------------------------------------------------+------------+------+--------------------+
|Title                                                                  |Hours Viewed|Rating|Viewership Score    |
+-----------------------------------------------------------------------+------------+------+--------------------+
|The Glory: Season 1 // 더 글로리: 시즌 1                               |622800000   |8.4   |5.23152E9           |
|The Night Agent: Season 1                                              |812100000   |6.0   |4.8726E9            |
|Ginny & Georgia: Season 2                                              |665100000   |5.7   |3.79107E9           |
|Queen Charlotte: A Bridgerton Story                                    |503000000   |7.4   |3.7222E9            |
|La Reina del Sur: Season 3                                             |429600000   |7.9   |3.39384E9           |
|You: Season 4                                                          |440600000   |

In [23]:
from pyspark.sql import functions as F

# Collect the name of every callable attribute of 'pyspark.sql.functions'
function_list = list(
    filter(lambda attr_name: callable(getattr(F, attr_name)), dir(F))
)

print(function_list)

['Any', 'ArrayType', 'Callable', 'Column', 'DataFrame', 'DataType', 'Dict', 'Iterable', 'List', 'Optional', 'PandasUDFType', 'PythonEvalType', 'SparkContext', 'StringType', 'StructType', 'Tuple', 'Union', 'UserDefinedFunction', 'ValuesView', '_create_column_from_literal', '_create_lambda', '_create_udf', '_get_jvm_function', '_get_lambda_parameters', '_invoke_binary_math_function', '_invoke_function', '_invoke_function_over_columns', '_invoke_function_over_seq_of_columns', '_invoke_higher_order_function', '_options_to_str', '_test', '_to_java_column', '_to_seq', '_unresolved_named_lambda_variable', 'abs', 'acos', 'acosh', 'add_months', 'aggregate', 'approxCountDistinct', 'approx_count_distinct', 'array', 'array_contains', 'array_distinct', 'array_except', 'array_intersect', 'array_join', 'array_max', 'array_min', 'array_position', 'array_remove', 'array_repeat', 'array_sort', 'array_union', 'arrays_overlap', 'arrays_zip', 'asc', 'asc_nulls_first', 'asc_nulls_last', 'ascii', 'asin', 'as

In [14]:
# Load the customers DataFrame (header row, inferred column types)
clientes_df = spark.read.csv("arquivos/clientes.csv", header=True, inferSchema=True)

# Column transformation: flag customers who have already turned 18.
# The 18th birthday (birth date + 18 * 12 months) is compared with today's
# date so that month and day count; subtracting calendar years alone would
# flag someone as an adult from 1 January of the year they turn 18.
clientes_df = clientes_df.withColumn(
    "maior_idade",
    F.add_months(F.to_date(F.col("data_nascimento")), 18 * 12) <= F.current_date(),
)
clientes_df.select("nome", "data_nascimento", "maior_idade").show()

+-----------+-------------------+-----------+
|       nome|    data_nascimento|maior_idade|
+-----------+-------------------+-----------+
|     Esther|2000-09-29 00:00:00|       true|
|    Mariane|2005-01-01 00:00:00|       true|
|Ana Vitória|1972-05-02 00:00:00|       true|
|    Leandro|1957-10-05 00:00:00|       true|
|     Pietro|1948-11-03 00:00:00|       true|
|    Clarice|1991-11-08 00:00:00|       true|
|Ana Vitória|1998-10-06 00:00:00|       true|
| Ana Sophia|1965-12-18 00:00:00|       true|
|   Leonardo|1983-10-21 00:00:00|       true|
|    Anthony|1946-03-09 00:00:00|       true|
|    Natália|1945-12-15 00:00:00|       true|
|       Davi|1983-01-22 00:00:00|       true|
|   Stephany|1958-10-02 00:00:00|       true|
|      Laura|1957-07-26 00:00:00|       true|
| Ana Sophia|1974-07-05 00:00:00|       true|
|       Levi|1966-04-06 00:00:00|       true|
|      Alice|1982-05-27 00:00:00|       true|
|     Marina|1997-06-23 00:00:00|       true|
|     Danilo|1987-04-02 00:00:00| 

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, get_json_object, explode

# Get (or reuse) a SparkSession
spark = (
    SparkSession.builder
    .appName("NestedDataExample")
    .getOrCreate()
)

# Read the JSON file with the "multiline" option enabled
order_df = (
    spark.read
    .option("multiline", "true")
    .json("arquivos/order.json")
)

# Print the schema to inspect the nested structure
order_df.printSchema()

root
 |-- carrier: string (nullable = true)
 |-- carrier_data: struct (nullable = true)
 |    |-- plp_master_id: string (nullable = true)
 |    |-- postcard: string (nullable = true)
 |    |-- tag_number: string (nullable = true)
 |-- content_declaration: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- qty: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- discount_service_code: string (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- payment: struct (nullable = true)
 |    |    |-- credit_card: struct (nullable = true)
 |    |    |    |-- created_at: struct (nullable = true)
 |    |    |    |    |-- _nanoseconds: long (nullable = true)
 |    |    |    |    |-- _seconds: long (nullable = true)
 |    |    |    |-- details: st

In [5]:
# Reach into the 'carrier_data' struct with dot notation to get 'plp_master_id'
master_id_df = order_df.select(
    col("carrier_data.plp_master_id"),
)
master_id_df.show(truncate=False)

+-------------+
|plp_master_id|
+-------------+
|876766465    |
+-------------+



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, get_json_object

# Obtain the Spark session for this example
spark = (
    SparkSession.builder
    .appName("GetJSONObjectExample")
    .getOrCreate()
)

# Sample rows: each one-element tuple holds a JSON-like string.
# The payloads use single quotes; the output recorded for this cell shows
# that the names are still extracted from them.
data = [
    ("{'name': 'Alice', 'age': 25}",),
    ("{'name': 'Bob', 'age': 30}",),
]
schema = ["json_data"]

# Build the DataFrame and display the raw strings
df = spark.createDataFrame(data, schema)
df.show(truncate=False)

# Extract the value found at JSON path "$.name" into a column called "name"
df_with_name = df.select(
    get_json_object(df["json_data"], "$.name").alias("name")
)
df_with_name.show(truncate=False)

+----------------------------+
|json_data                   |
+----------------------------+
|{'name': 'Alice', 'age': 25}|
|{'name': 'Bob', 'age': 30}  |
+----------------------------+

+-----+
|name |
+-----+
|Alice|
|Bob  |
+-----+



In [21]:
# Show the array column exactly as loaded, before it is exploded
print("Antes do explode(content_declaration)")
order_df.select("content_declaration").show(truncate=False)

# Explode the 'content_declaration' array into a 'content_item' column
content_decl_df = order_df.selectExpr("explode(content_declaration) as content_item")

# Project the individual fields of each exploded item
print("Depois do explode(content_declaration)")
content_decl_df.select(
    [f"content_item.{field}" for field in ("description", "qty", "value")]
).show(truncate=False)

Antes do explode(content_declaration)
+-----------------------+
|content_declaration    |
+-----------------------+
|[{Diversos, 1, 190.00}]|
+-----------------------+

Depois do explode(content_declaration)
+-----------+---+------+
|description|qty|value |
+-----------+---+------+
|Diversos   |1  |190.00|
+-----------+---+------+



## Seção: Lidando com Dados Faltantes em PySpark

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when

# Obtain the Spark session for the missing-data examples
spark = (
    SparkSession.builder
    .appName("NetflixDataAnalysis")
    .getOrCreate()
)

# Load the Netflix engagement CSV: first line is the header, types are inferred
netflix_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("arquivos/netflix/Netflix Engagement.csv")
)
netflix_df.show()

+--------------------+-------------------+-------------------+------------+-----------------+------+--------------------+--------------------+--------------------+
|               Title|Available Globally?|       Release Date|Hours Viewed|Number of Ratings|Rating|               Genre|           Key Words|         Description|
+--------------------+-------------------+-------------------+------------+-----------------+------+--------------------+--------------------+--------------------+
|The Night Agent: ...|                Yes|2023-03-23 00:00:00|   812100000|           7696.0|   6.0|['Biography', 'Dr...|persian empire,em...|                null|
|Ginny & Georgia: ...|                Yes|2023-01-05 00:00:00|   665100000|           5216.0|   5.7|['Comedy', 'Drama...|producer,three wo...|The film follows ...|
|The Glory: Season...|                Yes|2022-12-30 00:00:00|   622800000|          11869.0|   8.4|           ['Short']|                null|                null|
| Wednesday: Sea

In [25]:
# Columns that additionally get the isnan() check
numeric_cols = ['Hours Viewed', 'Number of Ratings', 'Rating']


def _missing_condition(name):
    """Return the boolean Column that flags a missing value in column `name`.

    Columns listed in `numeric_cols` count as missing when the value is NaN
    or null; every other column counts as missing only when it is null.
    """
    is_null = col(name).isNull()
    if name in numeric_cols:
        return isnan(name) | is_null
    return is_null


# One aggregate per column: count() only counts the rows where when() matched
null_checks = [
    count(when(_missing_condition(name), name)).alias(name)
    for name in netflix_df.columns
]

# Run all the checks in a single pass and display the totals
netflix_df.select(null_checks).show()

+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|Title|Available Globally?|Release Date|Hours Viewed|Number of Ratings|Rating|Genre|Key Words|Description|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|    0|                  2|       13455|           2|             4112|  4112| 2573|     5533|       7715|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+



In [26]:
# Replace missing 'Hours Viewed' with 0 and missing 'Rating' with 5.0
netflix_df_filled = netflix_df.na.fill({'Hours Viewed': 0, 'Rating': 5.0})

In [27]:
from pyspark.sql.functions import mean

# Compute the mean of 'Rating' (a single-row aggregate) and take its value
mean_val = netflix_df.agg(mean("Rating")).first()[0]

# Fill missing ratings with that mean. This starts again from netflix_df,
# so it rebinds netflix_df_filled to a frame where only 'Rating' is filled.
netflix_df_filled = netflix_df.fillna({'Rating': mean_val})

In [29]:
# Each statement below rebinds netflix_df_dropped; the last assignment is the
# one that remains bound once the cell has run.

# Drop every row that contains at least one null value
netflix_df_dropped = netflix_df.na.drop()

# Drop only the rows in which all fields are null
netflix_df_dropped = netflix_df.na.drop(how='all')

# Drop rows where any field is null -- equivalent to the no-argument call
netflix_df_dropped = netflix_df.na.drop(how='any')

In [30]:
# Count the null entries in every column of the original DataFrame
netflix_df.select(
    [
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in netflix_df.columns
    ]
).show()

+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|Title|Available Globally?|Release Date|Hours Viewed|Number of Ratings|Rating|Genre|Key Words|Description|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|    0|                  2|       13455|           2|             4112|  4112| 2573|     5533|       7715|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+



In [31]:
# Count the null entries in every column after the rows with nulls were dropped
netflix_df_dropped.select(
    [
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in netflix_df_dropped.columns
    ]
).show()

+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|Title|Available Globally?|Release Date|Hours Viewed|Number of Ratings|Rating|Genre|Key Words|Description|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+
|    0|                  0|           0|           0|                0|     0|    0|        0|          0|
+-----+-------------------+------------+------------+-----------------+------+-----+---------+-----------+



In [32]:
from pyspark.sql.functions import avg, stddev

# Summarise 'Rating' on the filled DataFrame: average and standard deviation
netflix_df_filled.agg(
    avg('Rating').alias('Average Rating'),
    stddev('Rating').alias('Rating Standard Deviation'),
).show()

+-----------------+-------------------------+
|   Average Rating|Rating Standard Deviation|
+-----------------+-------------------------+
|6.509056391505675|       1.0814082573969044|
+-----------------+-------------------------+



In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, month, year

# Obtain the Spark session for this example
spark = (
    SparkSession.builder
    .appName("NetflixTitlesByYearMonth")
    .getOrCreate()
)

# Load the Netflix data and derive "Year" / "Month" from "Release Date"
netflix_df = (
    spark.read.csv("arquivos/netflix/Netflix Engagement.csv", header=True, inferSchema=True)
    .withColumn("Year", year("Release Date"))
    .withColumn("Month", month("Release Date"))
)

# Number of titles per (year, month) pair
titles_by_year_month = (
    netflix_df
    .groupBy("Year", "Month")
    .agg(count("Title").alias("NumTitles"))
)

# Sort ascending by year, then by month
titles_by_year_month_ordered = titles_by_year_month.sort("Year", "Month")

# Display the result
titles_by_year_month_ordered.show()

+----+-----+---------+
|Year|Month|NumTitles|
+----+-----+---------+
|null| null|    13455|
|2010|    4|        3|
|2010|    9|        5|
|2011|    3|        2|
|2011|    9|        1|
|2012|    2|        1|
|2013|    2|        2|
|2013|    5|        2|
|2013|    7|        2|
|2013|    8|        1|
|2013|    9|        1|
|2013|   11|        1|
|2013|   12|        3|
|2014|    1|        3|
|2014|    2|        1|
|2014|    5|        2|
|2014|    6|        2|
|2014|    7|        1|
|2014|    8|        3|
|2014|    9|        6|
+----+-----+---------+
only showing top 20 rows



In [37]:
from pyspark.sql import SparkSession
# year, month and count are used below; importing them here keeps the cell
# runnable on its own instead of relying on an earlier cell's imports.
from pyspark.sql.functions import count, lit, month, year
from itertools import product
from datetime import datetime

# Obtain the Spark session for this example
spark = SparkSession.builder.appName("CompleteMonthsNetflixData").getOrCreate()

# Load the Netflix data
netflix_df = spark.read.csv("arquivos/netflix/Netflix Engagement.csv", header=True, inferSchema=True)

# Derive the year and the month from the "Release Date" column
netflix_df = netflix_df.withColumn("Year", year("Release Date"))
netflix_df = netflix_df.withColumn("Month", month("Release Date"))

# Build a DataFrame holding every month of the period of interest
years = range(2010, 2024)  # 2010 through 2023 (the end of range() is exclusive)
months = range(1, 13)  # All twelve months

# Cartesian product of years and months -> one row per (Year, Month) pair
year_month_list = list(product(years, months))
year_month_df = spark.createDataFrame(year_month_list, ["Year", "Month"])

# Group the Netflix data by year and month and count the titles
titles_by_year_month = netflix_df.groupBy("Year", "Month").agg(count("Title").alias("NumTitles"))

# Left-join from the complete calendar so months without titles are kept;
# only (Year, Month) pairs present in year_month_df appear in the result.
complete_titles_by_year_month = year_month_df.join(titles_by_year_month, ["Year", "Month"], "left_outer")

# Months with no matching titles come out of the join as null -> replace with 0
complete_titles_by_year_month = complete_titles_by_year_month.fillna({'NumTitles': 0})

# Sort ascending by year, then by month
complete_titles_by_year_month_ordered = complete_titles_by_year_month.orderBy("Year", "Month")

# Display the result
complete_titles_by_year_month_ordered.show()

+----+-----+---------+
|Year|Month|NumTitles|
+----+-----+---------+
|2010|    1|        0|
|2010|    2|        0|
|2010|    3|        0|
|2010|    4|        3|
|2010|    5|        0|
|2010|    6|        0|
|2010|    7|        0|
|2010|    8|        0|
|2010|    9|        5|
|2010|   10|        0|
|2010|   11|        0|
|2010|   12|        0|
|2011|    1|        0|
|2011|    2|        0|
|2011|    3|        2|
|2011|    4|        0|
|2011|    5|        0|
|2011|    6|        0|
|2011|    7|        0|
|2011|    8|        0|
+----+-----+---------+
only showing top 20 rows

