#### Spark Aggregate Functions

##### Funciones simples de agregación

In [0]:
%run "../includes/configuration"

In [0]:
# Load the "movies" Parquet dataset into a DataFrame.
# `silver_foler_path` is not defined in this notebook -- presumably it comes from
# the %run'd configuration notebook (confirm).
# NOTE(review): the name looks like a misspelling of "silver_folder_path"; it is
# kept as-is here because it must match whatever the configuration defines.
movie_df = (
    spark.read
    .parquet(f"{silver_foler_path}/movies")
)

In [0]:
# Render the movies DataFrame for inspection.
# display() is not defined or imported in this file -- presumably it is the
# notebook environment's built-in renderer (confirm).
display(movie_df)

In [0]:
from pyspark.sql.functions import count, countDistinct, sum

In [0]:
# Global aggregate: count the rows of movie_df and print the result.
(
    movie_df
    .select(count("*"))
    .show()
)

In [0]:
# Global aggregate: number of distinct year_release_date values, printed.
(
    movie_df
    .select(countDistinct("year_release_date"))
    .show()
)

In [0]:
# Global aggregate: sum of the budget column over all movies.
# `sum` here is expected to be pyspark.sql.functions.sum (imported in an earlier
# cell), not the Python builtin.
(
    movie_df
    .select(sum("budget"))
    .display()
)

In [0]:
# Total budget and number of movies for rows whose year_release_date is 2016.
# The output columns are named with alias() at aggregation time instead of
# renaming Spark's auto-generated "sum(budget)" / "count(movie_id)" labels
# afterwards: withColumnRenamed() is a silent no-op when the source name does
# not match, so any mismatch would leave the columns unrenamed without an error.
# This also matches the alias() style used by the grouped aggregation below.
(
    movie_df
    .filter("year_release_date = 2016")
    .select(
        sum("budget").alias("total_budget"),
        count("movie_id").alias("total_movies"),
    )
    .display()
)

##### Group By

In [0]:
# Sum of budget per release year, using the GroupedData.sum() shortcut.
(
    movie_df
    .groupBy("year_release_date")
    .sum("budget")
    .display()
)

In [0]:
from pyspark.sql.functions import count, countDistinct, sum, max, min, avg

In [0]:
# Per-release-year budget statistics (total, min, max, average) plus the number
# of movie_id values in each year. The result is only assigned here; nothing is
# displayed by this cell.
movie_group_by_df = (
    movie_df
    .groupBy("year_release_date")
    .agg(
        sum("budget").alias("total_budget"),
        min("budget").alias("min_budget"),
        max("budget").alias("max_budget"),
        avg("budget").alias("avg_budget"),
        count("movie_id").alias("movie_count"),
    )
)

#### Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc, dense_rank

In [0]:
# Window specs: one partition per release year, ordered by budget, highest first.
# The two specs are identical; both names are kept so that any other cell
# referring to either of them keeps working.
movie_rank = Window.partitionBy("year_release_date").orderBy(desc("budget"))
movie_dense_rank = Window.partitionBy("year_release_date").orderBy(desc("budget"))


# Show rank() and dense_rank() side by side for movies with a known release year.
# Each ranking gets its own column name: withColumn() replaces an existing column
# of the same name, so writing both under "rank" would discard the rank() values
# and leave only the dense_rank() result (mislabelled as "rank").
(
    movie_df
    .select("title", "budget", "year_release_date")
    .filter("year_release_date is not null")
    .withColumn("rank", rank().over(movie_rank))
    .withColumn("dense_rank", dense_rank().over(movie_dense_rank))
    .display()
)