# GOLD UTILITIES

## Imports
---

In [0]:
from pyspark.sql.functions import col, count

## Functions
---

### Filters


In [0]:
################        DURATION FILTER         ################       
def duration_filter(df, duration_min=59, duration_max=89736):
    """
    Keep only trips whose duration lies strictly between the two bounds.

    df           - DataFrame with a duration_sec column
                   (duration of trip in seconds).
    duration_min - exclusive lower bound; rows at or below it are dropped.
    duration_max - exclusive upper bound; rows at or above it are dropped.

    Returns the filtered DataFrame.
    """
    duration = col("duration_sec")
    longer_than_min = duration > duration_min
    shorter_than_max = duration < duration_max
    return df.filter(longer_than_min & shorter_than_max)

### Aggregations

In [0]:
################        TOP STATIONS         ################  
def top_stations(df, n=10):
    """
    Return the n most used stations.

    A station's usage is the number of rows in which it appears as
    start_station_name plus the number of rows in which it appears as
    end_station_name, so a trip that starts and ends at the same station
    counts twice for it. Null station names are ignored.

    df - DataFrame with start_station_name and end_station_name columns.
    n  - maximum number of stations to return.

    Returns a DataFrame with columns station_name and total_uses, sorted by
    total_uses descending; ties are broken by station_name ascending so the
    selected rows are reproducible between runs.
    """
    # Stack start and end station names into a single station_name column;
    # union keeps duplicates, which is what the usage count relies on.
    df_stations = (
        df
        .select(col("start_station_name").alias("station_name"))
        .union(
            df.select(col("end_station_name").alias("station_name"))
        )
        .filter(col("station_name").isNotNull())
    )

    df_top_stations = (
        df_stations
        .groupBy("station_name")
        .agg(count("*").alias("total_uses"))
        # Secondary sort key: without it, stations with equal counts at the
        # cut-off could be picked arbitrarily by limit(n).
        .orderBy(
            col("total_uses").desc(),
            col("station_name").asc(),
        )
        .limit(n)
    )
    return df_top_stations