In [None]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook under the application name
# "Analysis", or reuse an already running session if one exists.
spark = (
    SparkSession.builder
    .appName("Analysis")
    .getOrCreate()
)

# Read all Data and Cache 

In [None]:
import pyspark.sql.functions as f
# Load the complete dataset from its Parquet location into a Spark DataFrame.
all_data = spark.read.parquet("/taxi/dataset.parquet")

In [None]:
import pyspark.sql.functions as f

In [None]:
%%time
# Mark the DataFrame for caching, then run a count: cache() alone is lazy, so
# an action is needed to actually read the data and populate the cache.
all_data.cache().count()

In [None]:
%%time
# Repeat the count after the cache() + count() cell above, so the %%time
# output can be compared with the first run (expected to be served from the
# cache this time).
all_data.count()

In [None]:
# Print the column names and data types of the loaded dataset.
all_data.printSchema()

In [None]:
# Preview the first two rows of the dataset.
all_data.show(2)

## Analysis

### Year

In [None]:
# List the distinct values of the "year" column in ascending order
# (at most 20 rows are printed).
(
    all_data
    .select("year")
    .distinct()
    .orderBy("year")
    .show(20)
)

In [None]:
# Number of rows per year, sorted by year and collected into a pandas
# DataFrame with the columns "year" and "count".
yearly_data = (
    all_data
    .groupBy("year")
    .count()
    .orderBy("year")
    .toPandas()
)

In [None]:
# Bar chart of the yearly counts: one bar per year.
yearly_data.plot(
    kind="bar",
    x="year",
    y="count",
    title="Rides per Year",
    xlabel="Year",
    ylabel="Rides",
    figsize=(12, 6),
    legend=False,
)

### Corona? (Monthly Rides Since 2019)

In [None]:
# Rides per (year, month) for 2019 and later, sorted chronologically.
# The "yyyy-mm" column joins year and month with "-" and is used as the
# x-axis label when plotting. The result is collected into pandas.
yyyy_mm = (
    all_data
    .filter("year >= 2019")
    .groupBy("year", "month")
    .count()
    .orderBy("year", "month")
    .withColumn("yyyy-mm", f.concat_ws("-", "year", "month"))
    .toPandas()
)

In [None]:
# Bar chart of the monthly ride counts held in yyyy_mm, one bar per
# "yyyy-mm" label. The data covers 2019 onwards, so the title says so
# (the wide figure keeps the many month labels readable).
yyyy_mm.plot(
    x='yyyy-mm', y='count', figsize=(36, 6),
    title='Rides per Month since 2019',
    legend=False,
    kind='bar',
    xlabel='Month',
    ylabel='Rides'
)

### January 2016

In [None]:
# Rows from January 2016 only, with an extra "day" column holding the day of
# the month taken from "pickup_datetime".
jan = (
    all_data
    .filter("year = 2016")
    .filter("month = 01")
    .withColumn("day", f.dayofmonth("pickup_datetime"))
)

In [None]:
# Preview the first two rows of the January 2016 subset (including "day").
jan.show(2)

In [None]:
# Number of rows per day of the month, sorted by day and collected into a
# pandas DataFrame with the columns "day" and "count".
data_jan = (
    jan
    .groupBy("day")
    .count()
    .orderBy("day")
    .toPandas()
)

In [None]:
# Bar chart of the daily counts: one bar per day of the month.
data_jan.plot(
    kind="bar",
    x="day",
    y="count",
    title="Rides in Jan/2016",
    xlabel="Days",
    ylabel="Rides",
    figsize=(12, 6),
    legend=False,
)

### Tipping

In [None]:
# Keep rows with payment_type 1 or 2 and add "percentage_tip": the tip as a
# percentage of trip_amount, rounded to a whole number.
# NULLIF turns a zero trip_amount into NULL so the division yields NULL
# instead of raising a divide-by-zero error when Spark runs in ANSI mode;
# with ANSI mode off the result is the same NULL as the unguarded division.
df = (
    all_data
    .filter("payment_type = 1 or payment_type = 2")
    .withColumn(
        "percentage_tip",
        f.round(f.expr("100*tip_amount/nullif(trip_amount, 0)"), 0),
    )
)

In [None]:
# Preview the first ten rows, including the new "percentage_tip" column.
df.show(10)

In [None]:
# Restrict to payment_type 1 and to tip percentages in the range [0, 50).
# Rows whose percentage_tip is NULL fail both comparisons and drop out.
filtered = (
    df
    .filter("percentage_tip >= 0")
    .filter("percentage_tip < 50")
    .filter("payment_type = 1")
)

In [None]:
# Number of rows per rounded tip percentage, sorted by percentage and
# collected into a pandas DataFrame ("percentage_tip", "count").
data = (
    filtered
    .groupBy("percentage_tip")
    .count()
    .orderBy("percentage_tip")
    .toPandas()
)

In [None]:
# Bar chart of the tip distribution: one bar per rounded tip percentage.
data.plot(
    kind="bar",
    x="percentage_tip",
    y="count",
    title="Tip",
    xlabel="Tip [%]",
    ylabel="Count",
    figsize=(24, 12),
    legend=False,
)

### Stopping Spark 

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` or the DataFrames derived from it cannot run after this point.
spark.stop()