In [None]:
!pip install plotly cufflinks

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: reuse an existing session if there is
# one, otherwise create it under the application name given here.
spark = (
    SparkSession.builder
    .appName("Python Exploration")
    .getOrCreate()
)

In [None]:
# Read the flights dataset from its Parquet location into a Spark DataFrame.
flights = spark.read.format('parquet').load('/data/parquet/flights')

In [None]:
# Preview the first rows of the dataset; the arguments spell out show()'s
# defaults (20 rows, long cell values truncated).
flights.show(n=20, truncate=True)

In [None]:
# Print the DataFrame's column names and types as a tree.
flights.printSchema()

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd

In [None]:
# Pull a small random preview into pandas: sample roughly 1% of the rows
# without replacement, then cap the result at 100 rows before collecting it.
# The fixed seed (third argument) keeps the sampling step repeatable when the
# notebook is re-run.
flights.sample(False, 0.01, 42).limit(100).toPandas()

In [None]:
# Flag a flight as delayed when its arrival delay exceeds 20 (ArrDelay is
# presumably measured in minutes -- TODO confirm). A null ArrDelay yields a null
# flag, so such rows count as neither delayed nor on time.
flights = flights.withColumn('Delayed', flights.ArrDelay > 20)

# Share of delayed flights among all rows, reported as a percentage. Scaling by
# 100.0 makes the number match the "%" label (the bare ratio would print 0.15
# for 15%) and guarantees floating-point division.
delayedCount = flights.where(flights.Delayed == True).count()
totalCount = flights.count()
# Guard against an empty dataset instead of raising ZeroDivisionError.
delayedPercent = 100.0 * delayedCount / totalCount if totalCount else float('nan')
print("%% delayed/(total): %.02f" % delayedPercent)

# Flights by Year

In [None]:
import pyspark.sql.functions as func

# Count flights per year, pivoted on the Delayed flag into a 'True' (delayed)
# and a 'False' (on-time) count column, then collect the result into pandas.
delayedByYear = (
    flights
    .groupBy(flights.Year)
    .pivot("Delayed", ['True', 'False'])
    .count()
    # A year with no rows for one of the flag values gets a null count there;
    # without this fill the null becomes NaN in pandas and poisons the
    # Total / Delayed / OnTime columns computed below.
    .fillna(0, subset=['True', 'False'])
    # groupBy gives no ordering guarantee; sort so the table reads in order.
    .orderBy('Year')
    .toPandas()
)
# Total counts only flights with a known Delayed flag (True or False).
delayedByYear['Total'] = delayedByYear['True'] + delayedByYear['False']
# Fractions of that total, so Delayed + OnTime == 1 for every year.
delayedByYear['Delayed'] = delayedByYear['True'] / delayedByYear['Total']
delayedByYear['OnTime'] = delayedByYear['False'] / delayedByYear['Total']
delayedByYear

In [None]:
import plotly.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np

# Stacked bar chart of the delayed vs. on-time share per year.
# The filename is specific to this chart: every bar chart in this notebook used
# to share 'pandas-bar-chart', and when cufflinks publishes to Plotly online the
# filename names the saved chart, so later charts would overwrite earlier ones.
delayedByYear[['Year', 'Delayed', 'OnTime']].iplot(
    x='Year',
    kind='bar',
    barmode='stack',
    title='Delayed vs. on-time flights by year',
    xTitle='Year',
    yTitle='Share of flights',
    filename='delayed-by-year-bar-chart',
)

# Flights by Month

In [None]:
# Count flights per month, pivoted on the Delayed flag into a 'True' (delayed)
# and a 'False' (on-time) count column, then collect the result into pandas.
delayedByMonth = (
    flights
    .groupBy(flights.Month)
    .pivot("Delayed", ['True', 'False'])
    .count()
    # A month with no rows for one of the flag values gets a null count there;
    # without this fill the null becomes NaN in pandas and poisons the
    # Total / Delayed / OnTime columns computed below.
    .fillna(0, subset=['True', 'False'])
    # groupBy gives no ordering guarantee; sort so the table reads in order.
    .orderBy('Month')
    .toPandas()
)
# Total counts only flights with a known Delayed flag (True or False).
delayedByMonth['Total'] = delayedByMonth['True'] + delayedByMonth['False']
# Fractions of that total, so Delayed + OnTime == 1 for every month.
delayedByMonth['Delayed'] = delayedByMonth['True'] / delayedByMonth['Total']
delayedByMonth['OnTime'] = delayedByMonth['False'] / delayedByMonth['Total']
delayedByMonth

In [None]:
# Stacked bar chart of the delayed vs. on-time share per month.
# The filename is specific to this chart: every bar chart in this notebook used
# to share 'pandas-bar-chart', and when cufflinks publishes to Plotly online the
# filename names the saved chart, so later charts would overwrite earlier ones.
delayedByMonth[['Month', 'Delayed', 'OnTime']].iplot(
    x='Month',
    kind='bar',
    barmode='stack',
    title='Delayed vs. on-time flights by month',
    xTitle='Month',
    yTitle='Share of flights',
    filename='delayed-by-month-bar-chart',
)

In [None]:
import plotly.graph_objs as go

# Flights by Day of Week

In [None]:
# Count flights per day of week, pivoted on the Delayed flag into a 'True'
# (delayed) and a 'False' (on-time) count column, then collect into pandas.
delayedByDoW = (
    flights
    .groupBy(flights.DayOfWeek)
    .pivot("Delayed", ['True', 'False'])
    .count()
    # A day with no rows for one of the flag values gets a null count there;
    # without this fill the null becomes NaN in pandas and poisons the
    # Total / Delayed / OnTime columns computed below.
    .fillna(0, subset=['True', 'False'])
    # groupBy gives no ordering guarantee; sort so the table reads in order.
    .orderBy('DayOfWeek')
    .toPandas()
)
# Total counts only flights with a known Delayed flag (True or False).
delayedByDoW['Total'] = delayedByDoW['True'] + delayedByDoW['False']
# Fractions of that total, so Delayed + OnTime == 1 for every day of week.
delayedByDoW['Delayed'] = delayedByDoW['True'] / delayedByDoW['Total']
delayedByDoW['OnTime'] = delayedByDoW['False'] / delayedByDoW['Total']
delayedByDoW

In [None]:
# Stacked bar chart of the delayed vs. on-time share per day of week.
# The filename is specific to this chart: every bar chart in this notebook used
# to share 'pandas-bar-chart', and when cufflinks publishes to Plotly online the
# filename names the saved chart, so later charts would overwrite earlier ones.
delayedByDoW[['DayOfWeek', 'Delayed', 'OnTime']].iplot(
    x='DayOfWeek',
    kind='bar',
    barmode='stack',
    title='Delayed vs. on-time flights by day of week',
    xTitle='Day of week',
    yTitle='Share of flights',
    filename='delayed-by-day-of-week-bar-chart',
)

# Cube

In [None]:
# Flight counts for every combination of Year, Month, DayOfWeek and Delayed,
# including the subtotal rows a cube adds (a rolled-up column is null there).
flightsCube = flights.cube('Year', 'Month', 'DayOfWeek', 'Delayed').count()

In [None]:
# Slice the cube down to per-year counts split by the Delayed flag: keep the
# rows where Month and DayOfWeek are null while Year and Delayed are set.
flightsCube.filter(
    func.col("Year").isNotNull()
    & func.col("Month").isNull()
    & func.col("DayOfWeek").isNull()
    & func.col("Delayed").isNotNull()
).toPandas()

In [None]:
# Slice the cube down to per-year, per-month counts split by the Delayed flag:
# keep the rows where only DayOfWeek is null and Year, Month, Delayed are set.
flightsCube.filter(
    func.col("Year").isNotNull()
    & func.col("Month").isNotNull()
    & func.col("DayOfWeek").isNull()
    & func.col("Delayed").isNotNull()
).toPandas()

# Flights by Carrier

In [None]:
# Count flights per carrier, pivoted on the Delayed flag into a 'True'
# (delayed) and a 'False' (on-time) count column, then collect into pandas.
delayedByCarrier = (
    flights
    .groupBy(flights.UniqueCarrier)
    .pivot("Delayed", ['True', 'False'])
    .count()
    # A carrier with no rows for one of the flag values gets a null count
    # there; without this fill the null becomes NaN in pandas and poisons the
    # Total / Delayed / OnTime columns computed below.
    .fillna(0, subset=['True', 'False'])
    # groupBy gives no ordering guarantee; sort so the table reads in order.
    .orderBy('UniqueCarrier')
    .toPandas()
)
# Total counts only flights with a known Delayed flag (True or False).
delayedByCarrier['Total'] = delayedByCarrier['True'] + delayedByCarrier['False']
# Fractions of that total, so Delayed + OnTime == 1 for every carrier.
delayedByCarrier['Delayed'] = delayedByCarrier['True'] / delayedByCarrier['Total']
delayedByCarrier['OnTime'] = delayedByCarrier['False'] / delayedByCarrier['Total']
delayedByCarrier

In [None]:
# Stacked bar chart of the delayed vs. on-time share per carrier.
# The filename is specific to this chart: every bar chart in this notebook used
# to share 'pandas-bar-chart', and when cufflinks publishes to Plotly online the
# filename names the saved chart, so later charts would overwrite earlier ones.
delayedByCarrier[['UniqueCarrier', 'Delayed', 'OnTime']].iplot(
    x='UniqueCarrier',
    kind='bar',
    barmode='stack',
    title='Delayed vs. on-time flights by carrier',
    xTitle='Carrier',
    yTitle='Share of flights',
    filename='delayed-by-carrier-bar-chart',
)

# Flights by Route (Origin -> Dest)

In [None]:
# Count flights per (Origin, Dest) pair, pivoted on the Delayed flag into a
# 'True' (delayed) and a 'False' (on-time) count column, then collect into
# pandas.
delayedByRoute = (
    flights
    .groupBy(flights.Origin, flights.Dest)
    .pivot("Delayed", ['True', 'False'])
    .count()
    # A route with no rows for one of the flag values (e.g. a route that was
    # never delayed) gets a null count there; without this fill the null
    # becomes NaN in pandas and the route's Total / Delayed / OnTime are lost.
    .fillna(0, subset=['True', 'False'])
    # groupBy gives no ordering guarantee; sort so the table reads in order.
    .orderBy('Origin', 'Dest')
    .toPandas()
)
# Total counts only flights with a known Delayed flag (True or False).
delayedByRoute['Total'] = delayedByRoute['True'] + delayedByRoute['False']
# Fractions of that total, so Delayed + OnTime == 1 for every route.
delayedByRoute['Delayed'] = delayedByRoute['True'] / delayedByRoute['Total']
delayedByRoute['OnTime'] = delayedByRoute['False'] / delayedByRoute['Total']
# Single route label: Origin and Dest concatenated with no separator.
delayedByRoute['Route'] = delayedByRoute['Origin'] + delayedByRoute['Dest']
delayedByRoute

In [None]:
# Heatmap of the delayed share per route: origin on the x axis, destination on
# the y axis, cell colour taken from the Delayed fraction.
delayedByRoute[['Origin', 'Dest', 'Delayed']].iplot(
    kind='heatmap',
    x='Origin',
    y='Dest',
    z='Delayed',
    colorscale='spectral',
    filename='pandas-heatmap-chart',
)