In [None]:
# Notebook bootstrap for Fugue.
# NOTE(review): setup() presumably registers the %%fsql cell magic used by the
# cells below - confirm against the fugue_notebook documentation.
from fugue_notebook import setup

setup()

#from dask_sql.integrations import fugue
# Nothing in this notebook references the name `fugue_blazing`, so the import is
# kept for its side effects only.
# NOTE(review): presumably this is what makes the "blazing" engine available to
# the `%%fsql blazing` cells - confirm.
import fugue_blazing

In [None]:
%%fsql
-- Load the airports CSV with an explicit column schema and yield it as
-- `airports`; the next Python cell reads it back as `airports.native`.
airports = 
    LOAD CSV "/tmp/airports.csv"
    COLUMNS airport_id:long,name:str,city:str,country:str,iata:str,icao:str,lat:double,lng:double,alt:long,timezone:str,dst:str,type:str,source:str
YIELD DATAFRAME
PRINT

-- Same pattern for the airlines CSV, yielded as `airlines`.
airlines = 
    LOAD CSV "/tmp/airlines.csv"
    COLUMNS airline_id:long,name:str,alias:str,iata:str,icao:str,callsign:str,country:str,active:str
YIELD DATAFRAME
PRINT

In [None]:
# Display the `.native` attribute of `airports`, the dataframe yielded by the
# %%fsql cell above.
airports.native

In [None]:
import pandas as pd
from triad import Schema
from typing import List, Iterable, Dict, Any

# Read one month of flight data over HTTPS; it is only used here to derive the
# column schema.
df = pd.read_parquet("https://s3.amazonaws.com/bsql/data/air_transport/flight_ontime_2020-01.parquet")
# Build the schema from every column except the last one (iloc[:, :-1]).
# `schema.names` is what download() uses to pick the columns it writes out.
# NOTE(review): why the last column is excluded is not visible here - confirm.
schema = Schema(df.iloc[: , :-1])
print(schema)

In [None]:
import pandas as pd

# One single-element row per monthly flight file (2020-01 through 2020-05);
# the frame has a single "path" column consumed by download().
files = [
    [f"https://s3.amazonaws.com/bsql/data/air_transport/flight_ontime_2020-0{i}.parquet"]
    for i in range(1, 6)
]
files_df = pd.DataFrame(files, columns=["path"])

files_df

In [None]:
from typing import Dict, Any, List, Iterable
import os
from shutil import rmtree

def download(files:pd.DataFrame,path:str) -> None:
    """Copy every parquet file listed in ``files["path"]`` into directory ``path``.

    The directory is created if it does not exist. Each source is read with
    pandas, reduced to the columns named by the notebook-level ``schema``
    (``schema.names``), and written under ``path`` using the source's base
    file name. The destination of each file is printed before it is written.

    Args:
        files: frame with a ``path`` column holding the parquet locations.
        path: destination directory.
    """
    os.makedirs(path, exist_ok=True)
    for source in files["path"]:
        target = os.path.join(path, os.path.basename(source))
        print(target)
        frame = pd.read_parquet(source)
        # Keep only the columns captured in the module-level schema.
        frame[schema.names].to_parquet(target)
        
# Try download() directly on the first two files. Note that "/tmp/1.parquet" is
# created as a directory (download() calls os.makedirs on it).
download(files_df.head(2), "/tmp/1.parquet")

In [None]:
%%fsql dask
-- Run download(path="/tmp/flights.parquet") over files_df on the dask engine.
-- NOTE(review): EVEN PREPARTITION ROWCOUNT presumably yields one partition per
-- row so that the files are fetched in parallel - confirm in the FugueSQL docs.
OUTTRANSFORM files_df 
EVEN PREPARTITION ROWCOUNT 
USING download(path="/tmp/flights.parquet")

In [None]:
%%fsql blazing
-- Load the downloaded flights and print them with ROWCOUNT, then keep four
-- columns of at most 50 rows and yield that sample as `test` (used by the
-- Python cell that tries generate_time_metrics).
LOAD "/tmp/flights.parquet"
PRINT ROWCOUNT
SELECT FL_DATE,CRS_DEP_TIME,DEP_TIME,DEP_DELAY LIMIT 50 PERSIST
YIELD DATAFRAME AS test

In [None]:
#schema: *,ts:datetime,day_of_year:int,hour_of_week:int
def generate_time_metrics(df:pd.DataFrame) -> pd.DataFrame:
    """Add scheduled-departure time features to a flights frame.

    ``FL_DATE`` and ``CRS_DEP_TIME`` are combined into one string and parsed
    with the format ``%Y-%m-%d %H%M``. The result is a new frame holding all
    input columns plus:

    * ``ts``: the parsed timestamp
    * ``day_of_year``: ``ts.dt.dayofyear``
    * ``hour_of_week``: ``ts.dt.dayofweek * 24 + ts.dt.hour``

    NOTE(review): the ``#schema:`` comment directly above the ``def`` appears
    to be the output-schema hint used when this function runs via
    ``TRANSFORM USING generate_time_metrics``; keep it in sync with the
    columns added here.

    Args:
        df: frame containing ``FL_DATE`` and ``CRS_DEP_TIME`` columns.

    Returns:
        A new frame with the three columns appended; ``df`` is not modified.
    """
    # Left-pad the time to four characters. Without this an integer value such
    # as 45 becomes "45", which %H%M either rejects or reads as 04:05 instead
    # of 00:45. Values that are already four characters long are unaffected.
    hhmm = df["CRS_DEP_TIME"].astype(str).str.zfill(4)
    ts = pd.to_datetime(df["FL_DATE"].astype(str) + " " + hhmm, format="%Y-%m-%d %H%M")
    # assign() builds a new frame, so the caller's input is left untouched.
    return df.assign(
        ts=ts,
        day_of_year=ts.dt.dayofyear,
        hour_of_week=ts.dt.dayofweek * 24 + ts.dt.hour,
    )

# Try the transformer on `test`, the sample of at most 50 rows yielded by the
# %%fsql cell above, converted to pandas.
generate_time_metrics(test.as_pandas())

In [None]:
%%fsql dask
-- Load the flights, add ts / day_of_year / hour_of_week with
-- generate_time_metrics, then keep and rename the columns the later cells use.
-- The result is yielded as `flights` and printed with ROWCOUNT.
LOAD "/tmp/flights.parquet"
TRANSFORM USING generate_time_metrics
SELECT 
    ts, 
    day_of_year, 
    hour_of_week, 
    ORIGIN AS origin,
    DEST AS dest,
    OP_UNIQUE_CARRIER AS carrier,
    DEP_DELAY AS delay
PERSIST 
YIELD DATAFRAME AS flights
PRINT ROWCOUNT

In [None]:
import matplotlib.pyplot as plt

def plot(df:pd.DataFrame,x:Any,y:Any,sort:Any,**kwargs) -> None:
    """Plot ``y`` against ``x`` after ordering the rows by ``sort``.

    Args:
        df: data to draw.
        x: column passed to ``DataFrame.plot`` as ``x``.
        y: column passed to ``DataFrame.plot`` as ``y``.
        sort: column (or columns) handed to ``DataFrame.sort_values``.
        **kwargs: forwarded unchanged to ``DataFrame.plot``.
    """
    ordered = df.sort_values(sort)
    ordered.plot(x=x, y=y, **kwargs)
    plt.show()

In [None]:
%%fsql blazing
-- Average delay per day of year, drawn with plot().
SELECT day_of_year, AVG(delay) AS avg_delay FROM flights GROUP BY day_of_year
OUTPUT USING plot(x="day_of_year",y="avg_delay",sort="day_of_year")

-- Average delay per hour of week, drawn with plot().
SELECT hour_of_week, AVG(delay) AS avg_delay FROM flights GROUP BY hour_of_week
OUTPUT USING plot(x="hour_of_week",y="avg_delay",sort="hour_of_week")

In [None]:
%%fsql blazing
-- Enrich each flight with the carrier name (airlines, joined on iata) and with
-- name / country / coordinates of the origin and destination airports
-- (airports joined twice on iata, as C and D).
-- The WHERE clause drops rows where either airport has no coordinates, which
-- includes flights whose origin or destination found no match in airports.
info = 
    SELECT ts
        , carrier
        , B.name AS carrier_name
        , origin
        , C.name AS origin_name      
        , C.country AS origin_country      
        , C.lat AS origin_lat       
        , C.lng AS origin_lng    
        , dest
        , D.name AS dest_name
        , D.country AS dest_country    
        , D.lat AS dest_lat       
        , D.lng AS dest_lng    
        , delay
    FROM flights AS A
    LEFT OUTER JOIN airlines AS B
        ON A.carrier = B.iata
    LEFT OUTER JOIN airports AS C
        ON A.origin = C.iata
    LEFT OUTER JOIN airports AS D
        ON A.dest = D.iata
    WHERE C.lat IS NOT NULL AND C.lng IS NOT NULL
        AND D.lat IS NOT NULL AND D.lng IS NOT NULL
PERSIST YIELD DATAFRAME
PRINT ROWCOUNT

-- Keep flights whose origin and destination are both in 'United States';
-- yielded as `info_us`.
-- NOTE(review): this SELECT has no FROM clause; presumably it reads the result
-- of the preceding statement (`info`) - confirm in the FugueSQL docs.
SELECT * WHERE origin_country = dest_country AND origin_country = 'United States'
PERSIST YIELD DATAFRAME AS info_us
PRINT ROWCOUNT

In [None]:
def plot_bar(df:pd.DataFrame,x:Any,y:Any,sort:Any,**kwargs) -> None:
    """Draw a bar chart of ``y`` per ``x`` after ordering the rows by ``sort``.

    Args:
        df: data to draw.
        x: column passed to ``DataFrame.plot.bar`` as ``x``.
        y: column passed to ``DataFrame.plot.bar`` as ``y``.
        sort: column (or columns) handed to ``DataFrame.sort_values``.
        **kwargs: forwarded unchanged to ``DataFrame.plot.bar``.
    """
    ordered = df.sort_values(sort)
    ordered.plot.bar(x=x, y=y, **kwargs)
    plt.show()

In [None]:
%%fsql blazing
-- Ten origins with the highest average delay among US domestic flights.
-- plot_bar() re-sorts by `delay` with sort_values, so bars are drawn ascending.
-- NOTE(review): the FROM-less `SELECT *` statements in this cell presumably
-- read the result of the statement right before them - confirm.
SELECT origin, AVG(delay) AS delay FROM info_us GROUP BY origin
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="origin",y="delay",sort="delay", title="By Origin")

-- Same ranking by destination.
SELECT dest, AVG(delay) AS delay FROM info_us GROUP BY dest
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="dest",y="delay",sort="delay", title="By Dest")

-- The 20 carriers with the most US domestic flights.
top = 
    SELECT carrier, COUNT(*) AS ct 
    FROM info_us GROUP BY carrier 
    ORDER BY ct DESC LIMIT 20
    PERSIST YIELD DATAFRAME
    
-- Restrict info_us to those carriers.
info_top = 
    SELECT info_us.* FROM info_us INNER JOIN top ON info_us.carrier = top.carrier

-- Ten carrier names with the highest average delay among the top carriers.
SELECT carrier_name, AVG(delay) AS delay FROM info_top GROUP BY carrier_name
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="carrier_name",y="delay",sort="delay", title="By Top Carriers")


In [None]:
%%fsql blazing
-- End-to-end version of the analysis in a single cell: it repeats the
-- load / transform / join / plot statements of the earlier cells, without the
-- YIELD and PRINT steps. It still relies on the Python functions
-- generate_time_metrics, plot and plot_bar defined earlier in the notebook.

-- Reference data.
airports = 
    LOAD CSV "/tmp/airports.csv"
    COLUMNS airport_id:long,name:str,city:str,country:str,iata:str,icao:str,lat:double,lng:double,alt:long,timezone:str,dst:str,type:str,source:str

airlines = 
    LOAD CSV "/tmp/airlines.csv"
    COLUMNS airline_id:long,name:str,alias:str,iata:str,icao:str,callsign:str,country:str,active:str

-- Flights with derived time columns, reduced to the columns used below.
LOAD "/tmp/flights.parquet"
TRANSFORM USING generate_time_metrics
flights = 
    SELECT 
        ts, 
        day_of_year, 
        hour_of_week, 
        ORIGIN AS origin,
        DEST AS dest,
        OP_UNIQUE_CARRIER AS carrier,
        DEP_DELAY AS delay
    PERSIST 
    
-- Average delay per day of year and per hour of week.
SELECT day_of_year, AVG(delay) AS avg_delay FROM flights GROUP BY day_of_year
OUTPUT USING plot(x="day_of_year",y="avg_delay",sort="day_of_year")

SELECT hour_of_week, AVG(delay) AS avg_delay FROM flights GROUP BY hour_of_week
OUTPUT USING plot(x="hour_of_week",y="avg_delay",sort="hour_of_week")

    
-- Flights enriched with carrier name and origin / destination airport details.
-- The WHERE clause drops rows where either airport has no coordinates, which
-- includes flights whose origin or destination found no match in airports.
info = 
    SELECT ts
        , carrier
        , B.name AS carrier_name
        , origin
        , C.name AS origin_name      
        , C.country AS origin_country      
        , C.lat AS origin_lat       
        , C.lng AS origin_lng    
        , dest
        , D.name AS dest_name
        , D.country AS dest_country    
        , D.lat AS dest_lat       
        , D.lng AS dest_lng    
        , delay
    FROM flights AS A
    LEFT OUTER JOIN airlines AS B
        ON A.carrier = B.iata
    LEFT OUTER JOIN airports AS C
        ON A.origin = C.iata
    LEFT OUTER JOIN airports AS D
        ON A.dest = D.iata
    WHERE C.lat IS NOT NULL AND C.lng IS NOT NULL
        AND D.lat IS NOT NULL AND D.lng IS NOT NULL

-- Flights whose origin and destination are both in 'United States'.
-- NOTE(review): the FROM-less SELECT statements in this cell presumably read
-- the result of the statement right before them - confirm.
info_us = 
    SELECT * WHERE origin_country = dest_country AND origin_country = 'United States'
    PERSIST
    
-- Ten origins, then ten destinations, with the highest average delay.
SELECT origin, AVG(delay) AS delay FROM info_us GROUP BY origin
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="origin",y="delay",sort="delay", title="By Origin")

SELECT dest, AVG(delay) AS delay FROM info_us GROUP BY dest
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="dest",y="delay",sort="delay", title="By Dest")

-- The 20 carriers with the most US domestic flights.
top = 
    SELECT carrier, COUNT(*) AS ct 
    FROM info_us GROUP BY carrier 
    ORDER BY ct DESC LIMIT 20
    
-- Restrict info_us to those carriers.
info_top = 
    SELECT info_us.* FROM info_us INNER JOIN top ON info_us.carrier = top.carrier

-- Ten carrier names with the highest average delay among the top carriers.
SELECT carrier_name, AVG(delay) AS delay FROM info_top GROUP BY carrier_name
SELECT * ORDER BY delay DESC LIMIT 10
OUTPUT USING plot_bar(x="carrier_name",y="delay",sort="delay", title="By Top Carriers")

