In [None]:
import dagstermill as dm
from airline_demo.repository import define_repo
# Register the repository object returned by define_repo() with dagstermill.
dm.register_repository(define_repo())

In [None]:
# Obtain the dagstermill context object for this notebook.
context = dm.get_context()

# Connection URL and source table used by the cells below.
# NOTE(review): the URL embeds test credentials for a Postgres instance on
# localhost; presumably these are development defaults that are overridden
# when the notebook runs inside a pipeline -- confirm before reusing elsewhere.
db_url = 'postgresql://test:test@127.0.0.1:5432/test'
table_name = 'delays_vs_fares'

In [None]:
import os
import warnings

import sqlalchemy as sa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Build the SQLAlchemy engine for the database identified by db_url.
engine = sa.create_engine(db_url)

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
# The report is written to a fixed file name inside the current working
# directory; every figure saved through `pp` becomes one page of that PDF.
pdf_filename = 'fares_vs_delays.pdf'
pdf_path = os.path.join(os.getcwd(), pdf_filename)
pp = PdfPages(pdf_path)

In [None]:
# Load every row and column of the configured table into a DataFrame.
# The table name is interpolated into the SQL text, so it must come from a
# trusted source.
select_all_query = 'select * from {table_name}'.format(table_name=table_name)
fares_vs_delays = pd.read_sql(select_all_query, engine)

In [None]:
# Preview the first rows of the loaded table.
fares_vs_delays.head()

In [None]:
# Summary statistics for the avg_arrival_delay column.
fares_vs_delays['avg_arrival_delay'].describe()

In [None]:
# Page 1 of the report: average fare against average arrival delay, with a
# best-effort linear trend line drawn over the scatter.
delays = fares_vs_delays['avg_arrival_delay']
fares = fares_vs_delays['avg_fare']

# Use an explicit figure/axes pair so the page saved below is exactly this
# figure, independent of whatever pyplot currently considers "current".
fig, ax = plt.subplots()
ax.scatter(delays, fares)

# The trend line is optional: if the least-squares fit cannot be computed
# (for example empty, non-numeric or NaN-containing data) the scatter is still
# saved, but the reason is reported instead of being silently discarded.
# Only fitting errors are caught, so KeyboardInterrupt and genuine
# programming errors are no longer swallowed.
try:
    z = np.polyfit(delays, fares, 1)
    f = np.poly1d(z)

    x_fit = np.linspace(delays.min(), delays.max(), 50)
    y_fit = f(x_fit)
    ax.plot(x_fit, y_fit, 'k--', alpha=0.5)
except (TypeError, ValueError, np.linalg.LinAlgError) as fit_error:
    warnings.warn('Skipping linear trend line: {}'.format(fit_error))

ax.set_title('Arrival Delays vs. Fares (Origin SFO)')
ax.set_xlabel('Average Delay at Arrival (Minutes)')
ax.set_ylabel('Average Fare ($)')
pp.savefig(fig)

In [None]:
# Page 2 of the report: each destination code drawn as a text label at its
# (average arrival delay, average fare per mile) position.
fig, ax = plt.subplots(figsize=(10, 10))

delays = fares_vs_delays['avg_arrival_delay']
fares_per_mile = fares_vs_delays['avg_fare_per_mile']

# Iterate over the column values directly rather than looking rows up with a
# running counter, so the labels stay correct even when the DataFrame index
# is not the default 0..n-1 range.
for delay, fare_per_mile, dest in zip(delays, fares_per_mile, fares_vs_delays['dest']):
    ax.text(delay, fare_per_mile, dest, fontsize=8)

# Fully transparent scatter (alpha=0): presumably present so the axes limits
# are scaled to the data, since only text labels are otherwise drawn.
ax.scatter(delays, fares_per_mile, alpha=0)
ax.set_title('Flight Delays (Origin SFO)')
ax.set_xlabel('Average Delay at Arrival (Minutes)')
ax.set_ylabel('Average Fare per Mile Flown($)')

pp.savefig(fig)

In [None]:
# Close the PdfPages object now that both figures have been saved to it.
pp.close()

In [None]:
# Hand the path of the generated PDF to dagstermill under the name 'result'.
dm.yield_result(pdf_path, 'result')