In [2]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='ticks', context='talk')



In [10]:
def read(fp):
    df = (pd.read_csv(fp)
            .rename(columns=str.lower)
            .drop('unnamed: 20', axis=1)
            .pipe(extract_city_name)
            .pipe(time_to_datetime, ['distance', 'air_time', 'dep_delay', 'arr_delay'])
            .assign(fl_date=lambda x: pd.to_datetime(x['fl_date']),
                    dest_airport_id=lambda x: pd.Categorical(x['dest_airport_id']),
                    origin_airport_id=lambda x: pd.Categorical(x['origin_airport_id']),
                    carrier=lambda x: pd.Categorical(x['carrier']),
                    ))
    return df

In [11]:
def extract_city_name(df):
    '''
    Chicago, IL -> Chicago for origin_city_name and dest_city_name
    '''
    cols = ['origin_city_name', 'dest_city_name']
    city = df[cols].apply(lambda x: x.str.extract("(.*), \w{2}", expand=False))
    df = df.copy()
    df[['origin_city_name', 'dest_city_name']] = city
    return df

In [12]:
def time_to_datetime(df, columns):
    '''
    Combine all time items into datetimes.

    2014-01-01,0914 -> 2014-01-01 09:14:00
    '''
    df = df.copy()
    def converter(col):
        timepart = (col.astype(str)
                       .str.replace('\.0$', '')  # NaNs force float dtype
                       .str.pad(4, fillchar='0'))
        return pd.to_datetime(df['fl_date'] + ' ' +
                               timepart.str.slice(0, 2) + ':' +
                               timepart.str.slice(2, 4),
                               errors='coerce')
    df[columns] = df[columns].apply(converter)
    return df

In [13]:
output = 'data/flights.h5'

if not os.path.exists(output):
    df = read("data/flights.csv")
    df.to_hdf(output, 'flights', format='table')
else:
    df = pd.read_hdf(output, 'flights', format='table')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570131 entries, 0 to 570130
Data columns (total 20 columns):
fl_date                  570131 non-null datetime64[ns]
airline_id               570131 non-null int64
carrier                  570131 non-null category
origin_airport_id        570131 non-null category
origin_airport_seq_id    570131 non-null int64
origin_city_market_id    570131 non-null int64
origin_city_name         570131 non-null object
dest_airport_id          570131 non-null category
dest_airport_seq_id      570131 non-null int64
dest_city_market_id      570131 non-null int64
dest_city_name           570131 non-null object
dep_delay                189065 non-null datetime64[ns]
dep_delay_new            552805 non-null float64
arr_delay                169650 non-null datetime64[ns]
arr_delay_new            551505 non-null float64
air_time                 333735 non-null datetime64[ns]
distance                 332081 non-null datetime64[ns]
carrier_delay            97766

In [16]:
(df.dropna(subset=['fl_date', 'carrier'])
   .loc[df['carrier']
       .isin(df['carrier'].value_counts().index[:5])]
   .set_index('fl_date')
   # TimeGrouper to resample & groupby at once
   .groupby(['airline_id', pd.TimeGrouper("H")])
   .fl_num.count()
   .unstack(0)
   .fillna(0)
   .rolling(24)
   .sum()
   .rename_axis("Flights per Day", axis=1)
   .plot()
)
sns.despine()

AttributeError: 'DataFrameGroupBy' object has no attribute 'fl_num'

In [19]:
from pandas.core import datetools
import statsmodels.api as sm