In [2]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Select seaborn's 'ticks' style and 'talk' plotting context for the figures in this notebook.
sns.set(style='ticks', context='talk')



In [6]:
def read(fp):
    '''
    Load the flights CSV at ``fp`` and return a cleaned DataFrame.

    Steps, in order: lower-case the column names, drop the
    'unnamed: 20' column, reduce the city-name columns to the bare city
    (``extract_city_name``), turn the listed columns into datetimes
    (``time_to_datetime``), then parse ``fl_date`` and make the
    airport-id and carrier columns categorical.
    '''
    # NOTE(review): time_to_datetime parses these columns as HHMM clock
    # times joined to fl_date -- confirm that distance / air_time /
    # dep_delay / arr_delay are really the intended inputs.
    time_cols = ['distance', 'air_time', 'dep_delay', 'arr_delay']
    category_cols = ['dest_airport_id', 'origin_airport_id', 'carrier']

    flights = pd.read_csv(fp).rename(columns=str.lower)
    flights = flights.drop('unnamed: 20', axis=1)
    flights = extract_city_name(flights)
    # fl_date is parsed only afterwards: time_to_datetime concatenates it
    # with strings, so it has to still be the value read from the CSV here.
    flights = time_to_datetime(flights, time_cols)

    converted = {'fl_date': pd.to_datetime(flights['fl_date'])}
    for name in category_cols:
        converted[name] = pd.Categorical(flights[name])
    return flights.assign(**converted)

In [7]:
def extract_city_name(df):
    '''
    Chicago, IL -> Chicago for origin_city_name and dest_city_name

    Parameters
    ----------
    df : DataFrame
        Must contain string columns ``origin_city_name`` and
        ``dest_city_name``.

    Returns
    -------
    DataFrame
        A copy of ``df`` whose two city columns hold only the text before
        the trailing ", XX" part.  Entries that do not match that
        pattern come back as missing values (``str.extract`` behaviour).
    '''
    cols = ['origin_city_name', 'dest_city_name']
    # Raw string: "\w" in a plain literal is an invalid escape sequence and
    # triggers a Deprecation/SyntaxWarning on current Python versions.
    pattern = r"(.*), \w{2}"
    city = df[cols].apply(lambda x: x.str.extract(pattern, expand=False))
    df = df.copy()  # leave the caller's frame untouched
    df[cols] = city
    return df

In [8]:
def time_to_datetime(df, columns):
    '''
    Combine all time items into datetimes.

    2014-01-01,0914 -> 2014-01-01 09:14:00

    Parameters
    ----------
    df : DataFrame
        Must contain ``fl_date`` (it is concatenated with strings, so it
        has to hold date strings at this point) and every column named in
        ``columns``.
    columns : list of str
        Columns holding HHMM clock times, as numbers or strings.

    Returns
    -------
    DataFrame
        A copy of ``df`` in which each of ``columns`` has been replaced by
        the datetime built from ``fl_date`` and that column's HHMM value.
        Values that do not parse (e.g. missing times) become NaT because
        of ``errors='coerce'``.
    '''
    df = df.copy()

    def converter(col):
        # regex=True is required: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the '.0' suffix in
        # place and make every float-typed time parse to NaT.
        timepart = (col.astype(str)
                       .str.replace(r'\.0$', '', regex=True)  # NaNs force float dtype
                       .str.pad(4, fillchar='0'))  # 914 -> '0914'
        return pd.to_datetime(df['fl_date'] + ' ' +
                               timepart.str.slice(0, 2) + ':' +
                               timepart.str.slice(2, 4),
                               errors='coerce')
    df[columns] = df[columns].apply(converter)
    return df

In [9]:
# Cache the cleaned flights table as HDF5: the CSV is parsed only when the
# cache file does not exist yet; otherwise the cached table is loaded.
output = 'data/flights.h5'

if not os.path.exists(output):
    df = read("data/flights.csv")
    # Pass the key by keyword; positional use of it is deprecated in
    # recent pandas releases.
    df.to_hdf(output, key='flights', format='table')
else:
    # No `format=` here: read_hdf forwards unknown keywords to HDFStore,
    # which raises ValueError for `format`.  The storage format is taken
    # from the file itself.
    df = pd.read_hdf(output, key='flights')
df.info()

KeyError: 'unique_carrier'