In [3]:
import pandas as pd

In [2]:
# method chaining in pandas

In [6]:
# method chain that loads the dataset into a DataFrame and cleans it up for use
def read(fp):
    '''
        load the flights CSV at `fp` and return it as a cleaned DataFrame

        steps, in order: lower-case the column names, drop the
        'unnamed: 36' column, shorten the city names, turn the clock-time
        columns into datetimes, then parse fl_date and make the code-like
        columns categorical
    '''

    time_cols = ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time']
    categorical_cols = ['dest', 'origin', 'tail_num',
                        'unique_carrier', 'cancellation_code']

    raw = pd.read_csv(fp)
    lowered = raw.rename(columns=str.lower).drop('unnamed: 36', axis=1)

    # time_to_datetime runs while fl_date is still text; fl_date itself is
    # only parsed in the assign step below
    cleaned = time_to_datetime(extract_city_names(lowered), time_cols)

    converters = {'fl_date': lambda frame: pd.to_datetime(frame['fl_date'])}
    for name in categorical_cols:
        # bind `name` as a default so each lambda keeps its own column
        converters[name] = lambda frame, name=name: pd.Categorical(frame[name])

    return cleaned.assign(**converters)

def extract_city_names(df):
    '''
        strip the trailing state code from the city-name columns,
        e.g. "Chicago, IL" becomes "Chicago"

        df: DataFrame with string columns 'origin_city_name' and
            'dest_city_name'

        returns a copy of df with both columns replaced; the frame passed
        in is not modified. values that do not match "<city>, <2 chars>"
        come back as NaN because str.extract yields NaN on no match
    '''

    cols = ['origin_city_name', 'dest_city_name']
    # raw string: '\w' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning on python 3.6+, SyntaxWarning on 3.12+)
    pattern = r'(.*), \w{2}'
    city = df[cols].apply(lambda x: x.str.extract(pattern, expand=False))
    df = df.copy()
    df[cols] = city

    return df

def time_to_datetime(df, cols):
    '''
        convert hhmm clock-time columns into full datetimes by combining
        them with the flight date of the same row

        df:   DataFrame holding the columns in `cols` plus 'fl_date';
              fl_date must still be text because it is concatenated with
              the time string
        cols: list of column names whose values are hhmm numbers
              (e.g. 530 or 1234.0)

        returns a copy of df in which every column in `cols` is a datetime;
        values that cannot be parsed (e.g. missing times) become NaT
    '''

    df = df.copy()

    def converter(col):
        '''
            turn one hhmm column into datetimes on the row's fl_date
        '''

        timepart = (col.astype(str)
                       # NaNs force float dtype, so drop the trailing ".0".
                       # regex=True is explicit because pandas 2.0 made
                       # str.replace literal by default, which would leave
                       # the ".0" in place and corrupt short times
                       .str.replace(r'\.0$', '', regex=True)
                       .str.pad(4, fillchar='0'))

        return pd.to_datetime(df['fl_date'] + ' ' +
                                 timepart.str.slice(0, 2) + ':' +
                                 timepart.str.slice(2, 4),
                                 errors='coerce')

    df[cols] = df[cols].apply(converter)

    return df

In [7]:
# run the read() pipeline on the on-time flights CSV and keep the cleaned frame
df = read('856263442_T_ONTIME.csv')

In [8]:
# check the resulting dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471949 entries, 0 to 471948
Data columns (total 36 columns):
fl_date                  471949 non-null datetime64[ns]
unique_carrier           471949 non-null category
airline_id               471949 non-null int64
tail_num                 467903 non-null category
fl_num                   471949 non-null int64
origin_airport_id        471949 non-null int64
origin_airport_seq_id    471949 non-null int64
origin_city_market_id    471949 non-null int64
origin                   471949 non-null category
origin_city_name         471949 non-null object
origin_state_nm          471949 non-null object
dest_airport_id          471949 non-null int64
dest_airport_seq_id      471949 non-null int64
dest_city_market_id      471949 non-null int64
dest                     471949 non-null category
dest_city_name           471949 non-null object
dest_state_nm            471949 non-null object
crs_dep_time             471949 non-null datetime64[ns]
dep_time 

In [34]:
# using decorators for logging
from functools import wraps
import logging

def log_shape(func):
    '''
        decorator: call `func`, then log its name immediately followed by
        the `.shape` of the returned object, and hand the result back

        the message goes through logging.error, i.e. ERROR level on the
        root logger
    '''

    def wrapper(*args, **kwargs):
        outcome = func(*args, **kwargs)
        message = '{}{}'.format(func.__name__, outcome.shape)
        logging.error(message)
        return outcome

    # copy func's name/docstring onto the wrapper
    return wraps(func)(wrapper)

def log_dtypes(func):
    '''
        decorator: call `func`, then log its name immediately followed by
        the `.dtypes` of the returned object, and hand the result back

        the message goes through logging.error, i.e. ERROR level on the
        root logger
    '''

    def wrapper(*args, **kwargs):
        outcome = func(*args, **kwargs)
        message = '{}{}'.format(func.__name__, outcome.dtypes)
        logging.error(message)
        return outcome

    # copy func's name/docstring onto the wrapper
    return wraps(func)(wrapper)


@log_shape
@log_dtypes
def load(fp):
    '''
        read the CSV at `fp`, using its first column as the index and
        asking pandas to parse that index as dates

        the decorators log the dtypes and then the shape of the frame
        before it is returned
    '''
    read_options = dict(index_col=0, parse_dates=True)
    return pd.read_csv(fp, **read_options)


In [35]:
# demo of the logging decorators: dtypes are logged first (inner decorator),
# then the shape; the returned frame is displayed as the cell's last expression
load('856263442_T_ONTIME.csv')

ERROR:root:loadUNIQUE_CARRIER            object
AIRLINE_ID                 int64
TAIL_NUM                  object
FL_NUM                     int64
ORIGIN_AIRPORT_ID          int64
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
ORIGIN_STATE_NM           object
DEST_AIRPORT_ID            int64
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
DEST_STATE_NM             object
CRS_DEP_TIME               int64
DEP_TIME                 float64
DEP_DELAY                float64
TAXI_OUT                 float64
WHEELS_OFF               float64
WHEELS_ON                float64
TAXI_IN                  float64
CRS_ARR_TIME               int64
ARR_TIME                 float64
ARR_DELAY                float64
CANCELLED                float64
CANCELLATION_CODE         object
DIVERTED                 float64
DISTANCE                 flo

Unnamed: 0_level_0,UNIQUE_CARRIER,AIRLINE_ID,TAIL_NUM,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_NM,...,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 36
FL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,OO,20304,N951SW,6252,11603,1160302,31603,EUG,"Eugene, OR",Oregon,...,0.0,,0.0,451.0,,,,,,
2014-01-01,OO,20304,N951SW,6252,14771,1477101,32457,SFO,"San Francisco, CA",California,...,0.0,,0.0,451.0,,,,,,
2014-01-01,OO,20304,N796SK,6253,10372,1037203,30372,ASE,"Aspen, CO",Colorado,...,0.0,,0.0,125.0,6.0,0.0,0.0,0.0,78.0,
2014-01-01,OO,20304,N745SK,6370,11292,1129202,30325,DEN,"Denver, CO",Colorado,...,0.0,,0.0,1201.0,24.0,0.0,15.0,0.0,63.0,
2014-01-01,OO,20304,N967SW,6371,12892,1289203,32575,LAX,"Los Angeles, CA",California,...,0.0,,0.0,954.0,,,,,,
2014-01-01,OO,20304,N925SW,6371,14747,1474703,30559,SEA,"Seattle, WA",Washington,...,0.0,,0.0,954.0,0.0,0.0,0.0,0.0,35.0,
2014-01-01,OO,20304,N936SW,6373,11292,1129202,30325,DEN,"Denver, CO",Colorado,...,0.0,,0.0,959.0,0.0,0.0,0.0,0.0,67.0,
2014-01-01,OO,20304,N772SK,6374,10397,1039705,30397,ATL,"Atlanta, GA",Georgia,...,0.0,,0.0,1199.0,0.0,0.0,4.0,0.0,31.0,
2014-01-01,OO,20304,N772SK,6374,11292,1129202,30325,DEN,"Denver, CO",Colorado,...,0.0,,0.0,1199.0,0.0,0.0,38.0,0.0,0.0,
2014-01-01,OO,20304,N716SK,6375,14771,1477101,32457,SFO,"San Francisco, CA",California,...,0.0,,0.0,522.0,63.0,0.0,0.0,0.0,0.0,


In [31]:
# shell escape: list the files in the notebook's working directory
!ls

856263442_T_ONTIME.csv	flights.csv.zip  part1.ipynb  part2.ipynb
