In [1]:
import pandas as pd

In [2]:
# method chaining in pandas

In [20]:
# method chain for dealing with loading dataset into a DataFrame
def read(fp):
    '''
        load the csv at fp and return it as a cleaned DataFrame

        steps, in order: lower-case the column names, drop the
        'unnamed: 36' column, pass the frame through extract_city_names,
        then through time_to_datetime for the four time columns
    '''

    time_cols = ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time']

    return (
        pd.read_csv(fp)
        .rename(columns=str.lower)
        .drop(columns='unnamed: 36')
        .pipe(extract_city_names)
        .pipe(time_to_datetime, time_cols)
    )

def extract_city_names(df):
    '''
        strip the trailing ", XX" suffix from the city name columns,
        e.g. "Chicago, IL" becomes "Chicago"

        df must contain the string columns 'origin_city_name' and
        'dest_city_name'. returns a copy of df with those two columns
        replaced; the frame passed in is not modified. values that do
        not match the "<city>, XX" pattern come back as NaN.
    '''

    cols = ['origin_city_name', 'dest_city_name']
    # raw string: '\w' inside a plain literal is an invalid escape sequence
    city = df[cols].apply(lambda x: x.str.extract(r'(.*), \w{2}', expand=False))
    df = df.copy()
    df[cols] = city

    return df

def time_to_datetime(df, cols):
    '''
        convert hhmm clock-time columns into full datetimes

        each column named in cols holds a time of day written as a number
        (e.g. 530 or 1430.0). it is zero-padded to four characters, joined
        with the date string in df['fl_date'] and parsed by pd.to_datetime.

        returns a copy of df with those columns replaced; the frame passed
        in is not modified. anything that does not parse (e.g. missing
        values) becomes NaT because of errors='coerce'.
    '''

    df = df.copy()

    def converter(col):
        '''
            build "<fl_date> HH:MM" strings for one column and parse them
        '''

        timepart = (col.astype(str)
                       # NaNs force float dtype, so strip the trailing ".0".
                       # regex=True is explicit because newer pandas treats
                       # the pattern as a literal string by default, which
                       # would leave "530.0" untouched and unparseable.
                       .str.replace(r'\.0$', '', regex=True)
                       .str.pad(4, fillchar='0'))

        return pd.to_datetime(df['fl_date'] + ' ' +
                                 timepart.str.slice(0, 2) + ':' +
                                 timepart.str.slice(2, 4),
                                 errors='coerce')

    df[cols] = df[cols].apply(converter)

    return df

In [21]:
# load the on-time csv and run it through the read() cleaning chain
df = read('856263442_T_ONTIME.csv')

In [27]:
# show column dtypes and non-null counts for the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471949 entries, 0 to 471948
Data columns (total 36 columns):
fl_date                  471949 non-null object
unique_carrier           471949 non-null object
airline_id               471949 non-null int64
tail_num                 467903 non-null object
fl_num                   471949 non-null int64
origin_airport_id        471949 non-null int64
origin_airport_seq_id    471949 non-null int64
origin_city_market_id    471949 non-null int64
origin                   471949 non-null object
origin_city_name         471949 non-null object
origin_state_nm          471949 non-null object
dest_airport_id          471949 non-null int64
dest_airport_seq_id      471949 non-null int64
dest_city_market_id      471949 non-null int64
dest                     471949 non-null object
dest_city_name           471949 non-null object
dest_state_nm            471949 non-null object
crs_dep_time             471949 non-null datetime64[ns]
dep_time                 