Preprocessing data

In [152]:
import os
import pandas as pd
import numpy as np
import datetime
import fastparquet as fp
from dateutil import parser
import calendar
import pickle

In [153]:
# Show the kernel's working directory: the relative path '2-departures_data_dict.csv' read further down is resolved against it.
os.getcwd()

'C:\\Users\\ellio\\Desktop\\Python\\scripts\\ML_flight_cancellation_prediction\\2- Preprocessing'

## Utility functions

In [154]:
def cols_to_snake(df):
    ''' Rewrite the column headers of a dataframe as lower snake case.

    Every header is cast to a string and lower-cased; "-", " " and "." become
    "_" and "/" is removed.

    Input: pd dataframe (its columns are renamed in place)
    Outputs: the same pd dataframe, now with clean headers

    '''

    # (old, new) pairs, applied in this order to every header
    substitutions = (("-", "_"), (" ", "_"), ("/", ""), (".", "_"))

    def _tidy(label):
        for old, new in substitutions:
            label = label.replace(old, new)
        return label

    df.columns = df.columns.astype("str").str.lower().map(_tidy)
    return df


def create_data_dict(dataframe, csv_name):

    """Build a skeleton data dictionary for a dataframe and save it as a csv.

    The csv is a starting point only: data categories, variable descriptions,
    include (y/n)? and validations still need to be added by hand.

    Parameters
    ----------
    Input: dataframe: a dataframe for which you would like a dictionary
           csv_name: a 'string' which will become part of your csv file name
                     (the file is written as '2-<csv_name>.csv' in the working directory)

    Output: a dataframe with one row per column of `dataframe`, holding the
            variable name and its data type. The rows are indexed by column name.
    """

    # Describe the frame that was passed in. The previous version read the notebook-level
    # `data` variable here, so the `dataframe` argument was silently ignored.
    data_dict = pd.DataFrame({
        "variable": list(dataframe.columns),
        "data_type": dataframe.dtypes,
    })

    data_dict.to_csv('2-' + csv_name + '.csv')

    return data_dict

In [156]:
#read data

# The datasets folder sits next to this notebook's folder, so the path is built relative to the
# working directory (as the data dictionary csv path already is) instead of a user-specific absolute path.
data = pd.read_parquet(os.path.join(os.pardir, 'datasets', '1.1-full_departures.parquet.gzip'))

In [157]:
# regularise headers to snakecase
# cols_to_snake renames the columns on the frame it is given and returns that same frame
data = cols_to_snake(data)
# preview the first two rows to check the cleaned headers
data.head(2)

Unnamed: 0,type,status,departure_iatacode,departure_icaocode,departure_terminal,departure_scheduledtime,arrival_iatacode,arrival_icaocode,arrival_terminal,arrival_scheduledtime,...,departure_actualrunway,arrival_estimatedtime,arrival_baggage,arrival_gate,arrival_delay,arrival_actualtime,arrival_estimatedrunway,arrival_actualrunway,error,success
0,departure,unknown,lhr,egll,2,2020-06-01 06:40:00,dub,eidw,2,2020-06-01t08:05:00.000,...,,,,,,,,,,
1,departure,unknown,lhr,egll,2,2020-06-01 06:40:00,dub,eidw,2,2020-06-01t08:05:00.000,...,,,,,,,,,,


In [158]:
# create a data dictionary

# comment this out so not to overwrite the manual adds
# (create_data_dict writes '2-departures_data_dict.csv', the same file that is read below)

#create_data_dict(data, 'departures_data_dict')

# load the hand-annotated data dictionary; the path is relative to the working directory
data_dict = pd.read_csv('2-departures_data_dict.csv')

display(data_dict)

Unnamed: 0.1,Unnamed: 0,variable,data_type,var_type,include
0,1,status,object,flight_status,y
1,4,departure_terminal,object,terminal,y
2,5,departure_scheduledtime,object,time,y
3,8,arrival_terminal,object,terminal,y
4,10,airline_name,object,airline,y
5,13,flight_number,object,id_number,y
6,22,departure_gate,object,gate,y
7,23,departure_delay,float64,time,y
8,24,departure_estimatedtime,object,time,y
9,25,departure_actualtime,object,time,y


In [159]:
# Remove unwanted columns
# Rows of the data dictionary flagged include == 'y' name the variables to keep
included = data_dict["include"] == "y"
columns = data_dict.loc[included, "variable"]

data = data[columns]

# write to new data.dict so as not to query missing columns
data_dict = data_dict.loc[included, :]

# index=False: the row index is not part of the dictionary. Writing it back added one more
# 'Unnamed: 0' column to the csv every time this notebook was run.
# NOTE(review): this replaces the hand-annotated csv in place, so rows with include != 'y'
# are no longer in the file after the first run - confirm that is intended.
data_dict.to_csv('2-departures_data_dict.csv', index=False)

#only included value in our data dictionary
data_dict.head(3)

Unnamed: 0.1,Unnamed: 0,variable,data_type,var_type,include
0,1,status,object,flight_status,y
1,4,departure_terminal,object,terminal,y
2,5,departure_scheduledtime,object,time,y


Because this exercise is about cancelled flights, we use the scheduled departure time (`departure_scheduledtime`) as the reference timestamp for each flight.

In [160]:
# dates and times to dates and time format

# Names of the variables the data dictionary tags as var_type == 'time'
time_columns = data_dict.loc[data_dict["var_type"] == "time", "variable"]

time_cols = data[time_columns]

# errors='coerce' turns any value that cannot be parsed into NaT instead of raising
# NOTE(review): the data dictionary tags the float64 departure_delay as 'time', so it is also
# pushed through pd.to_datetime here (arrival_delay appears to be treated the same way).
# A delay is a duration, not a timestamp - confirm whether these columns should be converted at all.
time_cols_good_formats = time_cols.apply(pd.to_datetime, errors='coerce')

# Drop the unparsed originals by label, then append the parsed versions at the end of the frame
no_bad_time_data = data.drop(columns = time_columns)

time_parsed_data = pd.concat([no_bad_time_data, time_cols_good_formats], axis = 1)

time_parsed_data.head(2)

Unnamed: 0,status,departure_terminal,arrival_terminal,airline_name,flight_number,departure_gate,departure_estimatedrunway,departure_actualrunway,arrival_gate,departure_scheduledtime,departure_delay,departure_estimatedtime,departure_actualtime,arrival_estimatedtime,arrival_delay,arrival_actualtime
0,unknown,2,2,british airways,5949,,,,,2020-06-01 06:40:00,NaT,NaT,NaT,NaT,NaT,NaT
1,unknown,2,2,etihad airways,1306,,,,,2020-06-01 06:40:00,NaT,NaT,NaT,NaT,NaT,NaT


In [161]:
# Derive EDA features from our reference date - departure_scheduledtime:
# the day name, the clock time and the calendar date of each scheduled departure

scheduled_dt = time_parsed_data['departure_scheduledtime'].dt

time_parsed_data['weekday'] = scheduled_dt.day_name()
time_parsed_data['time_of_day'] = scheduled_dt.time
time_parsed_data['date'] = scheduled_dt.date

time_parsed_data.head(3)

Unnamed: 0,status,departure_terminal,arrival_terminal,airline_name,flight_number,departure_gate,departure_estimatedrunway,departure_actualrunway,arrival_gate,departure_scheduledtime,departure_delay,departure_estimatedtime,departure_actualtime,arrival_estimatedtime,arrival_delay,arrival_actualtime,weekday,time_of_day,date
0,unknown,2,2,british airways,5949,,,,,2020-06-01 06:40:00,NaT,NaT,NaT,NaT,NaT,NaT,Monday,06:40:00,2020-06-01
1,unknown,2,2,etihad airways,1306,,,,,2020-06-01 06:40:00,NaT,NaT,NaT,NaT,NaT,NaT,Monday,06:40:00,2020-06-01
2,unknown,2,2,united airlines,7649,,,,,2020-06-01 06:40:00,NaT,NaT,NaT,NaT,NaT,NaT,Monday,06:40:00,2020-06-01


Preprocessing is largely done. Write the data to a pickle file in the datasets folder, ready for EDA; a pickle file is used because it preserves the parsed date and time formats.

In [162]:
# set our outcome variable
# 1 where the flight status is 'cancelled', 0 for every other status

is_cancelled = time_parsed_data['status'] == 'cancelled'
time_parsed_data['cancelled_outcome'] = np.where(is_cancelled, 1, 0)

In [163]:
# Save the formatted frame for the EDA notebook. The datasets folder sits next to this notebook's
# folder, so the path is built relative to the working directory instead of a user-specific absolute path.
time_parsed_data.to_pickle(path = os.path.join(os.pardir, 'datasets', '2-formatted_data_for_EDA.pickle'))