Preprocessing data

In [15]:
import os
import pandas as pd
import numpy as np
import datetime
import fastparquet as fp
from dateutil import parser
import calendar
import pickle

In [16]:
# Show the working directory: relative paths used below (e.g. the data dictionary CSV) resolve against it
os.getcwd()

'C:\\Users\\ellio\\Desktop\\Python\\scripts\\ML_flight_cancellation_prediction\\2- Preprocessing'

## Utility functions

In [17]:
def cols_to_snake(df):
    '''Rewrite every column header of a dataframe in snake case.

    Headers are cast to strings and lower-cased; "-", " " and "." become "_"
    and "/" is removed. The headers are replaced on the dataframe that was
    passed in, and that same dataframe is returned.

    Input: pd dataframe
    Outputs: the same pd dataframe with clean headers

    '''
    # One translation table covers all four character substitutions
    separator_table = str.maketrans({"-": "_", " ": "_", ".": "_", "/": None})

    lowered_headers = df.columns.astype("str").str.lower()
    df.columns = lowered_headers.map(lambda header: header.translate(separator_table))
    return df


def create_data_dict(dataframe, csv_name):

    """Build a skeleton data dictionary for a dataframe and save it as a CSV.

    The dictionary lists every column of ``dataframe`` together with its dtype.
    Data categories, variable descriptions, include (y/n)? and validations
    still have to be entered into the CSV manually afterwards.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe for which you would like a dictionary.
    csv_name : str
        Becomes part of the CSV file name: the dictionary is written to
        ``'2-' + csv_name + '.csv'`` in the current working directory.

    Returns
    -------
    pd.DataFrame
        One row per column of ``dataframe`` (indexed by column name) holding
        the ``variable`` name and its ``data_type``.
    """

    # Describe the dataframe that was passed in, not a global that happens to be called `data`
    dictionary_frame = pd.DataFrame({
        "variable": list(dataframe.columns),
        "data_type": dataframe.dtypes,
    })

    dictionary_frame.to_csv('2-' + csv_name + '.csv')

    return dictionary_frame

In [18]:
# Read the raw departures data.
# The path is built relative to the working directory (the '2- Preprocessing' folder, a
# sibling of 'datasets'), just like the relative data dictionary CSV path used further down,
# so the notebook does not depend on a user-specific absolute path.
raw_departures_path = os.path.join(os.pardir, 'datasets', '1.1-full_departures_3012.parquet.gzip')
data = pd.read_parquet(raw_departures_path)

In [19]:
# Normalise the column headers to snake case, then preview the first two rows
data = data.pipe(cols_to_snake)
data.head(n=2)

Unnamed: 0,type,status,departure_iatacode,departure_icaocode,departure_terminal,departure_scheduledtime,arrival_iatacode,arrival_icaocode,arrival_terminal,arrival_scheduledtime,...,departure_actualtime,departure_estimatedrunway,departure_actualrunway,arrival_estimatedtime,arrival_baggage,arrival_delay,arrival_gate,arrival_actualtime,arrival_estimatedrunway,arrival_actualrunway
0,departure,unknown,lhr,egll,2,2020-05-15 06:40:00,arn,essa,5,2020-05-15t10:05:00.000,...,,,,,,,,,,
1,departure,unknown,lhr,egll,2,2020-05-15 06:40:00,arn,essa,5,2020-05-15t10:05:00.000,...,,,,,,,,,,


In [20]:
# Load the data dictionary that describes each column.

# The call that generates the dictionary stays disabled on purpose:
# running it again would overwrite the manual additions made to the CSV.

#create_data_dict(data, 'departures_data_dict')

data_dict = pd.read_csv(filepath_or_buffer='2-departures_data_dict.csv')

display(data_dict)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,variable,data_type,var_type,include
0,0,type,type,object,none,y
1,1,status,status,object,flight_status,y
2,2,departure_iatacode,departure_iatacode,object,id,y
3,3,departure_icaocode,departure_icaocode,object,id,y
4,4,departure_terminal,departure_terminal,object,terminal,y
5,5,departure_scheduledtime,departure_scheduledtime,datetime64[ns],time,y
6,8,arrival_terminal,arrival_terminal,object,terminal,y
7,9,arrival_scheduledtime,arrival_scheduledtime,object,time,y
8,10,airline_name,airline_name,object,airline,y
9,13,flight_number,flight_number,object,id,y


In [21]:
# Remove unwanted columns - keep only those annotated in the dictionary with include = 'y'
included = data_dict["include"] == "y"
columns = data_dict.loc[included, "variable"]  # names of the columns to keep

data = data[columns]

# Narrow the data dictionary to the same variables so later lookups never name a dropped column
data_dict = data_dict.loc[included, :]

# index=False: the row index is not data. Writing it made every save / re-read cycle
# add one more 'Unnamed: 0' column to the CSV.
# NOTE(review): this overwrites the manually annotated CSV with the filtered version, so
# rows marked include = 'n' are lost from the file - confirm that is intended.
data_dict.to_csv('2-departures_data_dict.csv', index=False)

# only the included variables remain in our data dictionary
data_dict.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,variable,data_type,var_type,include
0,0,type,type,object,none,y
1,1,status,status,object,flight_status,y
2,2,departure_iatacode,departure_iatacode,object,id,y


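As this is an exercise in predicting cancelled flights, we will use the scheduled departure time (`departure_scheduledtime`) as the reference timestamp: a cancelled flight never actually departs, so its scheduled time is the one timestamp we can rely on.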

In [23]:
# Convert the date and time columns to datetime format

# Names of the columns that the data dictionary classifies as var_type == 'time'
time_columns = data_dict.loc[data_dict["var_type"] == "time", "variable"]

time_cols = data[time_columns]

# errors='coerce': values that cannot be parsed become NaT instead of raising
# NOTE(review): every column tagged 'time' is parsed as a timestamp, and the preview
# shows departure_delay among them - confirm a delay should be stored as a datetime.
time_cols_good_formats = time_cols.apply(pd.to_datetime, errors='coerce')

# Drop by an explicit list of labels rather than passing the time_cols DataFrame,
# which only works because iterating a DataFrame yields its column labels
no_bad_time_data = data.drop(columns=list(time_columns))

# The parsed time columns are appended after the remaining columns
time_parsed_data = pd.concat([no_bad_time_data, time_cols_good_formats], axis=1)

time_parsed_data.head(2)

Unnamed: 0,type,status,departure_iatacode,departure_icaocode,departure_terminal,arrival_terminal,airline_name,flight_number,flight_iatanumber,codeshared_airline_name,codeshared_flight_number,departure_gate,departure_estimatedrunway,departure_actualrunway,departure_scheduledtime,arrival_scheduledtime,departure_delay,departure_estimatedtime,departure_actualtime
0,departure,unknown,lhr,egll,2,5,thai airways international,6225,tg6225,sas,1530.0,,,,2020-05-15 06:40:00,2020-05-15 10:05:00,NaT,NaT,NaT
1,departure,unknown,lhr,egll,2,5,sas,1530,sk1530,,,,,,2020-05-15 06:40:00,2020-05-15 10:05:00,NaT,NaT,NaT


In [24]:
# Derive weekday, time of day and calendar date from the reference
# timestamp (departure_scheduledtime); these are used in the EDA

scheduled_departure = time_parsed_data['departure_scheduledtime'].dt

time_parsed_data['weekday'] = scheduled_departure.day_name()
time_parsed_data['time_of_day'] = scheduled_departure.time
time_parsed_data['date'] = scheduled_departure.date

time_parsed_data.head(n=3)

Unnamed: 0,type,status,departure_iatacode,departure_icaocode,departure_terminal,arrival_terminal,airline_name,flight_number,flight_iatanumber,codeshared_airline_name,...,departure_estimatedrunway,departure_actualrunway,departure_scheduledtime,arrival_scheduledtime,departure_delay,departure_estimatedtime,departure_actualtime,weekday,time_of_day,date
0,departure,unknown,lhr,egll,2,5,thai airways international,6225,tg6225,sas,...,,,2020-05-15 06:40:00,2020-05-15 10:05:00,NaT,NaT,NaT,Friday,06:40:00,2020-05-15
1,departure,unknown,lhr,egll,2,5,sas,1530,sk1530,,...,,,2020-05-15 06:40:00,2020-05-15 10:05:00,NaT,NaT,NaT,Friday,06:40:00,2020-05-15
2,departure,unknown,lhr,egll,2,3,sas,500,sk500,,...,,,2020-05-15 06:45:00,2020-05-15 09:35:00,NaT,NaT,NaT,Friday,06:45:00,2020-05-15


Preprocessing is largely done. Set the outcome variable, then write the result to the datasets folder as a pickle file and start the EDA. A pickle file is used because it preserves the parsed datetime formats.

In [25]:
# Outcome variable: 1 when the flight status is 'cancelled', otherwise 0

is_cancelled = time_parsed_data['status'].eq('cancelled')
time_parsed_data['cancelled_outcome'] = np.where(is_cancelled, 1, 0)

In [26]:
# Save the preprocessed frame to the datasets folder for the EDA step.
# The path is built relative to the working directory (the '2- Preprocessing' folder, a
# sibling of 'datasets'), just like the relative data dictionary CSV path used earlier,
# so the notebook does not depend on a user-specific absolute path.
formatted_data_path = os.path.join(os.pardir, 'datasets', '2-formatted_data_for_EDA.pickle')
time_parsed_data.to_pickle(path=formatted_data_path)