# Preppin Data Challenge - 2024 Week 1 - Prep Air's Flow Card

###### Importing modules and bringing in the data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# location of the input file for the 2024 week 1 challenge
input_csv = '/home/lyon/Desktop/Python Projects/Data Cleaning/Preppin Data - 2024/Input Files/PD 2024 Wk 1 Input.csv'
df = pd.read_csv(input_csv)

In [4]:
# preview the first five rows of the raw input to see how the columns look
df.head()

Unnamed: 0,Flight Details,Flow Card?,Bags Checked,Meal Type
0,2024-07-22//PA010//Tokyo-New York//Economy//2380,1,0,Egg Free
1,2024-09-28//PA008//Perth-New York//Economy//1855,0,2,Vegetarian
2,2024-04-20//PA002//New York-London//Economy//3490,1,1,Vegan
3,2024-01-23//PA010//Tokyo-New York//Premium Eco...,1,1,Vegetarian
4,2024-10-01//PA008//Perth-New York//Business Cl...,0,0,Vegetarian


In [5]:
# list every column with its non-null count and dtype to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3778 entries, 0 to 3777
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Flight Details  3778 non-null   object
 1   Flow Card?      3778 non-null   int64 
 2   Bags Checked    3778 non-null   int64 
 3   Meal Type       3778 non-null   object
dtypes: int64(2), object(2)
memory usage: 118.2+ KB


###### Splitting up the Flight Details column so that downstream transformations can be applied

In [6]:
# the original Flight Details column packs five values into one string, separated by //
# splitting on that delimiter with expand=True returns a dataframe with one column per value

flight_details = df['Flight Details']
flight_details_split = flight_details.str.split('//', expand=True)

flight_details_split.head()

Unnamed: 0,0,1,2,3,4
0,2024-07-22,PA010,Tokyo-New York,Economy,2380.0
1,2024-09-28,PA008,Perth-New York,Economy,1855.0
2,2024-04-20,PA002,New York-London,Economy,3490.0
3,2024-01-23,PA010,Tokyo-New York,Premium Economy,825.0
4,2024-10-01,PA008,Perth-New York,Business Class,634.8


In [7]:
# this code gives the numbered columns of the flight_details_split dataframe their real names
# it also stores Price as a number: the split returns every piece as text, and leaving
# Price as text would make any later sorting or arithmetic on it behave incorrectly

# the result is assigned back (instead of renaming in place) so the cell gives
# the same outcome however many times it is run

flight_details_split = (
    flight_details_split
    .rename(columns={0: 'Date', 1: 'Flight Number', 2: 'Route', 3: 'Class', 4: 'Price'})
    .assign(Price=lambda frame: pd.to_numeric(frame['Price']))
)

flight_details_split.head()

Unnamed: 0,Date,Flight Number,Route,Class,Price
0,2024-07-22,PA010,Tokyo-New York,Economy,2380.0
1,2024-09-28,PA008,Perth-New York,Economy,1855.0
2,2024-04-20,PA002,New York-London,Economy,3490.0
3,2024-01-23,PA010,Tokyo-New York,Premium Economy,825.0
4,2024-10-01,PA008,Perth-New York,Business Class,634.8


In [8]:
# the Route field of flight_details_split holds both cities in one value (e.g. Tokyo-New York)
# so it is split on the hyphen into a new dataframe, route_split,
# and the resulting columns 0 and 1 are named From and To

route_split = (
    flight_details_split['Route']
    .str.split('-', expand=True)
    .rename(columns={0: 'From', 1: 'To'})
)

route_split.head()

Unnamed: 0,From,To
0,Tokyo,New York
1,Perth,New York
2,New York,London
3,Tokyo,New York
4,Perth,New York


In [9]:
# full_flight_details places the columns of flight_details_split and route_split
# side by side, with rows matched on the index

frames_to_join = [flight_details_split, route_split]
full_flight_details = pd.concat(frames_to_join, axis='columns')

full_flight_details.head()

Unnamed: 0,Date,Flight Number,Route,Class,Price,From,To
0,2024-07-22,PA010,Tokyo-New York,Economy,2380.0,Tokyo,New York
1,2024-09-28,PA008,Perth-New York,Economy,1855.0,Perth,New York
2,2024-04-20,PA002,New York-London,Economy,3490.0,New York,London
3,2024-01-23,PA010,Tokyo-New York,Premium Economy,825.0,Tokyo,New York
4,2024-10-01,PA008,Perth-New York,Business Class,634.8,Perth,New York


In [10]:
# the columns of the original dataframe (df) are placed next to full_flight_details
# Route and Flight Details are then dropped because their contents
# are already available in the split-out columns

splitted_data = (
    pd.concat([full_flight_details, df], axis='columns')
    .drop(columns=['Route', 'Flight Details'])
)

splitted_data.head()

Unnamed: 0,Date,Flight Number,Class,Price,From,To,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Economy,2380.0,Tokyo,New York,1,0,Egg Free
1,2024-09-28,PA008,Economy,1855.0,Perth,New York,0,2,Vegetarian
2,2024-04-20,PA002,Economy,3490.0,New York,London,1,1,Vegan
3,2024-01-23,PA010,Premium Economy,825.0,Tokyo,New York,1,1,Vegetarian
4,2024-10-01,PA008,Business Class,634.8,Perth,New York,0,0,Vegetarian


In [11]:
# put the columns in the order used by the expected results
# on the Preppin' Data challenge webpage

expected_order = ['Date', 'Flight Number', 'From', 'To', 'Class', 'Price', 'Flow Card?', 'Bags Checked', 'Meal Type']
splitted_data = splitted_data.loc[:, expected_order]
splitted_data.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,1,0,Egg Free
1,2024-09-28,PA008,Perth,New York,Economy,1855.0,0,2,Vegetarian
2,2024-04-20,PA002,New York,London,Economy,3490.0,1,1,Vegan
3,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,1,1,Vegetarian
4,2024-10-01,PA008,Perth,New York,Business Class,634.8,0,0,Vegetarian


###### Fully consolidated dataset that can be further filtered down

In [12]:
# this code swaps the original 1 and 0 values of the Flow Card? column for Yes and No
# so that the table aligns with the expected results from the Preppin' Data challenge webpage

# a value mapping is used rather than comparing against 1 so the cell can be run
# more than once: values that are already Yes/No are left untouched, whereas a
# second comparison against 1 would silently turn every row into No

splitted_data['Flow Card?'] = splitted_data['Flow Card?'].replace({1: 'Yes', 0: 'No'})
splitted_data.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,2024-09-28,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
2,2024-04-20,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
3,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
4,2024-10-01,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian


In [13]:
# this code converts the Date column of the splitted_data dataframe to a datetime dtype
# and stores the converted values in a series called dates

# cleaned_data is then built from splitted_data without its text Date column,
# with the converted dates added as the last column (avoiding two Date columns)

# splitted_data itself is left unchanged: overwriting it without its Date column
# would make this cell fail with a KeyError when it is run a second time

dates = pd.to_datetime(splitted_data['Date'])

cleaned_data = splitted_data.drop(columns=['Date']).assign(Date=dates)

cleaned_data.head()

Unnamed: 0,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type,Date
0,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free,2024-07-22
1,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian,2024-09-28
2,PA002,New York,London,Economy,3490.0,Yes,1,Vegan,2024-04-20
3,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian,2024-01-23
4,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian,2024-10-01


In [14]:
# Date was added back as the last column, so it is moved to the front again

final_order = ['Date', 'Flight Number', 'From', 'To', 'Class', 'Price', 'Flow Card?', 'Bags Checked', 'Meal Type']
cleaned_data = cleaned_data.loc[:, final_order]

cleaned_data.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,2024-09-28,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
2,2024-04-20,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
3,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
4,2024-10-01,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian


###### Generating the two final outputs of the challenge 

###### One result set for passengers with a Flow Card and another result set for those without


In [15]:
# keep only the passengers who hold a Flow Card,
# i.e. the records whose Flow Card? value is Yes

flow_card_holders = ['Yes']

holds_flow_card = cleaned_data['Flow Card?'].isin(flow_card_holders)
output_1 = cleaned_data.loc[holds_flow_card]

output_1.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,2024-07-22,PA010,Tokyo,New York,Economy,2380,Yes,0,Egg Free
2,2024-04-20,PA002,New York,London,Economy,3490,Yes,1,Vegan
3,2024-01-23,PA010,Tokyo,New York,Premium Economy,825,Yes,1,Vegetarian
6,2024-06-05,PA006,Tokyo,London,First Class,618,Yes,3,Vegan
8,2024-03-30,PA004,Perth,London,First Class,446,Yes,1,Nut Free


In [16]:
# keep only the passengers without a Flow Card,
# i.e. the records whose Flow Card? value is No

non_flow_card_holders = ['No']

lacks_flow_card = cleaned_data['Flow Card?'].isin(non_flow_card_holders)
output_2 = cleaned_data.loc[lacks_flow_card]

output_2.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
1,2024-09-28,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
4,2024-10-01,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian
5,2024-03-04,PA007,New York,Perth,Business Class,458.4,No,3,Nut Free
7,2024-02-25,PA010,Tokyo,New York,Premium Economy,1435.0,No,0,
13,2024-03-29,PA004,Perth,London,Economy,2730.0,No,2,Vegan


###### Saving the two outputs as csv files


In [29]:
# write the Flow Card holders to disk in csv format, leaving out the row index
flowcard_path = '/home/lyon/Desktop/Python Projects/Data Cleaning/Preppin Data - 2024/Input Files/flowcard_data'
output_1.to_csv(flowcard_path, index=False)

In [30]:
# write the passengers without a Flow Card to disk in csv format, leaving out the row index
non_flowcard_path = '/home/lyon/Desktop/Python Projects/Data Cleaning/Preppin Data - 2024/Input Files/non_flowcard_data'
output_2.to_csv(non_flowcard_path, index=False)