# Preppin' Data
## 2024: Week 9 - Prep Air Capacity
**Created by:** Jenny Martin | [Challenge Link](https://preppindata.blogspot.com/2024/02/2024-week-9-prep-air-capacity.html)

Prep Air would like to do some analysis on how their flights are filling up over time. <br>They've given a small sample of flights that will be taking off next month, and the actions that customers who have booked those flights have been taking. 

In [61]:
# Input the xlsx files
import pandas as pd
import numpy as np

customer = pd.read_excel("PD 2024 Week 9 Input.xlsx", sheet_name="Customer Actions")
customer

Unnamed: 0,Flight Number,Flight Date,Customer ID,Action,Date,Class,Row,Seat
0,PA001,2024-03-05,20,Booked,2023-12-01,First,7.0,3.0
1,PA001,2024-03-05,20,Cancelled,2023-12-24,,,
2,PA001,2024-03-05,72,Booked,2023-12-25,First,8.0,2.0
3,PA001,2024-03-05,82,Booked,2024-01-31,First,5.0,2.0
4,PA001,2024-03-05,190,Booked,2024-01-07,Business,12.0,3.0
...,...,...,...,...,...,...,...,...
1694,PA012,2024-03-10,9547,Cancelled,2023-12-23,,,
1695,PA012,2024-03-10,9956,Booked,2024-02-07,Business,17.0,2.0
1696,PA012,2024-03-10,9957,Booked,2024-01-29,Business,9.0,4.0
1697,PA012,2024-03-10,9957,Seat Changed,2024-02-07,Business,15.0,1.0


In [62]:
flights = pd.read_excel("PD 2024 Week 9 Input.xlsx", sheet_name="Flight Details")
flights

Unnamed: 0,Flight Number,Flight Date,Class,Capacity
0,PA001,2024-03-05,First,32
1,PA001,2024-03-05,Business,40
2,PA001,2024-03-05,Premium Economy,64
3,PA001,2024-03-05,Economy,160
4,PA002,2024-03-30,First,32
5,PA002,2024-03-30,Business,40
6,PA002,2024-03-30,Premium Economy,64
7,PA002,2024-03-30,Economy,160
8,PA003,2024-03-06,First,32
9,PA003,2024-03-06,Business,40


In [63]:
# If the customer has cancelled their flight, make sure all rows are filtered out for that flight

customer_cancelled = customer[customer["Action"] == "Cancelled"]
customer_cancelled = customer_cancelled.drop(columns="Flight Date").drop(columns="Action").drop(columns="Date").drop(columns="Class").drop(columns="Row").drop(columns="Seat")
customer_cancelled

Unnamed: 0,Flight Number,Customer ID
1,PA001,20
11,PA001,253
14,PA001,324
16,PA001,326
20,PA001,903
...,...,...
1677,PA012,9093
1684,PA012,9336
1688,PA012,9433
1692,PA012,9493


In [64]:
merged_customers = customer.merge(customer_cancelled, how="left", on=["Flight Number", "Customer ID"], indicator=True)
merged_customers = merged_customers[merged_customers["_merge"] == "left_only"]
merged_customers = merged_customers.drop(columns="_merge")
merged_customers

Unnamed: 0,Flight Number,Flight Date,Customer ID,Action,Date,Class,Row,Seat
2,PA001,2024-03-05,72,Booked,2023-12-25,First,8.0,2.0
3,PA001,2024-03-05,82,Booked,2024-01-31,First,5.0,2.0
4,PA001,2024-03-05,190,Booked,2024-01-07,Business,12.0,3.0
5,PA001,2024-03-05,190,Upgraded,2024-01-31,First,5.0,3.0
6,PA001,2024-03-05,190,Seat Changed,2024-02-28,First,3.0,4.0
...,...,...,...,...,...,...,...,...
1681,PA012,2024-03-10,9157,Seat Changed,2024-03-06,Business,13.0,2.0
1695,PA012,2024-03-10,9956,Booked,2024-02-07,Business,17.0,2.0
1696,PA012,2024-03-10,9957,Booked,2024-01-29,Business,9.0,4.0
1697,PA012,2024-03-10,9957,Seat Changed,2024-02-07,Business,15.0,1.0


In [65]:
# For each customer on a flight, filter the dataset to their most recent action

max_index = merged_customers.groupby(["Flight Number", "Customer ID"])["Date"].idxmax()
last_booking = merged_customers.loc[max_index]
last_booking

Unnamed: 0,Flight Number,Flight Date,Customer ID,Action,Date,Class,Row,Seat
2,PA001,2024-03-05,72,Booked,2023-12-25,First,8.0,2.0
3,PA001,2024-03-05,82,Booked,2024-01-31,First,5.0,2.0
6,PA001,2024-03-05,190,Seat Changed,2024-02-28,First,3.0,4.0
8,PA001,2024-03-05,228,Upgraded,2024-01-02,First,7.0,2.0
18,PA001,2024-03-05,330,Seat Changed,2024-02-13,Business,9.0,1.0
...,...,...,...,...,...,...,...,...
1674,PA012,2024-03-10,8779,Booked,2024-01-06,Business,11.0,1.0
1678,PA012,2024-03-10,9109,Booked,2023-12-14,Premium Economy,22.0,5.0
1681,PA012,2024-03-10,9157,Seat Changed,2024-03-06,Business,13.0,2.0
1695,PA012,2024-03-10,9956,Booked,2024-02-07,Business,17.0,2.0


In [66]:
# Based on the Date field, create a field which shows how many seats in total have been booked as of that date for each flight and class
# Hint: Running Sum could be useful here!

last_booking = last_booking.sort_values(by="Date")
last_booking

Unnamed: 0,Flight Number,Flight Date,Customer ID,Action,Date,Class,Row,Seat
1513,PA012,2024-03-10,808,Booked,2023-11-12,Business,12.0,1.0
1452,PA011,2024-03-01,7083,Booked,2023-11-14,Economy,40.0,9.0
1492,PA011,2024-03-01,9433,Booked,2023-11-16,Economy,34.0,6.0
1048,PA009,2024-03-13,3379,Booked,2023-11-21,Premium Economy,21.0,1.0
1636,PA012,2024-03-10,7083,Booked,2023-11-21,Business,11.0,1.0
...,...,...,...,...,...,...,...,...
684,PA006,2024-03-27,9939,Upgraded,2024-03-26,Premium Economy,26.0,1.0
181,PA002,2024-03-30,1346,Seat Changed,2024-03-26,First,5.0,2.0
951,PA008,2024-03-30,7719,Upgraded,2024-03-26,First,3.0,4.0
886,PA008,2024-03-30,3734,Seat Changed,2024-03-26,Premium Economy,22.0,8.0


In [67]:
last_booking["Total Seats booked over time"] = last_booking.groupby(["Flight Number", "Class"]).cumcount() + 1
last_booking

Unnamed: 0,Flight Number,Flight Date,Customer ID,Action,Date,Class,Row,Seat,Total Seats booked over time
1513,PA012,2024-03-10,808,Booked,2023-11-12,Business,12.0,1.0,1
1452,PA011,2024-03-01,7083,Booked,2023-11-14,Economy,40.0,9.0,1
1492,PA011,2024-03-01,9433,Booked,2023-11-16,Economy,34.0,6.0,2
1048,PA009,2024-03-13,3379,Booked,2023-11-21,Premium Economy,21.0,1.0,1
1636,PA012,2024-03-10,7083,Booked,2023-11-21,Business,11.0,1.0,2
...,...,...,...,...,...,...,...,...,...
684,PA006,2024-03-27,9939,Upgraded,2024-03-26,Premium Economy,26.0,1.0,14
181,PA002,2024-03-30,1346,Seat Changed,2024-03-26,First,5.0,2.0,1
951,PA008,2024-03-30,7719,Upgraded,2024-03-26,First,3.0,4.0,7
886,PA008,2024-03-30,3734,Seat Changed,2024-03-26,Premium Economy,22.0,8.0,21


In [68]:
# calculate max booking per flight (delete all other rows)
max_index = last_booking.groupby(["Flight Number", "Class"])["Date"].idxmax()
current_flight_bookings = last_booking.loc[max_index]
current_flight_bookings = current_flight_bookings.drop(columns="Customer ID").drop(columns="Action").drop(columns="Row").drop(columns="Seat")
current_flight_bookings

Unnamed: 0,Flight Number,Flight Date,Date,Class,Total Seats booked over time
117,PA001,2024-03-05,2024-03-01,Business,17
30,PA001,2024-03-05,2024-03-01,First,27
84,PA001,2024-03-05,2024-01-30,Premium Economy,2
239,PA002,2024-03-30,2024-03-22,Business,11
181,PA002,2024-03-30,2024-03-26,First,1
232,PA002,2024-03-30,2024-03-13,Premium Economy,13
318,PA003,2024-03-06,2024-03-02,Business,7
306,PA003,2024-03-06,2024-03-01,Economy,7
339,PA003,2024-03-06,2024-02-22,First,5
272,PA003,2024-03-06,2024-03-06,Premium Economy,18


In [69]:
# Bring in information about the Flight Details

flights_booked = flights.merge(current_flight_bookings, how="left", on=["Flight Number", "Class"])
flights_booked = flights_booked.drop(columns="Flight Date_y")
flights_booked.head(10)

Unnamed: 0,Flight Number,Flight Date_x,Class,Capacity,Date,Total Seats booked over time
0,PA001,2024-03-05,First,32,2024-03-01,27.0
1,PA001,2024-03-05,Business,40,2024-03-01,17.0
2,PA001,2024-03-05,Premium Economy,64,2024-01-30,2.0
3,PA001,2024-03-05,Economy,160,NaT,
4,PA002,2024-03-30,First,32,2024-03-26,1.0
5,PA002,2024-03-30,Business,40,2024-03-22,11.0
6,PA002,2024-03-30,Premium Economy,64,2024-03-13,13.0
7,PA002,2024-03-30,Economy,160,NaT,
8,PA003,2024-03-06,First,32,2024-02-22,5.0
9,PA003,2024-03-06,Business,40,2024-03-02,7.0


In [70]:
flights_booked["Total Seats booked over time"] = flights_booked["Total Seats booked over time"].fillna(0).astype(int)
flights_booked.head(10)

Unnamed: 0,Flight Number,Flight Date_x,Class,Capacity,Date,Total Seats booked over time
0,PA001,2024-03-05,First,32,2024-03-01,27
1,PA001,2024-03-05,Business,40,2024-03-01,17
2,PA001,2024-03-05,Premium Economy,64,2024-01-30,2
3,PA001,2024-03-05,Economy,160,NaT,0
4,PA002,2024-03-30,First,32,2024-03-26,1
5,PA002,2024-03-30,Business,40,2024-03-22,11
6,PA002,2024-03-30,Premium Economy,64,2024-03-13,13
7,PA002,2024-03-30,Economy,160,NaT,0
8,PA003,2024-03-06,First,32,2024-02-22,5
9,PA003,2024-03-06,Business,40,2024-03-02,7


In [71]:
# Calculate the Capacity %: of the available seats on the flight for each class, what percentage have been booked so far
# For classes which are yet to be booked for a flight, ensure the Capacity % shows as 0% for these rows

flights_booked["Capacity %"] = (flights_booked["Total Seats booked over time"] / flights_booked["Capacity"]) * 100
flights_booked.head()

Unnamed: 0,Flight Number,Flight Date_x,Class,Capacity,Date,Total Seats booked over time,Capacity %
0,PA001,2024-03-05,First,32,2024-03-01,27,84.375
1,PA001,2024-03-05,Business,40,2024-03-01,17,42.5
2,PA001,2024-03-05,Premium Economy,64,2024-01-30,2,3.125
3,PA001,2024-03-05,Economy,160,NaT,0,0.0
4,PA002,2024-03-30,First,32,2024-03-26,1,3.125


In [75]:
# The Date for these rows should be today's date (28/02/2024) 
flights_booked["Date"] = "2024-02-28"
flights_booked.head()

Unnamed: 0,Flight Number,Flight Date_x,Class,Capacity,Date,Total Seats booked over time,Capacity %
0,PA001,2024-03-05,First,32,2024-02-28,27,84.375
1,PA001,2024-03-05,Business,40,2024-02-28,17,42.5
2,PA001,2024-03-05,Premium Economy,64,2024-02-28,2,3.125
3,PA001,2024-03-05,Economy,160,2024-02-28,0,0.0
4,PA002,2024-03-30,First,32,2024-02-28,1,3.125


## Output

In [73]:
# Output the data
output = flights_booked.rename_axis(columns=None)
output

Unnamed: 0,Flight Number,Flight Date_x,Class,Capacity,Date,Total Seats booked over time,Capacity %
0,PA001,2024-03-05,First,32,2024-02-28,27,84.375
1,PA001,2024-03-05,Business,40,2024-02-28,17,42.5
2,PA001,2024-03-05,Premium Economy,64,2024-02-28,2,3.125
3,PA001,2024-03-05,Economy,160,2024-02-28,0,0.0
4,PA002,2024-03-30,First,32,2024-02-28,1,3.125
5,PA002,2024-03-30,Business,40,2024-02-28,11,27.5
6,PA002,2024-03-30,Premium Economy,64,2024-02-28,13,20.3125
7,PA002,2024-03-30,Economy,160,2024-02-28,0,0.0
8,PA003,2024-03-06,First,32,2024-02-28,5,15.625
9,PA003,2024-03-06,Business,40,2024-02-28,7,17.5


In [74]:
# Generating csv output file
output.to_csv("output-202409.csv", index=False)