# 01 – Data Loading & Quality Checks

## Objective
- Load raw flight data from Excel / CSV
- Inspect structure, coverage, and consistency
- Identify missing values, duplicates, and anomalies
- Produce a consolidated raw dataset for cleaning

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Location of the raw Excel workbook. This is a relative path, so it is
# resolved against the kernel's current working directory.
file_path = "../data/raw/flights_raw.xlsx"

In [16]:
# sheet_name=None asks pandas for every worksheet in one call; the result is a
# dict mapping sheet name -> DataFrame rather than a single DataFrame.
sheets = pd.read_excel(io=file_path, sheet_name=None)

In [17]:
# Sanity check on what read_excel returned: because sheet_name=None was used,
# this is a dict of DataFrames, not a single DataFrame.
type(sheets)

dict

In [18]:
# The dict keys are the worksheet names; list them to see which sheets loaded.
sheets.keys()

dict_keys(['2020-25 OTP', '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010'])

In [20]:
# Pull a single yearly sheet out of the dict and preview its first rows to see
# the column layout. This is a reference to the frame held in `sheets`, not a copy.
df_2015 = sheets["2015"]
df_2015.head(n=10)

Unnamed: 0,Route,Departing Port,Arriving Port,Airline,Month,Sectors Scheduled,Sectors Flown,Cancellations,Departures On Time,Arrivals On Time,Departures Delayed,Arrivals Delayed,OnTime Departures \n(%),OnTime Arrivals \n(%),Cancellations \n\n(%)
0,Adelaide-Brisbane,Adelaide,Brisbane,Jetstar,2015-01-01,31,31,0,30,31,1,0,96.774194,100.0,0.0
1,Adelaide-Gold Coast,Adelaide,Gold Coast,Jetstar,2015-01-01,47,46,1,42,43,4,3,91.304348,93.478261,2.12766
2,Adelaide-Melbourne,Adelaide,Melbourne,Jetstar,2015-01-01,122,116,6,104,102,12,14,89.655172,87.931034,4.918033
3,Adelaide-Perth,Adelaide,Perth,Jetstar,2015-01-01,43,43,0,38,41,5,2,88.372093,95.348837,0.0
4,Adelaide-Sydney,Adelaide,Sydney,Jetstar,2015-01-01,150,146,4,118,123,28,23,80.821918,84.246575,2.666667
5,Ayers Rock-Sydney,Ayers Rock,Sydney,Jetstar,2015-01-01,25,25,0,24,21,1,4,96.0,84.0,0.0
6,Ballina-Sydney,Ballina,Sydney,Jetstar,2015-01-01,90,90,0,68,71,22,19,75.555556,78.888889,0.0
7,Brisbane-Adelaide,Brisbane,Adelaide,Jetstar,2015-01-01,31,31,0,31,31,0,0,100.0,100.0,0.0
8,Brisbane-Cairns,Brisbane,Cairns,Jetstar,2015-01-01,119,117,2,97,101,20,16,82.905983,86.324786,1.680672
9,Brisbane-Darwin,Brisbane,Darwin,Jetstar,2015-01-01,30,30,0,20,28,10,2,66.666667,93.333333,0.0


In [21]:
# Column dtypes and non-null counts for the 2015 sheet.
df_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6174 entries, 0 to 6173
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Route                   6174 non-null   object        
 1   Departing Port          6174 non-null   object        
 2   Arriving Port           6174 non-null   object        
 3   Airline                 6174 non-null   object        
 4   Month                   6174 non-null   datetime64[ns]
 5   Sectors Scheduled       6174 non-null   int64         
 6   Sectors Flown           6174 non-null   int64         
 7   Cancellations           6174 non-null   int64         
 8   Departures On Time      6174 non-null   int64         
 9   Arrivals On Time        6174 non-null   int64         
 10  Departures Delayed      6174 non-null   int64         
 11  Arrivals Delayed        6174 non-null   int64         
 12  OnTime Departures 
(%)  6174 non-null   float64 

In [22]:
# Tag each sheet's rows with the name of the sheet they came from, then stack
# everything into one frame. DataFrame.assign returns a new frame, so the
# DataFrames stored in `sheets` are left unmodified.
dfs = [
    frame.assign(source_sheet=name)
    for name, frame in sheets.items()
]

# ignore_index=True discards the per-sheet row labels and renumbers from 0.
flights = pd.concat(dfs, ignore_index=True)

In [23]:
# A notebook cell only echoes its final expression, so a bare `flights.shape`
# on an earlier line is evaluated and silently dropped. Print it explicitly so
# the overall (rows, columns) size is shown alongside the per-sheet row counts.
print(flights.shape)
flights['source_sheet'].value_counts()

source_sheet
2020-25 OTP    32029
2018            6548
2017            6470
2019            6409
2016            6342
2015            6174
2014            6045
2013            5722
2012            5236
2010            5050
2011            4944
2020            4197
Name: count, dtype: int64

In [26]:
# Column dtypes and non-null counts for the combined frame. Comparing this
# with the single-sheet summary shows which columns picked up missing values
# or changed dtype once all sheets were stacked.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95166 entries, 0 to 95165
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Route                   95164 non-null  object        
 1   Departing Port          95161 non-null  object        
 2   Arriving Port           95161 non-null  object        
 3   Airline                 95161 non-null  object        
 4   Month                   95161 non-null  datetime64[ns]
 5   Sectors Scheduled       95161 non-null  float64       
 6   Sectors Flown           95161 non-null  float64       
 7   Cancellations           95161 non-null  float64       
 8   Departures On Time      95161 non-null  float64       
 9   Arrivals On Time        95161 non-null  float64       
 10  Departures Delayed      95161 non-null  float64       
 11  Arrivals Delayed        95161 non-null  float64       
 12  OnTime Departures 
(%)  95106 non-null  object

In [27]:
# Persist the stacked sheets as one CSV. index=False omits the RangeIndex so
# no extra unnamed index column ends up in the file.
flights.to_csv(path_or_buf="../data/raw/flights_raw_combined.csv", index=False)