# Data Cleaning and Preprocessing

## Cleaning aircraft data

In [34]:
import pandas as pd

# Read the aircraft records from CSV.
# The file's unnamed first column is loaded as a regular 'Unnamed: 0' column.
aircraft_csv_path = 'filtered_aircraft.csv'
filtered_aircraft = pd.read_csv(aircraft_csv_path)

# Preview the first five rows
filtered_aircraft.head(5)

Unnamed: 0,timestamp,hex,type,flight,r,t,alt_baro,gs,track,baro_rate,squawk,category,lat,lon,nic,rc,seen_pos,messages,seen,rssi
0,2024-07-01 00:00:00,ab8180,adsb_icao,SCX472,N840SY,B738,35000,443.7,108.8,0.0,4171.0,A3,55.900635,-121.040622,8.0,186.0,0.35,22311828,0.1,-8.1
1,2024-07-01 00:00:00,ac85ce,adsb_icao,DAL1542,N906DN,B739,ground,0.0,,,1063.0,A3,36.07909,-115.139626,0.0,0.0,2.135,25550577,1.3,-27.6
2,2024-07-01 00:00:00,a17bfe,adsb_icao,AAY642,N195NV,A320,37025,518.6,31.12,0.0,711.0,A3,42.314122,-104.698366,8.0,186.0,0.467,21682210,0.3,-8.6
3,2024-07-01 00:00:00,a0808f,adsb_icao,SKW4690,N131SY,E75L,ground,16.5,35.0,-77.0,2033.0,A3,39.861328,-104.67454,8.0,186.0,0.236,18110756,0.2,-21.3
4,2024-07-01 00:00:00,ad1da9,adsb_icao,SKW4764,N944SW,CRJ2,19025,406.3,50.39,,2763.0,A2,40.491348,-104.371765,8.0,186.0,0.497,14109102,0.1,-10.4


In [35]:
# Drop unnecessary columns.
# A single drop call is atomic: if any listed column is missing, a KeyError is
# raised before anything is removed, so the frame is never left half-pruned.
unused_aircraft_columns = [
    'type',
    'hex',
    'category',
    'nic',
    'r',
    'squawk',
    'seen_pos',
    'messages',
    'seen',
    'rc',
    'rssi',
]
filtered_aircraft = filtered_aircraft.drop(columns=unused_aircraft_columns)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,alt_baro,gs,track,baro_rate,lat,lon
0,2024-07-01 00:00:00,SCX472,B738,35000,443.7,108.8,0.0,55.900635,-121.040622
1,2024-07-01 00:00:00,DAL1542,B739,ground,0.0,,,36.07909,-115.139626
2,2024-07-01 00:00:00,AAY642,A320,37025,518.6,31.12,0.0,42.314122,-104.698366
3,2024-07-01 00:00:00,SKW4690,E75L,ground,16.5,35.0,-77.0,39.861328,-104.67454
4,2024-07-01 00:00:00,SKW4764,CRJ2,19025,406.3,50.39,,40.491348,-104.371765


In [36]:
airport_elevation = 3204  # Airport elevation in feet

# Convert timestamp to datetime format
filtered_aircraft['timestamp'] = pd.to_datetime(filtered_aircraft['timestamp'])

# Take the raw barometric altitude out of the frame; it is replaced by 'alt' below
raw_alt_baro = filtered_aircraft.pop('alt_baro')

# Remember which rows report the literal "ground" before numeric coercion
# turns every non-numeric value into NaN
on_ground = raw_alt_baro.eq('ground')

# Numeric barometric altitude: "ground" rows take the airport elevation (3204 ft),
# any other non-numeric value stays missing. The nullable Int64 dtype stores
# integers while still allowing <NA>.
alt_baro = pd.to_numeric(raw_alt_baro, errors='coerce')
alt_baro = alt_baro.mask(on_ground, airport_elevation).astype('Int64')

# Altitude relative to the airport elevation.
# NOTE(review): missing altitudes are set to 0 here (same as the previous
# row-wise lambda did), i.e. an aircraft with no altitude report is treated
# as being at field level -- confirm this is intended.
alt = (alt_baro - airport_elevation).fillna(0)

# Snap everything at or below +30 ft (which includes all negative values) to 0
alt = alt.mask(alt <= 30, 0)

filtered_aircraft['alt'] = alt.astype('Int64')

# Convert ground speed, vertical rate, latitude and longitude to float,
# keeping NaN as NaN (unparseable values are coerced to NaN)
for float_column in ('gs', 'baro_rate', 'lat', 'lon'):
    filtered_aircraft[float_column] = pd.to_numeric(filtered_aircraft[float_column], errors='coerce')

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2024-07-01,SCX472,B738,443.7,108.8,0.0,55.900635,-121.040622,31796
1,2024-07-01,DAL1542,B739,0.0,,,36.07909,-115.139626,0
2,2024-07-01,AAY642,A320,518.6,31.12,0.0,42.314122,-104.698366,33821
3,2024-07-01,SKW4690,E75L,16.5,35.0,-77.0,39.861328,-104.67454,0
4,2024-07-01,SKW4764,CRJ2,406.3,50.39,,40.491348,-104.371765,15821


In [37]:
# Filter by position; only include aircraft within roughly 10nm of the airport.
# The area is a latitude/longitude bounding box with inclusive edges.

north_limit = 44.21195
south_limit = 43.87861
east_limit = -102.82512
west_limit = -103.28932

# Series.between is inclusive on both ends by default, matching >= / <=
lat_in_box = filtered_aircraft['lat'].between(south_limit, north_limit)
lon_in_box = filtered_aircraft['lon'].between(west_limit, east_limit)

# Keep rows inside the box and renumber them from 0
filtered_aircraft = filtered_aircraft[lat_in_box & lon_in_box].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2024-07-01 00:09:05,N9294Q,P28A,94.3,214.2,,44.210567,-102.898887,3396
1,2024-07-01 00:09:10,N9294Q,P28A,94.3,212.01,,44.208527,-102.90063,3296
2,2024-07-01 00:09:15,N9294Q,P28A,95.0,210.34,,44.206787,-102.902134,3296
3,2024-07-01 00:09:20,N9294Q,P28A,94.7,211.17,,44.204956,-102.903639,3296
4,2024-07-01 00:09:25,N9294Q,P28A,93.2,213.18,,44.203079,-102.905273,3396


In [38]:
# Filter aircraft by altitude; only include aircraft below 6,000 feet.
# 'alt' is barometric altitude minus the airport elevation, so the limit is
# relative to the airport, not to sea level.
# NOTE(review): this comment previously said "below 10,000 feet" while the code
# has always used 6000 -- confirm which threshold is intended.
max_altitude_ft = 6000

below_ceiling = filtered_aircraft['alt'] < max_altitude_ft
filtered_aircraft = filtered_aircraft[below_ceiling].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2024-07-01 00:09:05,N9294Q,P28A,94.3,214.2,,44.210567,-102.898887,3396
1,2024-07-01 00:09:10,N9294Q,P28A,94.3,212.01,,44.208527,-102.90063,3296
2,2024-07-01 00:09:15,N9294Q,P28A,95.0,210.34,,44.206787,-102.902134,3296
3,2024-07-01 00:09:20,N9294Q,P28A,94.7,211.17,,44.204956,-102.903639,3296
4,2024-07-01 00:09:25,N9294Q,P28A,93.2,213.18,,44.203079,-102.905273,3396


In [39]:
# Write the cleaned aircraft data to CSV without the row index
cleaned_aircraft_path = 'aircraft_cleaned.csv'
filtered_aircraft.to_csv(cleaned_aircraft_path, index=False)

## Cleaning operations data

In [29]:
# Read the operations records from CSV
operations_csv_path = 'filtered_operations.csv'
filtered_operations = pd.read_csv(operations_csv_path)

# Preview the first five rows
filtered_operations.head(5)

Unnamed: 0,time,icao,operation,airport,registration,flight,ac_type,runway,flight_link,squawk,...,mil,apt_type,name,continent,iso_country,iso_region,municipality,scheduled_service,iata_code,elev
0,2024-07-01 00:18:59,ace28c,landing,KRAP,N9294Q,N9294Q,P28A,,https://globe.adsbexchange.com/?icao=ace28c&zo...,204.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
1,2024-07-01 00:47:48,a5dc2e,landing,KRAP,N477M,CCQ477,BE9L,14.0,https://globe.adsbexchange.com/?icao=a5dc2e&zo...,5145.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
2,2024-07-01 01:45:17,a0808f,landing,KRAP,N131SY,SKW5580,E75L,14.0,https://globe.adsbexchange.com/?icao=a0808f&zo...,5103.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
3,2024-07-01 02:30:08,a03618,landing,KRAP,N11206,UAL1004,B738,14.0,https://globe.adsbexchange.com/?icao=a03618&zo...,5162.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
4,2024-07-01 02:51:10,a37e6c,takeoff,KRAP,N324NV,AAY78,A319,14.0,https://globe.adsbexchange.com/?icao=a37e6c&zo...,3235.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204


In [31]:
# Columns of the operations data that are removed before further cleaning
unused_operations_columns = [
    'icao', 'airport', 'registration', 'flight_link', 'squawk',
    'signal_type', 'category', 'year', 'manufacturer', 'model',
    'ownop', 'faa_pia', 'faa_ladd', 'short_type', 'mil',
    'apt_type', 'name', 'continent', 'iso_country', 'iso_region',
    'municipality', 'scheduled_service', 'iata_code', 'elev',
]

# Remove them from the frame in place
filtered_operations.drop(columns=unused_operations_columns, inplace=True)

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2024-07-01 00:18:59,landing,N9294Q,P28A,
1,2024-07-01 00:47:48,landing,CCQ477,BE9L,14.0
2,2024-07-01 01:45:17,landing,SKW5580,E75L,14.0
3,2024-07-01 02:30:08,landing,UAL1004,B738,14.0
4,2024-07-01 02:51:10,takeoff,AAY78,A319,14.0


In [32]:
# Parse the operation time with an explicit format; values that cannot be
# parsed are coerced to NaT instead of raising
parsed_time = pd.to_datetime(
    filtered_operations['time'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)
filtered_operations['time'] = parsed_time

# Runway as a nullable integer: unparseable values are coerced to NaN first,
# then Int64 stores whole numbers while keeping the missing entries
runway_numeric = pd.to_numeric(filtered_operations['runway'], errors='coerce')
filtered_operations['runway'] = runway_numeric.astype('Int64')

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2024-07-01 00:18:59,landing,N9294Q,P28A,
1,2024-07-01 00:47:48,landing,CCQ477,BE9L,14.0
2,2024-07-01 01:45:17,landing,SKW5580,E75L,14.0
3,2024-07-01 02:30:08,landing,UAL1004,B738,14.0
4,2024-07-01 02:51:10,takeoff,AAY78,A319,14.0


In [33]:
# Write the cleaned operations data to CSV without the row index
cleaned_operations_path = 'operations_cleaned.csv'
filtered_operations.to_csv(cleaned_operations_path, index=False)