# Data Cleaning and Preprocessing

To produce cleaned data from filtered data, do the following:  
1. Load your desired data set from `filtered_data`
2. Run the code cells for the cleaning operation you need
3. Save the cleaned data to `cleaned_data`

## Cleaning aircraft data

In [11]:
import pandas as pd

# Read the filtered aircraft CSV for 2025-03-01 into a DataFrame
filtered_aircraft = pd.read_csv(filepath_or_buffer='filtered_data/filtered_aircraft_20250301.csv')

# Preview the first five rows
filtered_aircraft.head(n=5)

Unnamed: 0,timestamp,hex,type,flight,r,t,alt_baro,gs,track,baro_rate,squawk,category,lat,lon,nic,rc,seen_pos,messages,seen,rssi
0,2025-03-01 00:00:00,a3094d,adsb_icao,AAY3531,N295NV,A320,20725,339.3,241.09,-1664.0,1431.0,A3,34.180389,-110.579186,8.0,186.0,0.032,64122549,0.0,-13.2
1,2025-03-01 00:00:00,acf146,adsb_icao,SKW5086,N933EV,CRJ2,25000,458.5,119.25,,3521.0,A2,38.284927,-103.658051,8.0,186.0,0.075,56658531,0.1,-14.3
2,2025-03-01 00:00:00,a56db1,adsb_icao,SKW5118,N449SW,CRJ2,32000,418.1,247.05,,3621.0,A2,41.324158,-101.137329,8.0,186.0,0.163,40331012,0.0,-14.8
3,2025-03-01 00:00:00,a19aaa,adsb_icao,ENY4021,N202NN,E75L,ground,19.5,,,5131.0,A3,32.898967,-97.047659,8.0,186.0,4.272,78574579,4.3,-16.5
4,2025-03-01 00:00:00,a56661,adsb_icao,UAL1004,N447UA,A320,36000,380.3,267.89,0.0,1115.0,A3,38.964083,-96.314941,8.0,186.0,0.0,81309382,0.0,-13.7


In [12]:
# Drop unnecessary columns in a single call (one pass over the frame instead
# of eleven separate in-place drops; same list-based style as the operations
# cleaning cell).
filtered_aircraft.drop(
    columns=[
        'type',
        'hex',
        'category',
        'nic',
        'r',
        'squawk',
        'seen_pos',
        'messages',
        'seen',
        'rc',
        'rssi',
    ],
    inplace=True,
)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,alt_baro,gs,track,baro_rate,lat,lon
0,2025-03-01 00:00:00,AAY3531,A320,20725,339.3,241.09,-1664.0,34.180389,-110.579186
1,2025-03-01 00:00:00,SKW5086,CRJ2,25000,458.5,119.25,,38.284927,-103.658051
2,2025-03-01 00:00:00,SKW5118,CRJ2,32000,418.1,247.05,,41.324158,-101.137329
3,2025-03-01 00:00:00,ENY4021,E75L,ground,19.5,,,32.898967,-97.047659
4,2025-03-01 00:00:00,UAL1004,A320,36000,380.3,267.89,0.0,38.964083,-96.314941


In [13]:
airport_elevation = 3204  # Airport elevation in feet
ground_tolerance_ft = 30  # Heights at or below this (in feet) are treated as on the ground

# Convert timestamp to datetime format
filtered_aircraft['timestamp'] = pd.to_datetime(filtered_aircraft['timestamp'])

# 'alt_baro' can hold the string "ground" instead of a number. Substitute the
# airport elevation for it, then coerce to a nullable integer so values that
# cannot be parsed become <NA> rather than raising.
alt_baro = pd.to_numeric(
    filtered_aircraft['alt_baro'].replace('ground', airport_elevation),
    errors='coerce',
).astype('Int64')

# Height above the airport elevation
alt_agl = alt_baro - airport_elevation

# Keep heights strictly above the ground tolerance; everything else becomes 0.
# That covers negative heights, heights within the tolerance, and missing
# values (the <NA> comparison result is filled with False).
# NOTE(review): missing altitudes therefore end up as 0, i.e. indistinguishable
# from "on the ground" -- confirm this is intended.
airborne = (alt_agl > ground_tolerance_ft).fillna(False).astype(bool)
filtered_aircraft['alt'] = alt_agl.where(airborne, 0)

# The raw barometric altitude is no longer needed once 'alt' exists
filtered_aircraft.drop(columns='alt_baro', inplace=True)

# Convert ground speed, vertical rate, latitude and longitude to float,
# keeping NaN as NaN (unparseable values are coerced to NaN)
for column in ('gs', 'baro_rate', 'lat', 'lon'):
    filtered_aircraft[column] = pd.to_numeric(filtered_aircraft[column], errors='coerce')

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-03-01,AAY3531,A320,339.3,241.09,-1664.0,34.180389,-110.579186,17521
1,2025-03-01,SKW5086,CRJ2,458.5,119.25,,38.284927,-103.658051,21796
2,2025-03-01,SKW5118,CRJ2,418.1,247.05,,41.324158,-101.137329,28796
3,2025-03-01,ENY4021,E75L,19.5,,,32.898967,-97.047659,0
4,2025-03-01,UAL1004,A320,380.3,267.89,0.0,38.964083,-96.314941,32796


In [14]:
# Keep only position reports inside a latitude/longitude box that extends
# roughly 10nm from the airport in each direction.

south_limit, north_limit = 43.87861, 44.21195
west_limit, east_limit = -103.28932, -102.82512

# Series.between is inclusive on both ends, so rows exactly on a limit are kept;
# rows with a missing lat or lon compare False and are dropped.
filtered_aircraft = filtered_aircraft[
    filtered_aircraft['lat'].between(south_limit, north_limit)
    & filtered_aircraft['lon'].between(west_limit, east_limit)
].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-03-01 00:41:35,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
1,2025-03-01 00:41:40,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
2,2025-03-01 00:41:45,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
3,2025-03-01 00:41:50,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
4,2025-03-01 00:41:55,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0


In [15]:
# Filter aircraft by altitude; only include aircraft whose height above the
# airport ('alt', i.e. barometric altitude minus airport elevation) is below
# 6,000 feet.
# NOTE(review): this comment previously said "below 10,000 feet" while the code
# has always applied 6000 -- confirm which threshold is intended.
max_altitude_agl = 6000  # feet above airport elevation

filtered_aircraft = filtered_aircraft[
    filtered_aircraft['alt'] < max_altitude_agl
].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-03-01 00:41:35,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
1,2025-03-01 00:41:40,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
2,2025-03-01 00:41:45,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
3,2025-03-01 00:41:50,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
4,2025-03-01 00:41:55,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0


In [16]:
# Order rows chronologically and renumber the index 0..n-1 in the same call
filtered_aircraft.sort_values('timestamp', inplace=True, ignore_index=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-03-01 00:41:35,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
1,2025-03-01 00:41:40,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
2,2025-03-01 00:41:45,SKW5265,CRJ2,0.0,,,44.0392,-103.060668,0
3,2025-03-01 00:41:50,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0
4,2025-03-01 00:41:55,SKW5265,CRJ2,0.0,,,44.039197,-103.060668,0


In [17]:
# Write the cleaned aircraft data to cleaned_data, without the row index

filtered_aircraft.to_csv(path_or_buf='cleaned_data/aircraft_cleaned_20250301.csv', index=False)

## Cleaning operations data

In [18]:
# Read the filtered operations CSV for 2025-03-01 into a DataFrame
filtered_operations = pd.read_csv(filepath_or_buffer='filtered_data/filtered_operations_20250301.csv')

# Preview the first five rows
filtered_operations.head(n=5)

Unnamed: 0,time,icao,operation,airport,registration,flight,ac_type,runway,flight_link,squawk,...,mil,apt_type,name,continent,iso_country,iso_region,municipality,scheduled_service,iata_code,elev
0,2025-03-01 00:51:13,ad8901,takeoff,KRAP,N971SW,SKW5265,CRJ2,32.0,https://globe.adsbexchange.com/?icao=ad8901&zo...,5124.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
1,2025-03-01 00:56:18,ae5ae9,takeoff,KRAP,15-20775,1520775,H60,,https://globe.adsbexchange.com/?icao=ae5ae9&zo...,1200.0,...,t,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
2,2025-03-01 01:04:10,a5045e,takeoff,KRAP,N422PM,N422PM,BE9L,5.0,https://globe.adsbexchange.com/?icao=a5045e&zo...,5117.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
3,2025-03-01 01:06:40,ae56c5,takeoff,KRAP,13-20605,1320605,H60,,https://globe.adsbexchange.com/?icao=ae56c5&zo...,1200.0,...,t,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204
4,2025-03-01 01:13:48,ab2b5e,takeoff,KRAP,N819FX,CFS7584,AT73,32.0,https://globe.adsbexchange.com/?icao=ab2b5e&zo...,1447.0,...,f,medium_airport,Rapid City Regional Airport,,US,US-SD,,yes,RAP,3204


In [19]:
# Drop every operations column that is not kept for the cleaned output
filtered_operations.drop(
    columns=[
        'icao', 'airport', 'registration', 'flight_link', 'squawk',
        'signal_type', 'category', 'year', 'manufacturer', 'model',
        'ownop', 'faa_pia', 'faa_ladd', 'short_type', 'mil',
        'apt_type', 'name', 'continent', 'iso_country', 'iso_region',
        'municipality', 'scheduled_service', 'iata_code', 'elev',
    ],
    inplace=True,
)

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2025-03-01 00:51:13,takeoff,SKW5265,CRJ2,32.0
1,2025-03-01 00:56:18,takeoff,1520775,H60,
2,2025-03-01 01:04:10,takeoff,N422PM,BE9L,5.0
3,2025-03-01 01:06:40,takeoff,1320605,H60,
4,2025-03-01 01:13:48,takeoff,CFS7584,AT73,32.0


In [20]:
# Parse 'time' with an explicit format; values that do not match become NaT
filtered_operations['time'] = filtered_operations['time'].pipe(
    pd.to_datetime,
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

# Store 'runway' as a nullable integer; non-numeric or missing values become <NA>
filtered_operations['runway'] = (
    filtered_operations['runway']
    .pipe(pd.to_numeric, errors='coerce')
    .astype('Int64')
)

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2025-03-01 00:51:13,takeoff,SKW5265,CRJ2,32.0
1,2025-03-01 00:56:18,takeoff,1520775,H60,
2,2025-03-01 01:04:10,takeoff,N422PM,BE9L,5.0
3,2025-03-01 01:06:40,takeoff,1320605,H60,
4,2025-03-01 01:13:48,takeoff,CFS7584,AT73,32.0


In [21]:
# Write the cleaned operations data to cleaned_data, without the row index
filtered_operations.to_csv(path_or_buf='cleaned_data/operations_cleaned_20250301.csv', index=False)