# Data Cleaning and Preprocessing

To turn filtered data into cleaned data, do the following:  
1. Load your desired data set from filtered_data
2. Run the code cells for your desired cleaning operation
3. Save your cleaned data and place it in cleaned_data

## Cleaning aircraft data

In [10]:
import pandas as pd

# Read the filtered aircraft CSV into a DataFrame
filtered_aircraft = pd.read_csv(
    filepath_or_buffer='../../Data/filtered_data/filtered_aircraft_20250501.csv',
)

# Preview the first rows
filtered_aircraft.head()

Unnamed: 0,timestamp,hex,type,flight,r,t,alt_baro,gs,track,baro_rate,squawk,category,lat,lon,nic,rc,seen_pos,messages,seen,rssi
0,2025-05-01 00:00:00,ad6b0c,adsb_icao,ASA626,N964AK,B39M,6825,286.9,239.41,-1600.0,2167.0,A3,21.291367,-157.988442,8.0,186.0,0.57,111462993,0.2,-7.7
1,2025-05-01 00:00:00,a40cfc,adsb_icao,HAL16,N360HA,A332,23050,406.5,82.51,1792.0,2734.0,A5,21.30277,-156.828868,8.0,186.0,0.35,46829988,0.0,-12.4
2,2025-05-01 00:00:00,a4ae48,adsb_icao,SKW3420,N400SY,E75L,9500,281.6,307.64,-832.0,6647.0,A3,48.842282,-122.764411,8.0,186.0,0.478,100278388,0.0,-9.1
3,2025-05-01 00:00:00,a5c899,adsb_icao,ASA855,N472AS,B739,3200,199.5,32.1,-1088.0,7206.0,A3,45.467245,-122.447115,8.0,186.0,0.062,109065799,0.1,-10.7
4,2025-05-01 00:00:00,a2bba4,adsb_icao,UAL1434,N27511,B39M,ground,0.0,,,3310.0,A3,37.608203,-122.382754,8.0,186.0,2.863,99204996,1.1,-5.4


In [11]:
# Drop unnecessary columns.
# All columns are removed in a single call; if any of them is missing,
# a KeyError is raised before anything has been dropped.
unused_aircraft_columns = [
    'type',
    'hex',
    'category',
    'nic',
    'r',
    'squawk',
    'seen_pos',
    'messages',
    'seen',
    'rc',
    'rssi',
]
filtered_aircraft.drop(columns=unused_aircraft_columns, inplace=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,alt_baro,gs,track,baro_rate,lat,lon
0,2025-05-01 00:00:00,ASA626,B39M,6825,286.9,239.41,-1600.0,21.291367,-157.988442
1,2025-05-01 00:00:00,HAL16,A332,23050,406.5,82.51,1792.0,21.30277,-156.828868
2,2025-05-01 00:00:00,SKW3420,E75L,9500,281.6,307.64,-832.0,48.842282,-122.764411
3,2025-05-01 00:00:00,ASA855,B739,3200,199.5,32.1,-1088.0,45.467245,-122.447115
4,2025-05-01 00:00:00,UAL1434,B39M,ground,0.0,,,37.608203,-122.382754


In [12]:
airport_elevation = 17  # Airport elevation in feet
ground_threshold_ft = 30  # Heights at or below this many feet above the airport count as 0

# Convert timestamp to datetime format
filtered_aircraft['timestamp'] = pd.to_datetime(filtered_aircraft['timestamp'])

# "ground" in barometric altitude is replaced by the airport elevation (17 ft);
# any other non-numeric value is coerced to NaN. The nullable Int64 dtype keeps
# missing values as <NA> instead of forcing the column to float.
alt_baro = pd.to_numeric(
    filtered_aircraft['alt_baro'].replace('ground', airport_elevation),
    errors='coerce',
).astype('Int64')

# Convert barometric altitude to altitude above the airport elevation
alt = alt_baro - airport_elevation

# Set missing altitudes, negative altitudes and anything within +30 feet to 0.
# NOTE: a missing altitude is recorded as 0, i.e. indistinguishable from an
# aircraft on the ground. Vectorized fillna/mask replaces the row-wise apply.
alt = alt.fillna(0)
filtered_aircraft['alt'] = alt.mask(alt <= ground_threshold_ft, 0)

filtered_aircraft.drop(columns='alt_baro', inplace=True)  # Drop the 'alt_baro' column

# Convert ground speed, vertical rate, latitude and longitude to float,
# keeping NaN as NaN
for column in ('gs', 'baro_rate', 'lat', 'lon'):
    filtered_aircraft[column] = pd.to_numeric(filtered_aircraft[column], errors='coerce')

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-05-01,ASA626,B39M,286.9,239.41,-1600.0,21.291367,-157.988442,6808
1,2025-05-01,HAL16,A332,406.5,82.51,1792.0,21.30277,-156.828868,23033
2,2025-05-01,SKW3420,E75L,281.6,307.64,-832.0,48.842282,-122.764411,9483
3,2025-05-01,ASA855,B739,199.5,32.1,-1088.0,45.467245,-122.447115,3183
4,2025-05-01,UAL1434,B39M,0.0,,,37.608203,-122.382754,0


In [13]:
# Keep only aircraft inside a latitude/longitude box of roughly 10nm around the airport

north_limit = 32.8996
south_limit = 32.5662
east_limit = -116.9916
west_limit = -117.3878

# between() is inclusive at both ends; rows with a missing coordinate are excluded
inside_lat = filtered_aircraft['lat'].between(south_limit, north_limit)
inside_lon = filtered_aircraft['lon'].between(west_limit, east_limit)

# Select the rows inside the box and renumber the index from 0
filtered_aircraft = filtered_aircraft[inside_lat & inside_lon].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-05-01,SKW3384,E75L,283.4,307.11,2304.0,32.835993,-117.374286,6533
1,2025-05-01,ACA1014,A321,268.5,145.01,-1664.0,32.888504,-117.275802,9833
2,2025-05-01,AAL2361,A321,9.8,,,32.735208,-117.202317,0
3,2025-05-01,AAL2050,A321,0.0,274.0,,32.733997,-117.201139,0
4,2025-05-01,SKW3388,E75L,1.4,,,32.73349,-117.200432,0


In [14]:
# Filter aircraft by altitude; only include aircraft below 6,000 feet
# ('alt' is the altitude above the airport elevation, not the barometric altitude)
max_altitude_ft = 6000

filtered_aircraft = filtered_aircraft[
    filtered_aircraft['alt'] < max_altitude_ft
].reset_index(drop=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-05-01,AAL2361,A321,9.8,,,32.735208,-117.202317,0
1,2025-05-01,AAL2050,A321,0.0,274.0,,32.733997,-117.201139,0
2,2025-05-01,SKW3388,E75L,1.4,,,32.73349,-117.200432,0
3,2025-05-01,SWA1412,B737,4.2,,,32.734612,-117.198235,0
4,2025-05-01,SWA482,B737,0.0,,,32.733479,-117.197081,0


In [15]:
# Order rows by timestamp and renumber the index from 0 in the same step
filtered_aircraft = filtered_aircraft.sort_values(by='timestamp', ignore_index=True)

filtered_aircraft.head()

Unnamed: 0,timestamp,flight,t,gs,track,baro_rate,lat,lon,alt
0,2025-05-01,AAL2361,A321,9.8,,,32.735208,-117.202317,0
1,2025-05-01,N950DM,GA6C,7.8,,,32.730192,-117.175453,0
2,2025-05-01,UAL2486,B738,0.0,,,32.729565,-117.17677,0
3,2025-05-01,SWA1356,B38M,1.4,,,32.729347,-117.177525,0
4,2025-05-01,,B763,0.0,,,32.73821,-117.186314,0


In [17]:
# Write the cleaned aircraft data to CSV, leaving out the DataFrame index
filtered_aircraft.to_csv(
    path_or_buf='../../Data/AnalysisData/aircraft_cleaned_20250501.csv',
    index=False,
)

## Cleaning operations data

In [18]:
# Read the filtered operations CSV into a DataFrame
filtered_operations = pd.read_csv(
    filepath_or_buffer='../../Data/filtered_data/filtered_operations_20250501.csv',
)

# Preview the first rows
filtered_operations.head()

Unnamed: 0,time,icao,operation,airport,registration,flight,ac_type,runway,flight_link,squawk,...,mil,apt_type,name,continent,iso_country,iso_region,municipality,scheduled_service,iata_code,elev
0,2025-05-01 00:00:08,a8a319,takeoff,KSAN,N656AW,AAL1951,A320,27.0,https://globe.adsbexchange.com/?icao=a8a319&zo...,1323.0,...,f,large_airport,San Diego International Airport,,US,US-CA,,yes,SAN,17
1,2025-05-01 00:00:39,ad34fe,takeoff,KSAN,N950DM,N950DM,GA6C,27.0,https://globe.adsbexchange.com/?icao=ad34fe&zo...,5173.0,...,f,large_airport,San Diego International Airport,,US,US-CA,,yes,SAN,17
2,2025-05-01 00:01:46,a43e7f,takeoff,KSAN,N37263,UAL2486,B738,27.0,https://globe.adsbexchange.com/?icao=a43e7f&zo...,1315.0,...,f,large_airport,San Diego International Airport,,US,US-CA,,yes,SAN,17
3,2025-05-01 00:03:05,ac27cf,takeoff,KSAN,N8821S,SWA1356,B38M,27.0,https://globe.adsbexchange.com/?icao=ac27cf&zo...,1023.0,...,f,large_airport,San Diego International Airport,,US,US-CA,,yes,SAN,17
4,2025-05-01 00:06:20,a411d3,landing,KSAN,N361UP,UPS2636,B763,27.0,https://globe.adsbexchange.com/?icao=a411d3&zo...,3766.0,...,f,large_airport,San Diego International Airport,,US,US-CA,,yes,SAN,17


In [19]:
# Columns of the operations data considered unnecessary
unused_operation_columns = [
    'icao',
    'airport',
    'registration',
    'flight_link',
    'squawk',
    'signal_type',
    'category',
    'year',
    'manufacturer',
    'model',
    'ownop',
    'faa_pia',
    'faa_ladd',
    'short_type',
    'mil',
    'apt_type',
    'name',
    'continent',
    'iso_country',
    'iso_region',
    'municipality',
    'scheduled_service',
    'iata_code',
    'elev',
]

# Remove them from the DataFrame in place
filtered_operations.drop(columns=unused_operation_columns, inplace=True)

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2025-05-01 00:00:08,takeoff,AAL1951,A320,27.0
1,2025-05-01 00:00:39,takeoff,N950DM,GA6C,27.0
2,2025-05-01 00:01:46,takeoff,UAL2486,B738,27.0
3,2025-05-01 00:03:05,takeoff,SWA1356,B38M,27.0
4,2025-05-01 00:06:20,landing,UPS2636,B763,27.0


In [20]:
# Parse the time strings; values that do not match the format become NaT
operation_times = pd.to_datetime(
    filtered_operations['time'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)
filtered_operations['time'] = operation_times

# Store the runway as a nullable integer; non-numeric entries become <NA>
runway_numbers = pd.to_numeric(filtered_operations['runway'], errors='coerce')
filtered_operations['runway'] = runway_numbers.astype('Int64')

filtered_operations.head()

Unnamed: 0,time,operation,flight,ac_type,runway
0,2025-05-01 00:00:08,takeoff,AAL1951,A320,27
1,2025-05-01 00:00:39,takeoff,N950DM,GA6C,27
2,2025-05-01 00:01:46,takeoff,UAL2486,B738,27
3,2025-05-01 00:03:05,takeoff,SWA1356,B38M,27
4,2025-05-01 00:06:20,landing,UPS2636,B763,27


In [22]:
# Write the cleaned operations data to CSV, leaving out the DataFrame index
filtered_operations.to_csv(
    path_or_buf='../../Data/AnalysisData/operations_cleaned_20250501.csv',
    index=False,
)