In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/datasets/preprocessed_data.pkl'
)

In [3]:
def _nan_to_empty_list(value):
    """Return an empty list for NaN-like values; return anything else unchanged."""
    # NaN compares unequal to itself, so `value != value` detects it.
    if value != value:
        return []
    return value


# Replace missing flight phases with empty lists
# (assumes the non-missing entries are already lists -- TODO confirm).
preprocessed_data['Flight.phase'] = preprocessed_data['Flight.phase'].apply(_nan_to_empty_list)

## Removing incorrect latitude and longitude data

In [4]:
# Keep rows where `state` is missing or agrees with `State`; rows where the
# two disagree are discarded. The copy detaches the result from the source frame.
# NOTE(review): presumably `state` is derived from the coordinates and `State`
# from the reported location -- confirm against the preprocessing step.
all_data = preprocessed_data.loc[
    lambda df: df['state'].isna() | (df['state'] == df['State'])
].copy()

In [5]:
# Drop the columns that are not exported, then publish `lat`/`lon` under the
# `Latitude`/`Longitude` names (the original columns with those names are
# dropped first, so the rename does not create duplicates).
# `DataFrame.rename` returns a new frame instead of modifying in place, so the
# result has to be assigned back; otherwise `all_data` keeps the `lat`/`lon`
# column names. Chaining also avoids `inplace=True`.
all_data = (
    all_data
    .drop(columns=['Latitude', 'Longitude', 'state', 'Year', 'Month', 'Weekday', 'Broad.phase.of.flight'])
    .rename(columns={'lat': 'Latitude', 'lon': 'Longitude'})
)
all_data

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Airport.Code,Airport.Name,Injury.Severity,Aircraft.damage,...,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Report.Status,Publication.Date,State,Flight.phase,Latitude,Longitude
7,20020909X01562,Accident,SEA82DA022,1982-01-01,"PULLMAN, WA",United States,,BLACKBURN AG STRIP,Non-Fatal,Substantial,...,0.0,0.0,2.0,VMC,Probable Cause,01-01-1982,WA,[takeoff],,
8,20020909X01561,Accident,NYC82DA015,1982-01-01,"EAST HANOVER, NJ",United States,N58,HANOVER,Non-Fatal,Substantial,...,0.0,0.0,2.0,IMC,Probable Cause,01-01-1982,NJ,[landing],,
9,20020909X01560,Accident,MIA82DA029,1982-01-01,"JACKSONVILLE, FL",United States,JAX,JACKSONVILLE INTL,Non-Fatal,Substantial,...,0.0,3.0,0.0,IMC,Probable Cause,01-01-1982,FL,[cruise],,
10,20020909X01559,Accident,FTW82DA034,1982-01-01,"HOBBS, NM",United States,,,Non-Fatal,Substantial,...,0.0,0.0,1.0,VMC,Probable Cause,01-01-1982,NM,[approach],,
11,20020909X01558,Accident,ATL82DKJ10,1982-01-01,"TUSKEGEE, AL",United States,,TUSKEGEE,Non-Fatal,Substantial,...,0.0,0.0,1.0,VMC,Probable Cause,01-01-1982,AL,[landing],,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,Minor,,...,1.0,0.0,0.0,,,29-12-2022,MD,[],,
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,0.0,0.0,0.0,,,,NH,[],,
88886,20221227106497,Accident,WPR23LA075,2022-12-26,"Payson, AZ",United States,PAN,PAYSON,Non-Fatal,Substantial,...,0.0,0.0,1.0,VMC,,27-12-2022,AZ,[],34.1525,-111.2021
88887,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,,...,0.0,0.0,0.0,,,,UT,[],,


In [6]:
# Export as JSON Lines: one record (row) per line.
all_data.to_json(
    '../data/datasets/final_all.json',
    orient='records',
    lines=True,
)

## Only data with correct latitude and longitude

In [7]:
# Keep only rows that have both `lat` and `lon` present and whose `state`
# agrees with `State`. The copy detaches the result from the source frame.
only_correct_lat_lon = preprocessed_data.loc[
    lambda df: df['lat'].notna() & df['lon'].notna() & (df['state'] == df['State'])
].copy()

In [8]:
# Drop the columns that are not exported, then publish `lat`/`lon`/`state`
# under the `Latitude`/`Longitude`/`State` names (the original columns with
# those names are dropped first, so the rename does not create duplicates).
# `DataFrame.rename` returns a new frame instead of modifying in place, so the
# result has to be assigned back; otherwise `only_correct_lat_lon` keeps the
# `lat`/`lon`/`state` column names. Chaining also avoids `inplace=True`.
only_correct_lat_lon = (
    only_correct_lat_lon
    .drop(columns=['Latitude', 'Longitude', 'State', 'Year', 'Month', 'Weekday', 'Broad.phase.of.flight'])
    .rename(columns={'lat': 'Latitude', 'lon': 'Longitude', 'state': 'State'})
)
only_correct_lat_lon

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Airport.Code,Airport.Name,Injury.Severity,Aircraft.damage,...,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Report.Status,Publication.Date,Flight.phase,Latitude,Longitude,State
593,20080417X00504,Accident,MIA08CA076,1982-03-16,"MOBILE, AL",United States,MOB,MOBILE REGIONAL,Fatal(1),Substantial,...,,,,IMC,Probable Cause,30-04-2008,[landing],30.757778,-88.355555,AL
3654,20051208X01953,Accident,SEA83LA209,1983-01-08,"Goldendale, WA",United States,,,Fatal(2),Destroyed,...,,,,VMC,Probable Cause,28-03-2006,[cruise],46.041111,-120.849722,WA
6202,20020904X01525,Accident,SEA83FA208,1983-09-09,"Kalispell, MT",United States,,,Fatal(2),Destroyed,...,,,,IMC,Probable Cause,23-07-2003,[cruise],48.120000,-113.887500,MT
24567,20021022X05356,Accident,CHI90LA280,1989-12-01,"ENGADINE, MI",United States,,,Fatal(1),Substantial,...,,,,IMC,Probable Cause,30-05-2003,[unknown],46.154444,-85.663611,MI
26826,20030411X00484,Accident,ANC91GAMS1,1990-10-11,"Deadhorse, AK",United States,,,Fatal(3),Destroyed,...,,,,VMC,Probable Cause,30-05-2003,[unknown],70.333333,-150.933333,AK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88869,20221213106455,Accident,WPR23LA065,2022-12-13,"Lewistown, MT",United States,KLWT,Lewiston Municipal Airport,Non-Fatal,Substantial,...,0.0,0.0,1.0,,,14-12-2022,[],47.257000,-109.280000,MT
88873,20221215106463,Accident,ERA23LA090,2022-12-14,"San Juan, PR",United States,SIG,FERNANDO LUIS RIBAS DOMINICCI,Non-Fatal,Substantial,...,0.0,0.0,1.0,VMC,,27-12-2022,[],18.272400,-66.554000,PR
88876,20221219106475,Accident,WPR23LA069,2022-12-15,"Wichita, KS",United States,ICT,WICHITA DWIGHT D EISENHOWER NT,Non-Fatal,Substantial,...,0.0,0.0,1.0,,,19-12-2022,[],37.382900,-97.263500,KS
88877,20221219106470,Accident,ERA23LA091,2022-12-16,"Brooksville, FL",United States,BKV,BROOKSVILLE-TAMPA BAY RGNL,Minor,Substantial,...,1.0,0.0,0.0,VMC,,23-12-2022,[],28.282500,-82.271900,FL


In [10]:
# Export as JSON Lines: one record (row) per line.
only_correct_lat_lon.to_json(
    '../data/datasets/final_only_lat_lon.json',
    orient='records',
    lines=True,
)