# Feature Engineering

This notebook takes new data from [ADS-B Exchange] (https://www.adsbexchange.com/) on April 1, 2018 and engineers it similarly to the Buzzfeed preparation. 

In [1]:
#import packages that we'll need
import pandas as pd
import numpy as np  
import scipy
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pickle

  from numpy.core.umath_tests import inner1d


In [2]:
#read in data
flights1 = pd.read_csv("/mnt/data/large-spyplane-data/flights1.csv", low_memory = False)
flights2 = pd.read_csv("/mnt/data/large-spyplane-data/flights2.csv", low_memory = False)
flights3 = pd.read_csv("/mnt/data/large-spyplane-data/flights3.csv", low_memory = False)

In [3]:
#combine all federal plane data
flight_data = pd.concat([flights1, flights2, flights3])

In [4]:
#look at overview of the data
flight_data.head()

Unnamed: 0,adshex,altitude,latitude,longitude,PosTime,TSecs,speed,track,squawk
0,01013D,35975.0,29.086397,36.426678,1522550000000.0,474.0,436.0,268.2,4757.0
1,040030,38000.0,50.055061,10.831555,1522550000000.0,1856.0,472.1,303.9,3564.0
2,040030,37975.0,50.043888,10.857387,1522550000000.0,595.0,472.1,303.9,3564.0
3,040030,38000.0,50.053711,10.834688,1522550000000.0,2593.0,472.1,303.9,3564.0
4,040030,37975.0,50.053711,10.834688,1522550000000.0,3022.0,472.1,303.9,3564.0


In [5]:
#sort by aircraft ID and time
flight_data.sort_values(['adshex', 'PosTime'], inplace = True)

In [6]:
#Create a column, 'steer' which is the change in compass bearing from the previous transponder detection 
#for that aircraft; negative values indicate a turn to the left, positive values a turn to the right. 
flight_data['steer'] = flight_data['track'] - flight_data['track'].shift(-1)
flight_data.head()

Unnamed: 0,adshex,altitude,latitude,longitude,PosTime,TSecs,speed,track,squawk,steer
170113,1000000.0,9450.0,49.91829,14.235757,1522546000000.0,16.0,318.8,143.4,1407.0,0.0
170112,1000000.0,9775.0,49.907959,14.247605,1522546000000.0,17.0,320.0,143.4,1407.0,-0.1
964526,1000000.0,10925.0,49.848137,14.316225,1522546000000.0,77.0,356.0,143.5,1407.0,0.0
964527,1000000.0,11025.0,49.836871,14.329141,1522546000000.0,75.0,363.2,143.5,1407.0,0.0
1917135,1000000.0,12250.0,49.771698,14.403818,1522546000000.0,137.0,381.0,143.5,4555.0,0.0


In [7]:
#Create steer bins, which are the proportion of steer values for each aircraft falling into bins set manually, 
#using the breaks: -180, -25, -10, -1, 0, 1, 22, 45, 180
steer = flight_data['steer'].groupby([flight_data['adshex'], pd.cut(flight_data['steer'], 
                             bins = [-180, -25, -10, -1, 0, 1, 22, 45, 180])]).size().unstack(fill_value=0)

steer = steer.div(steer.sum(axis=1), axis=0)

In [8]:
#clean up data frame
steer.columns = ['steer1', 'steer2', 'steer3', 'steer4', 'steer5', 'steer6', 'steer7', 'steer8']
steer.reset_index(inplace = True)
steer.head()

Unnamed: 0,adshex,steer1,steer2,steer3,steer4,steer5,steer6,steer7,steer8
0,0100E4,0.004444,0.002778,0.031667,0.722778,0.197222,0.035556,0.003889,0.001667
1,0100F6,0.004517,0.00813,0.020777,0.720867,0.224029,0.01897,0.000903,0.001807
2,01013D,0.003453,0.001973,0.026147,0.75777,0.182042,0.02664,0.00148,0.000493
3,010141,0.003675,0.007349,0.052493,0.690289,0.1979,0.046194,0.0021,0.0
4,010153,0.00184,0.00276,0.032199,0.73965,0.180313,0.037718,0.0046,0.00092


In [9]:
#proportion the altitude into 5 bins based on the distribution
q, alt_bins = pd.qcut(flight_data['altitude'], q = 5, retbins=True)
alt_bins = list(alt_bins)
alt_bins

[-1200.0, 19975.0, 32000.0, 35000.0, 37000.0, 3746867.0]

In [10]:
#proportion the altitude into 5 bins based on the distribution
altitude = flight_data['altitude'].groupby([flight_data['adshex'], pd.cut(flight_data['altitude'], 
                                                                          bins = alt_bins)]).size().unstack(fill_value=0)

altitude = altitude.div(altitude.sum(axis=1), axis=0)

In [11]:
#clean up data frame
altitude.columns = ['altitude1', 'altitude2', 'altitude3', 'altitude4', 'altitude5']
altitude.reset_index(inplace = True)
altitude.head()

Unnamed: 0,adshex,altitude1,altitude2,altitude3,altitude4,altitude5
0,0100E4,0.15257,0.153676,0.29298,0.400774,0.0
1,0100F6,0.014414,0.054955,0.026126,0.844144,0.06036
2,01013D,0.075258,0.078701,0.655189,0.187408,0.003443
3,010141,0.110821,0.177209,0.560899,0.139571,0.0115
4,010153,0.117539,0.05877,0.22865,0.592287,0.002755


In [12]:
#get most common squawk code for each plane
sep_df = flight_data[['adshex', 'squawk']]

def get_mode(x): 
    m = pd.Series.mode(x); 
    return m.values[0] if not m.empty else np.nan
squawk = sep_df.groupby('adshex')['squawk'].agg(get_mode)

In [13]:
#move series into data frame and clean up
squawk.columns = ['squawk']
squawk = pd.Series.to_frame(squawk)
squawk.reset_index(inplace = True)
squawk.head()

Unnamed: 0,adshex,squawk
0,0100E4,6015.0
1,0100F6,4545.0
2,01013D,2215.0
3,010141,1132.0
4,010153,2235.0


In [14]:
#merge data frames into one
new_flight_data = altitude.merge(steer, on = 'adshex')
new_flight_data = new_flight_data.merge(squawk, on = 'adshex')

In [15]:
#format data to be the same as the training data
new_flight_data.rename(columns = {'squawk':'squawk_1'}, inplace = True)
new_flight_data.dropna(inplace = True)
new_flight_data['squawk_1'] = new_flight_data['squawk_1'].astype(int)
new_flight_data.head()

Unnamed: 0,adshex,altitude1,altitude2,altitude3,altitude4,altitude5,steer1,steer2,steer3,steer4,steer5,steer6,steer7,steer8,squawk_1
0,0100E4,0.15257,0.153676,0.29298,0.400774,0.0,0.004444,0.002778,0.031667,0.722778,0.197222,0.035556,0.003889,0.001667,6015
1,0100F6,0.014414,0.054955,0.026126,0.844144,0.06036,0.004517,0.00813,0.020777,0.720867,0.224029,0.01897,0.000903,0.001807,4545
2,01013D,0.075258,0.078701,0.655189,0.187408,0.003443,0.003453,0.001973,0.026147,0.75777,0.182042,0.02664,0.00148,0.000493,2215
3,010141,0.110821,0.177209,0.560899,0.139571,0.0115,0.003675,0.007349,0.052493,0.690289,0.1979,0.046194,0.0021,0.0,1132
4,010153,0.117539,0.05877,0.22865,0.592287,0.002755,0.00184,0.00276,0.032199,0.73965,0.180313,0.037718,0.0046,0.00092,2235


In [16]:
#format data to be the same as the training data
classify_new = new_flight_data[['steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'squawk_1', 'altitude3']]

In [17]:
# Load model

# This relies on output from a previous notebook!
# If this cell does not work, try using the pregenerated data instead
#model = pickle.load(open('/mnt/data/spyplane-data/pregenerated_SpyPlane-RandomForest.sav', 'rb'))
model = pickle.load(open('/mnt/data/spyplane-data/SpyPlane_RandomForest.sav', 'rb'))

In [18]:
#classify data
predictions = model.predict(classify_new)

In [19]:
#look at number of predicted spy planes
sum(predictions)

3

In [20]:
#get list of the probabilities
probability_pred = model.predict_proba(classify_new)

In [21]:
#add the surveillance plane classifications to the data frame and sort
classify_prob = new_flight_data.copy()
classify_prob.loc[:,'spy_prob'] = probability_pred[:,1]
classify_prob.sort_values(by = 'spy_prob', ascending=False, inplace=True)

In [22]:
classify_prob.head()

Unnamed: 0,adshex,altitude1,altitude2,altitude3,altitude4,altitude5,steer1,steer2,steer3,steer4,steer5,steer6,steer7,steer8,squawk_1,spy_prob
122,393D23,0.066863,0.932153,0.000983,0.0,0.0,0.121896,0.032731,0.015801,0.625282,0.032731,0.040632,0.034989,0.095937,5166,0.653404
2372,7CF7CA,1.0,0.0,0.0,0.0,0.0,0.13564,0.028007,0.031301,0.574959,0.004393,0.060406,0.042284,0.123009,7712,0.635537
1083,447AC7,1.0,0.0,0.0,0.0,0.0,0.362205,0.055118,0.028871,0.16273,0.007874,0.049869,0.068241,0.265092,7777,0.628085
2152,4CABA3,0.108721,0.168023,0.422674,0.049419,0.251163,0.040398,0.009368,0.075527,0.586066,0.19555,0.053864,0.01815,0.021077,5774,0.453901
2136,4CAAEC,0.999638,0.0,0.0,0.0,0.000362,0.009458,0.008003,0.069116,0.680975,0.153147,0.070207,0.005457,0.003638,4431,0.42822


In [23]:
#save the spy candidates data frame to a csv file
classify_prob.to_csv("/mnt/data/spyplane-data/new_flight_spy_candidates.csv", index = False)