# Feature Engineering

This notebook takes new data from [ADS-B Exchange](https://www.adsbexchange.com/) collected on April 1, 2018 and engineers features from it in the same way as the BuzzFeed preparation.

In [None]:
#import packages that we'll need
import pandas as pd
import numpy as np  
import scipy
import matplotlib.pyplot as plt
import pickle

In [None]:
#load the three raw flight CSV files, one data frame per file
csv_paths = [
    "/mnt/data/flights1.csv",
    "/mnt/data/flights2.csv",
    "/mnt/data/flights3.csv",
]
flights1, flights2, flights3 = [
    pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths
]

In [None]:
#combine the three flight files into a single data frame
#ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
#repeating each file's own row labels, so every row has a unique label
flight_data = pd.concat([flights1, flights2, flights3], ignore_index=True)

In [None]:
#preview the first five rows of the combined data
flight_data.head(n=5)

In [None]:
#order the rows by 'adshex' (aircraft ID), then by 'PosTime' within each aircraft
flight_data = flight_data.sort_values(by=['adshex', 'PosTime'])

In [None]:
#Create a column, 'steer', which is the change in compass bearing from the previous transponder detection
#for that aircraft; negative values indicate a turn to the left, positive values a turn to the right.
#  * groupby('adshex').diff() computes current - previous within one aircraft, so the first detection of
#    each aircraft is NaN rather than being compared with a different aircraft's track.
#  * the raw difference is wrapped into (-180, 180] so that crossing north (e.g. 350 -> 10) counts as a
#    20 degree right turn instead of -340, which would fall outside the -180..180 steer bins.
#Assumes the rows of each aircraft are already in time order and that 'track' is a bearing in degrees.
track_change = flight_data.groupby('adshex')['track'].diff()
flight_data['steer'] = 180 - (180 - track_change) % 360
flight_data.head()

In [None]:
#Create steer bins, which are the proportion of steer values for each aircraft falling into bins set manually,
#using the breaks: -180, -25, -10, -1, 0, 1, 22, 45, 180
steer_breaks = [-180, -25, -10, -1, 0, 1, 22, 45, 180]
steer_bin = pd.cut(flight_data['steer'], bins=steer_breaks)

#count detections per aircraft and bin; observed=False is passed explicitly so that every bin stays
#a column even when no detection falls in it (the table then always has one column per bin)
steer = (flight_data['steer']
         .groupby([flight_data['adshex'], steer_bin], observed=False)
         .size()
         .unstack(fill_value=0))

#divide each aircraft's counts by its row total to turn them into proportions
steer = steer.div(steer.sum(axis=1), axis=0)

In [None]:
#name the eight steer-bin columns, then move the 'adshex' index back into a regular column
steer.columns = [
    'steer1', 'steer2', 'steer3', 'steer4',
    'steer5', 'steer6', 'steer7', 'steer8',
]
steer = steer.reset_index()
steer.head(n=5)

In [None]:
#split the altitude values into five quantile bins and keep the bin edges as a plain list
q, quintile_edges = pd.qcut(flight_data['altitude'], q=5, retbins=True)
alt_bins = [edge for edge in quintile_edges]
alt_bins

In [None]:
#compute, for each aircraft, the proportion of its altitude readings that fall in each of the five bins
#include_lowest=True closes the first bin on the left, so readings equal to the lowest edge
#(the minimum altitude, since the edges come from qcut on the same column) are counted instead of
#being turned into NaN and dropped
altitude_bin = pd.cut(flight_data['altitude'], bins=alt_bins, include_lowest=True)

#observed=False is passed explicitly so that all five bins stay as columns even if one is empty
altitude = (flight_data['altitude']
            .groupby([flight_data['adshex'], altitude_bin], observed=False)
            .size()
            .unstack(fill_value=0))

#divide each aircraft's counts by its row total to turn them into proportions
altitude = altitude.div(altitude.sum(axis=1), axis=0)

In [None]:
#name the five altitude-bin columns, then move the 'adshex' index back into a regular column
altitude.columns = [
    'altitude1', 'altitude2', 'altitude3',
    'altitude4', 'altitude5',
]
altitude = altitude.reset_index()
altitude.head(n=5)

In [None]:
#get most common squawk code for each plane: start from just the aircraft ID and squawk columns
sep_df = flight_data.loc[:, ['adshex', 'squawk']]

def get_mode(x):
    """Return the most frequent value of the Series x, or NaN if it has no mode.

    When several values tie, the first entry of the mode result is returned.
    """
    modes = pd.Series.mode(x)
    if modes.empty:
        #nothing to report (e.g. x is empty or holds only missing values)
        return np.nan
    return modes.values[0]
#apply get_mode to the squawk codes of each aircraft
squawk_by_aircraft = sep_df.groupby('adshex')['squawk']
squawk = squawk_by_aircraft.aggregate(get_mode)

In [None]:
#move the per-aircraft squawk Series into a data frame with 'adshex' and 'squawk' columns
#(the column name is given to to_frame, because assigning .columns on a Series does not rename anything)
squawk = squawk.to_frame(name='squawk').reset_index()
squawk.head()

In [None]:
#join the altitude, steer and squawk tables on 'adshex'
#(merge defaults to an inner join, so only aircraft present in all three tables remain)
new_flight_data = (
    altitude
    .merge(steer, on='adshex')
    .merge(squawk, on='adshex')
)

In [None]:
#format data to be the same as the training data:
#rename 'squawk' to 'squawk_1', drop rows with any missing value, and store the squawk code as an integer
new_flight_data = (
    new_flight_data
    .rename(columns={'squawk': 'squawk_1'})
    .dropna()
    .astype({'squawk_1': int})
)
new_flight_data.head(n=5)

In [None]:
#keep only the columns that are passed to the model, in this order
model_features = ['steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'squawk_1', 'altitude3']
classify_new = new_flight_data.loc[:, model_features]

In [None]:
#load the trained model from disk
#NOTE(review): pickle.load can execute arbitrary code, so only load this file if it comes from a trusted source
model_loc = '/mnt/data/SpyPlane-RandomForest.sav'
#open the file in a with-block so the handle is closed as soon as the model has been read
with open(model_loc, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
#classify each row of classify_new (one row per aircraft) with the loaded model
predictions = model.predict(classify_new)

In [None]:
#look at number of predicted spy planes by adding up the predicted labels
#NOTE(review): this is a count only if the spy class is encoded as 1 and the other class as 0 -- confirm
sum(predictions)

In [None]:
#get the predicted probabilities for each row of classify_new
#(presumably one column per class; column 1 is used as the spy probability further down)
probability_pred = model.predict_proba(classify_new)

In [None]:
#attach column 1 of the predicted probabilities as 'spy_prob' on a copy of the feature table,
#then order the aircraft from highest to lowest 'spy_prob'
classify_prob = (
    new_flight_data
    .assign(spy_prob=probability_pred[:, 1])
    .sort_values(by='spy_prob', ascending=False)
)

In [None]:
#preview the five aircraft with the highest 'spy_prob'
classify_prob.head(n=5)

In [None]:
#write the ranked spy candidates to a csv file, without the row index
output_path = "/mnt/data/new_flight_spy_candidates.csv"
classify_prob.to_csv(output_path, index=False)