# Create and Use model

This notebook pulls in the model from the SpyPlane-OptimizingModel.ipynb notebook and uses that model to train the remaining data

This projects is based off the Buzzfeed news article on identifying spy planes found [here](https://www.buzzfeednews.com/article/peteraldhous/hidden-spy-planes), using the data and code adapted from their github repository [here](https://github.com/BuzzFeedNews/2017-08-spy-plane-finder).

In [1]:
%matplotlib inline
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#sci-kit learn is a library with machine learning algorithms
from sklearn.ensemble import RandomForestClassifier

#package for saving our ML model
import pickle

In [2]:
#read in data
planes_labeled = pd.read_csv("/mnt/data/planes_labeled.csv")

In [3]:
#format data by removing non-numeric columnns and factorize the class
X = planes_labeled[['steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'squawk_1', 'altitude3']]
y = pd.factorize(planes_labeled['class'])[0]

In [4]:
# Create a model based on parameters from the random grid search
np.random.seed(415)
model_tuned = RandomForestClassifier(n_estimators=1100, max_depth=50, max_features='sqrt', 
                               min_samples_split=4, bootstrap=False)

#train model with only features above
model_tuned.fit(X, y)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=1100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Classify all data

#### Gather and format data
First we'll remove all of the training data and the known federal planes from the entire data set (which is in the planes_features file.

In [5]:
#read in all data
feds = pd.read_csv("/mnt/data/feds.csv")
train = pd.read_csv("/mnt/data/train.csv")
planes = pd.read_csv('/mnt/data/planes_features.csv')

In [6]:
#first gather list of federal plane identifiers to remove
fed_ids = list(feds['adshex'])
len(fed_ids)

199

In [7]:
#next gather list of training list identifiers to remove 
remove = fed_ids + list(train['adshex'])
len(remove)

796

In [8]:
classify = planes[~planes['adshex'].isin(remove)] 

In [9]:
#look at number of rows and columns in the raw data 
classify.shape

(19160, 33)

In [10]:
X_all = classify[['steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'squawk_1', 'altitude3']]

In [11]:
X_all.head()

Unnamed: 0,steer1,steer2,steer4,steer5,steer6,squawk_1,altitude3
0,0.020211,0.048913,0.34409,0.097317,0.186651,0,0.066831
1,0.034976,0.048127,0.356314,0.116116,0.159325,0,0.129648
3,0.029871,0.044118,0.380515,0.094669,0.182904,0,0.149816
4,0.019048,0.049206,0.326984,0.112698,0.206349,1135,0.039683
5,0.001582,0.009494,0.416139,0.112342,0.169304,2356,0.178797


#### Make Predictions

The we'll use our model from the previous section to output those planes that were determined to be potential surveillance plans and join this data with the [FAA aircraft registration database](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download/), which gives the planes’ registration numbers and the organizations they are registered to.|

In [12]:
real_predictions = model_tuned.predict(X_all)

In [13]:
#look at number of predicted spy planes
sum(real_predictions)

282

In [14]:
#create data frame with only those potential spy planes
candidates = classify[real_predictions == 1]

In [15]:
candidates.head()

Unnamed: 0,adshex,duration1,duration2,duration3,duration4,duration5,boxes1,boxes2,boxes3,boxes4,...,steer3,steer4,steer5,steer6,steer7,steer8,flights,squawk_1,observations,type
25,A001FC,0.222222,0.222222,0.222222,0.111111,0.222222,0.222222,0.222222,0.111111,0.222222,...,0.268151,0.292352,0.090029,0.141336,0.011617,0.013553,9,7754,1033,BE35
41,A00324,0.0,0.0,0.0,0.235294,0.764706,0.0,0.735294,0.176471,0.029412,...,0.373111,0.130127,0.091131,0.120319,0.00427,0.002393,34,5347,21310,C208
86,A005D8,0.241379,0.137931,0.137931,0.103448,0.37931,0.241379,0.068966,0.310345,0.206897,...,0.335145,0.193823,0.015442,0.070043,0.007288,0.016553,29,5011,8095,unknown
123,A008A4,0.27957,0.112903,0.134409,0.11828,0.354839,0.263441,0.263441,0.284946,0.134409,...,0.196159,0.243768,0.035043,0.097058,0.012515,0.025235,186,0,19576,C208
130,A00948,0.090909,0.545455,0.272727,0.090909,0.0,0.272727,0.545455,0.181818,0.0,...,0.262551,0.179783,0.090231,0.158073,0.010855,0.006106,11,0,1474,unknown


In [16]:
#read in FAA data
faa = pd.read_csv("/mnt/data/faa-registration.csv")
faa.head()

Unnamed: 0,N-NUMBER,SERIAL NUMBER,MFR MDL CODE,ENG MFR MDL,YEAR MFR,TYPE REGISTRANT,NAME,STREET,STREET2,CITY,...,OTHER NAMES(2),OTHER NAMES(3),OTHER NAMES(4),OTHER NAMES(5),EXPIRATION DATE,UNIQUE ID,KIT MFR,KIT MODEL,MODE S CODE HEX,X35
0,1,1071,3980115,54556.0,1988.0,5.0,FEDERAL AVIATION ADMINISTRATION,WASHINGTON REAGAN NATIONAL ARPT,3201 THOMAS AVE HANGAR 6,WASHINGTON,...,,,,,20191130.0,524101,,,A00001,
1,100,5334,7100510,17003.0,1940.0,1.0,BENE MARY D,PO BOX 329,,KETCHUM,...,,,,,20200430.0,600060,,,A004B3,
2,10001,A28,9601202,67007.0,1928.0,1.0,PERRY AARON O,PO BOX 736,,MULBERRY,...,,,,,20190630.0,432072,,,A00726,
3,10002,79-030,8930105,41525.0,1979.0,4.0,ENGLISH MARK,655 DOESKIN TRL,,SANTA MARIA,...,,,,,20180131.0,831480,,,A00727,
4,10003,1,056336T,,,1.0,CAMPBELL CHARLES N,604 CORDOVA CT,,SALISBURY,...,,,,,20180331.0,1173853,,,A00728,


In [17]:
#look at the column names in the FAA registration dataframe
faa.columns

Index(['N-NUMBER', 'SERIAL NUMBER', 'MFR MDL CODE', 'ENG MFR MDL', 'YEAR MFR',
       'TYPE REGISTRANT', 'NAME', 'STREET', 'STREET2', 'CITY', 'STATE',
       'ZIP CODE', 'REGION', 'COUNTY', 'COUNTRY', 'LAST ACTION DATE',
       'CERT ISSUE DATE', 'CERTIFICATION', 'TYPE AIRCRAFT', 'TYPE ENGINE',
       'STATUS CODE', 'MODE S CODE', 'FRACT OWNER', 'AIR WORTH DATE',
       'OTHER NAMES(1)', 'OTHER NAMES(2)', 'OTHER NAMES(3)', 'OTHER NAMES(4)',
       'OTHER NAMES(5)', 'EXPIRATION DATE', 'UNIQUE ID', 'KIT MFR',
       'KIT MODEL', 'MODE S CODE HEX', 'X35'],
      dtype='object')

In [18]:
#seperate out the columns we want to use
plane_info = faa[['N-NUMBER', 'NAME', 'MODE S CODE HEX']].copy()
plane_info.rename(columns = {'N-NUMBER':'n_number', 'NAME':'name', 'MODE S CODE HEX':'adshex'}, inplace = True) 

In [19]:
spy_candidates = candidates.merge(plane_info, on = 'adshex', how = 'left')

#### Look at predicted probabilities

Here, we'll calculate the probabilities and sort them in descending order.

In [20]:
#get list of the probabilities
probability_pred = model_tuned.predict_proba(X_all)

In [21]:
#add the surveillance plane classifications to the data frame and sort
classify_prob = classify.copy()
classify_prob.loc[:,'spy_prob'] = probability_pred[:,1]
classify_prob.sort_values(by = 'spy_prob', ascending=False, inplace=True)

In [22]:
classify_prob.head()

Unnamed: 0,adshex,duration1,duration2,duration3,duration4,duration5,boxes1,boxes2,boxes3,boxes4,...,steer4,steer5,steer6,steer7,steer8,flights,squawk_1,observations,type,spy_prob
2189,A13098,0.166667,0.166667,0.166667,0.083333,0.416667,0.25,0.583333,0.166667,0.0,...,0.300802,0.019331,0.085809,0.010372,0.028289,12,4415,2121,unknown,0.999394
12218,A7F52E,0.142857,0.095238,0.142857,0.333333,0.285714,0.666667,0.095238,0.238095,0.0,...,0.268428,0.009272,0.057487,0.012517,0.035234,21,4334,2157,unknown,0.990909
19689,ADF7A5,0.152778,0.055556,0.125,0.263889,0.402778,0.5,0.444444,0.055556,0.0,...,0.078923,0.013625,0.051148,0.00807,0.021434,72,4414,19082,unknown,0.986515
86,A005D8,0.241379,0.137931,0.137931,0.103448,0.37931,0.241379,0.068966,0.310345,0.206897,...,0.193823,0.015442,0.070043,0.007288,0.016553,29,5011,8095,unknown,0.979394
12036,A7D925,0.121212,0.141414,0.070707,0.070707,0.59596,0.212121,0.515152,0.242424,0.030303,...,0.096653,0.015661,0.047095,0.004015,0.00925,99,230,45079,T206,0.975909


In [23]:
#merge with FAA names and registration numbers
classify_prob_faa= classify_prob.merge(plane_info, on = 'adshex', how = 'left')

In [24]:
#seperate out only those rows with probabilites greater than 0.5 and the relevant columns
relevant_cols = ['adshex', 'type', 'spy_prob', 'n_number', 'name', 'squawk_1', 'steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'altitude3']
candidates_with_prob = classify_prob_faa.loc[classify_prob_faa['spy_prob'] > 0.5, relevant_cols]

In [25]:
#look at the top 15 results
candidates_with_prob.head(15)

Unnamed: 0,adshex,type,spy_prob,n_number,name,squawk_1,steer1,steer2,steer4,steer5,steer6,altitude3
0,A13098,unknown,0.999394,,,4415,0.151344,0.176803,0.300802,0.019331,0.085809,0.665724
1,A7F52E,unknown,0.990909,61122,SRT AVIATION AND TECHNICAL SERVICES INC,4334,0.170144,0.205841,0.268428,0.009272,0.057487,0.585999
2,ADF7A5,unknown,0.986515,,,4414,0.274709,0.310816,0.078923,0.013625,0.051148,0.451473
3,A005D8,unknown,0.979394,,,5011,0.105991,0.22063,0.193823,0.015442,0.070043,0.018653
4,A7D925,T206,0.975909,6045C,ORANGE COUNTY SHERIFFS DEPARTMENT RNSP,230,0.16684,0.315047,0.096653,0.015661,0.047095,0.345793
5,A5DD36,C208,0.961667,477XP,PARACLETE AVIATION LLC,4444,0.118906,0.183848,0.174916,0.039449,0.124488,0.044846
6,A657C9,C208,0.94303,508BH,CITY OF ANAHEIM,4301,0.108156,0.139862,0.226881,0.057274,0.146851,0.518197
7,A954A6,T206,0.940303,700AZ,PIMA COUNTY SHERIFFS DEPARTMENT,0,0.146749,0.153679,0.308205,0.02407,0.089774,0.675872
8,ABF404,unknown,0.921818,87AG,COMMONWEALTH OF PENNSYLVANIA,0,0.123596,0.198502,0.312734,0.06367,0.050562,0.580524
9,A861A5,B350,0.894545,,,4407,0.041726,0.079378,0.386335,0.053196,0.175962,0.165157


In [26]:
#save the spy candidates data frame to a csv file
candidates_with_prob.to_csv("/mnt/data/spy_candidates.csv", index = False)

In [27]:
#save the confirmed federal surveillance planes with their relevant data to file
feds_data = planes_labeled[planes_labeled['adshex'].isin(fed_ids)]
feds_data = feds_data[['adshex', 'steer1', 'steer2', 'steer4', 'steer5', 'steer6', 'squawk_1', 'altitude3']]
feds_data.to_csv('/mnt/data/feds_data.csv')

In [28]:
# save the model to disk
file_loc = '/mnt/data/SpyPlane-RandomForest.sav'
pickle.dump(model_tuned, open(file_loc, 'wb'))