In [5]:
#Load the google.cloud.bigquery IPython extension.
#NOTE(review): the cells below use the `bq` magic to define and run the query; confirm this extension is still required.
%load_ext google.cloud.bigquery

In [3]:
!pip install keras #keras wasn't installed on my datalab at first

Collecting keras
  Using cached https://files.pythonhosted.org/packages/34/7d/b1dedde8af99bd82f20ed7e9697aac0597de3049b1f786aa2aac3b9bd4da/Keras-2.2.2-py2.py3-none-any.whl
Collecting keras-preprocessing==1.0.2 (from keras)
  Using cached https://files.pythonhosted.org/packages/71/26/1e778ebd737032749824d5cba7dbd3b0cf9234b87ab5ec79f5f0403ca7e9/Keras_Preprocessing-1.0.2-py2.py3-none-any.whl
Collecting keras-applications==1.0.4 (from keras)
  Using cached https://files.pythonhosted.org/packages/54/90/8f327deaa37a71caddb59b7b4aaa9d4b3e90c0e76f8c2d1572005278ddc5/Keras_Applications-1.0.4-py2.py3-none-any.whl
Installing collected packages: keras-preprocessing, keras-applications, keras
Successfully installed keras-2.2.2 keras-applications-1.0.4 keras-preprocessing-1.0.2


In [6]:
import pandas as pd   #Pandas are cute and fuzzy -- https://pandas.pydata.org/ 
import seaborn as sb  #Seaborn is good for charts/graphs -- https://seaborn.pydata.org/
import numpy as np  #numpy has numerous functions that make difficult math easy -- http://www.numpy.org/ 
import matplotlib.pyplot as plt #Matplotlib/pyplot is used to determine the width/height of resulting graphs 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
%%bq query -n get_data
-- Columns used as model inputs (DISTANCE, DEP_DELAY, TAXI_OUT) plus ARR_DELAY,
-- from which the DELAY label is derived later in the notebook.
SELECT
  DISTANCE,
  DEP_DELAY,
  TAXI_OUT,
  ARR_DELAY
FROM
  `acnskunks-gcp-esy.acnskunksesy.flight_data`
WHERE
  -- presumably 0 means "not cancelled" / "not diverted"; verify against the table schema
  CANCELLED = 0
  AND DIVERTED = 0

In [8]:
#Execute the get_data query staged above, store as dataframe
df = %bq execute --to-dataframe -q get_data
print(type(df))
df.head(5)
df.describe()
df.count()

<class 'pandas.core.frame.DataFrame'>


DISTANCE     3309846
DEP_DELAY    3309846
TAXI_OUT     3309846
ARR_DELAY    3309846
dtype: int64

In [11]:
#Preview the first rows of the query result (head() returns 5 rows by default)
df.head()

Unnamed: 0,DISTANCE,DEP_DELAY,TAXI_OUT,ARR_DELAY
0,626.0,-17.0,4.0,-20.0
1,213.0,-25.0,4.0,-24.0
2,213.0,-19.0,5.0,-18.0
3,725.0,-19.0,4.0,-24.0
4,725.0,-21.0,3.0,-25.0


In [10]:
#Cache the query result as a CSV so the data does not have to be re-queried after a restart.
#The row index is written too; it comes back as the 'Unnamed: 0' column when the file is read again.
df.to_csv('alldata.csv', index=True)

In [14]:
#Read the cached CSV back into a new dataframe and preview its first rows
dataframe = pd.read_csv("alldata.csv")
dataframe.head()

Unnamed: 0.1,Unnamed: 0,DISTANCE,DEP_DELAY,TAXI_OUT,ARR_DELAY
0,0,626.0,-17.0,4.0,-20.0
1,1,213.0,-25.0,4.0,-24.0
2,2,213.0,-19.0,5.0,-18.0
3,3,725.0,-19.0,4.0,-24.0
4,4,725.0,-21.0,3.0,-25.0


In [15]:
#My model predicts whether a flight arrives more than 15 minutes late (thereby classifying it as "DELAYED").
#To build that label, I need to create a new column, populated with 1 if the flight is delayed or 0 if it is not.

def delay(row, threshold=15):
  """Label a single flight record as delayed (1) or not delayed (0).

  Args:
    row: Mapping-like record (for example a dataframe row) that exposes
      an 'ARR_DELAY' entry.
    threshold: Arrival delay that must be strictly exceeded for the flight
      to count as delayed. Defaults to 15, the cutoff that used to be
      hard-coded.

  Returns:
    1 if row['ARR_DELAY'] is greater than threshold, otherwise 0. A delay
    exactly equal to the threshold, or a NaN delay, yields 0.
  """
  return 1 if row['ARR_DELAY'] > threshold else 0

In [16]:
#Add a new column to my dataframe, 1 if delayed (ARR_DELAY greater than 15), 0 if not.
#One vectorized comparison replaces dataframe.apply(delay, axis=1), which called a Python
#function once per row (more than 3.3 million calls on this data) to get the same 0/1 values.
dataframe['DELAY'] = (dataframe['ARR_DELAY'] > 15).astype(int)

In [18]:
#The dataframe now carries a "DELAY" column holding 0 or 1; show the first ten rows to confirm
dataframe.head(n=10)

Unnamed: 0.1,Unnamed: 0,DISTANCE,DEP_DELAY,TAXI_OUT,ARR_DELAY,DELAY
0,0,626.0,-17.0,4.0,-20.0,0
1,1,213.0,-25.0,4.0,-24.0,0
2,2,213.0,-19.0,5.0,-18.0,0
3,3,725.0,-19.0,4.0,-24.0,0
4,4,725.0,-21.0,3.0,-25.0,0
5,5,539.0,-14.0,3.0,-18.0,0
6,6,204.0,66.0,5.0,60.0,1
7,7,539.0,52.0,5.0,60.0,1
8,8,399.0,22.0,4.0,20.0,1
9,9,198.0,-25.0,3.0,-24.0,0


In [19]:
#Keras needs two sets of data: 1) the input data, which in my case is DISTANCE, DEP_DELAY and TAXI_OUT, and 2) the output data (my prediction), which is DELAY.
#As a first step, convert the whole dataframe into a NumPy array.
#Per the dataframe preview above, the array columns are: 0='Unnamed: 0', 1=DISTANCE, 2=DEP_DELAY, 3=TAXI_OUT, 4=ARR_DELAY, 5=DELAY
X = dataframe.values
print(X)

[[ 0.000000e+00  6.260000e+02 -1.700000e+01  4.000000e+00 -2.000000e+01
   0.000000e+00]
 [ 1.000000e+00  2.130000e+02 -2.500000e+01  4.000000e+00 -2.400000e+01
   0.000000e+00]
 [ 2.000000e+00  2.130000e+02 -1.900000e+01  5.000000e+00 -1.800000e+01
   0.000000e+00]
 ...
 [ 3.309843e+06  1.164000e+03 -1.000000e+00  3.300000e+01  1.100000e+01
   0.000000e+00]
 [ 3.309844e+06  1.090000e+02 -1.000000e+00  3.300000e+01  7.000000e+00
   0.000000e+00]
 [ 3.309845e+06  1.090000e+02 -1.000000e+00  3.300000e+01  4.000000e+00
   0.000000e+00]]


In [21]:
#Extract DISTANCE, DEP_DELAY and TAXI_OUT as the model inputs, and DELAY as the label.
#Columns are selected by name rather than by position: the old slices X[:, 1:4] and X[:, 5:6]
#were only correct while the CSV round trip kept its extra leading 'Unnamed: 0' column, and any
#change in column layout would silently hand the wrong columns (for example ARR_DELAY, which
#defines the label) to the model.
Input = dataframe[['DISTANCE', 'DEP_DELAY', 'TAXI_OUT']].values.astype(float)
print(Input[0:10])
#Keep the label as an (n, 1) float array, the same shape and dtype the positional slice produced
Output = dataframe[['DELAY']].values.astype(float)
print(Output[0:10])

[[626. -17.   4.]
 [213. -25.   4.]
 [213. -19.   5.]
 [725. -19.   4.]
 [725. -21.   3.]
 [539. -14.   3.]
 [204.  66.   5.]
 [539.  52.   5.]
 [399.  22.   4.]
 [198. -25.   3.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]]


In [22]:
#Training settings, gathered here so they are easy to find and tune
SEED = 7
EPOCHS = 4
BATCH_SIZE = 32

#Seed NumPy before building the model so that reruns are comparable.
#NOTE(review): this seeds NumPy only; the TensorFlow backend may still introduce run-to-run
#variation. Confirm against the Keras reproducibility FAQ if exact repeatability is required.
np.random.seed(SEED)

#Create my model
model = Sequential()
#Create the first layer: 3 units fed by my 3-dimensional input data (DISTANCE, DEP_DELAY, TAXI_OUT).
model.add(Dense(3, input_dim=3, kernel_initializer='normal', activation='relu'))
#Create my output layer with one output variable.  Sigmoid activation is generally used for binary classification problems
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#Train my model with my Input and Output vectors.
#Keeping the returned History object gives access to the per-epoch loss/accuracy and
#stops its repr from being echoed as the cell output.
history = model.fit(Input, Output, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fc0d6c09fd0>

In [23]:
#Print the layer-by-layer summary of the model: output shapes and parameter counts
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 4         
Total params: 16
Trainable params: 16
Non-trainable params: 0
_________________________________________________________________


In [24]:
#Try the model on one hand-built sample, given in the same column order as the training inputs: [DISTANCE, DEP_DELAY, TAXI_OUT].
#The sigmoid output lies between 0 and 1: values near 1 mean "delayed", values near 0 mean "not delayed".
test = np.array([[600, 40, 10]], dtype=float)
print(model.predict(test))

[[0.91524994]]
