In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import data
flights = pd.read_pickle("./flights_dataframe.p")
flights.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE,ORIGIN_AVGTEMP,ORIGIN_MAXTEMP,ORIGIN_MINTEMP,DESTINATION_AVGTEMP,DESTINATION_MAXTEMP,DESTINATION_MINTEMP,CANCELLED
0,ANC,SEA,2325.630321,35,35,27,33,42,26,0
1,ANC,SEA,2325.630321,35,35,27,33,42,26,0
2,ANC,SEA,2325.630321,35,35,27,33,42,26,0
3,ANC,SEA,2325.630321,35,35,27,33,42,26,1
4,ANC,SEA,2325.630321,35,35,27,33,42,26,0


In [3]:
flights.columns

Index(['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE', 'ORIGIN_AVGTEMP',
       'ORIGIN_MAXTEMP', 'ORIGIN_MINTEMP', 'DESTINATION_AVGTEMP',
       'DESTINATION_MAXTEMP', 'DESTINATION_MINTEMP', 'CANCELLED'],
      dtype='object')

In [4]:
cols_to_norm = ['DISTANCE', 'ORIGIN_AVGTEMP', 'ORIGIN_MAXTEMP', 'ORIGIN_MINTEMP',
                'DESTINATION_AVGTEMP', 'DESTINATION_MAXTEMP', 'DESTINATION_MINTEMP']

In [5]:
# Separate independent columns from the dependent column
x_data = flights.drop('CANCELLED',axis=1)
y_vals = flights['CANCELLED']

In [6]:
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x_data,y_vals,test_size=0.3,random_state=12345)

In [7]:
# Create a scaler
scaler = MinMaxScaler()

In [8]:
# Fit data to the scaler and tranform all the numeric columns
x_train[cols_to_norm] = pd.DataFrame(scaler.fit_transform(x_train[cols_to_norm]),
                                     index=x_train[cols_to_norm].index,
                                     columns=x_train[cols_to_norm].columns)
x_test[cols_to_norm] = pd.DataFrame(scaler.transform(x_test[cols_to_norm]),
                                    index=x_test[cols_to_norm].index,
                                    columns=x_test[cols_to_norm].columns)

In [9]:
x_train.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE,ORIGIN_AVGTEMP,ORIGIN_MAXTEMP,ORIGIN_MINTEMP,DESTINATION_AVGTEMP,DESTINATION_MAXTEMP,DESTINATION_MINTEMP
2692967,LAS,RNO,0.065508,0.778523,0.763158,0.727273,0.610738,0.631579,0.571429
2851850,HNL,ITO,0.039466,0.805369,0.782895,0.772727,0.771812,0.75,0.746753
1787302,DEN,SLC,0.074593,0.644295,0.671053,0.597403,0.644295,0.657895,0.590909
4375150,IAH,HOB,0.096919,0.798658,0.796053,0.753247,0.724832,0.769737,0.675325
2023007,LAX,LIH,0.522945,0.630872,0.605263,0.603896,0.771812,0.756579,0.766234


In [10]:
print(len(Counter(x_train['ORIGIN_AIRPORT'])))
print(len(Counter(x_train['DESTINATION_AIRPORT'])))

319
319


In [11]:
# Print for each numeric feature columns
for x in x_train[cols_to_norm].columns:
    print(str(x).lower() +' = tf.feature_column.numeric_column(\''+str(x)+'\')')

distance = tf.feature_column.numeric_column('DISTANCE')
origin_avgtemp = tf.feature_column.numeric_column('ORIGIN_AVGTEMP')
origin_maxtemp = tf.feature_column.numeric_column('ORIGIN_MAXTEMP')
origin_mintemp = tf.feature_column.numeric_column('ORIGIN_MINTEMP')
destination_avgtemp = tf.feature_column.numeric_column('DESTINATION_AVGTEMP')
destination_maxtemp = tf.feature_column.numeric_column('DESTINATION_MAXTEMP')
destination_mintemp = tf.feature_column.numeric_column('DESTINATION_MINTEMP')


In [12]:
# Feature columns
origin_airport = tf.feature_column.categorical_column_with_hash_bucket(
    'ORIGIN_AIRPORT', hash_bucket_size = 320)
destination_airport = tf.feature_column.categorical_column_with_hash_bucket(
    'DESTINATION_AIRPORT', hash_bucket_size = 320)
distance = tf.feature_column.numeric_column('DISTANCE')
origin_avgtemp = tf.feature_column.numeric_column('ORIGIN_AVGTEMP')
origin_maxtemp = tf.feature_column.numeric_column('ORIGIN_MAXTEMP')
origin_mintemp = tf.feature_column.numeric_column('ORIGIN_MINTEMP')
destination_avgtemp = tf.feature_column.numeric_column('DESTINATION_AVGTEMP')
destination_maxtemp = tf.feature_column.numeric_column('DESTINATION_MAXTEMP')
destination_mintemp = tf.feature_column.numeric_column('DESTINATION_MINTEMP')

In [13]:
for i in x_train.columns:
    print(i.lower(),end=", ")

origin_airport, destination_airport, distance, origin_avgtemp, origin_maxtemp, origin_mintemp, destination_avgtemp, destination_maxtemp, destination_mintemp, 

In [14]:
# Create a list for the feature columns
feat_cols = [origin_airport, destination_airport, distance,
             origin_avgtemp, origin_maxtemp, origin_mintemp,
             destination_avgtemp, destination_maxtemp, destination_mintemp]

In [15]:
# Creat input function with train data and train specifications
input_func = tf.estimator.inputs.pandas_input_fn(x=x_train,y=y_train, batch_size=10,
                                                 num_epochs=1000,shuffle=True)

In [16]:
# Create model
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp5s58p43c', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff4f9c11fd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
# Train the model
model.train(input_fn=input_func,steps=1000)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp5s58p43c/model.ckpt.
INFO:tensorflow:loss = 6.931472, step = 1
INFO:tensorflow:global_step/sec: 235.74
INFO:tensorflow:loss = 0.29012513, step = 101 (0.427 sec)
INFO:tensorflow:global_step/sec: 359.994
INFO:tensorflow:loss = 0.109917864, step = 201 (0.281 sec)
INFO:tensorflow:global_step/sec: 349.542
INFO:tensorflow:loss = 0.1658538, step = 301 (0.285 sec)
INFO:tensorflow:global_step/sec: 312.709
INFO:tensorflow:loss = 0.36847347, step = 401 (0.319 sec)
IN

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7ff4f9c11518>

In [18]:
# Input function for evaluation
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test ,y=y_test, batch_size=10, num_epochs=1,
    shuffle=False)

In [19]:
# Evaluate the model
evaluation = model.evaluate(eval_input_func)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-10-20-13:59:50
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp5s58p43c/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-10-20-14:06:27
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.98361623, accuracy_baseline = 0.98361623, auc = 0.7077249, auc_precision_recall = 0.053510875, average_loss = 0.07825911, global_step = 1000, label/mean = 0.016383793, loss = 0.7825875, precision = 0.0, prediction/mean = 0.01443754, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmp5s58p43c/model.ckpt-1000


In [20]:
evaluation

{'accuracy': 0.98361623,
 'accuracy_baseline': 0.98361623,
 'auc': 0.7077249,
 'auc_precision_recall': 0.053510875,
 'average_loss': 0.07825911,
 'label/mean': 0.016383793,
 'loss': 0.7825875,
 'precision': 0.0,
 'prediction/mean': 0.01443754,
 'recall': 0.0,
 'global_step': 1000}

In [21]:
pred_fn = tf.estimator.inputs.pandas_input_fn(
    x=x_test,batch_size=len(x_test),shuffle=False)

In [22]:
pred_gen = model.predict(input_fn=pred_fn)

In [23]:
predictions = list(pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp5s58p43c/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [24]:
final_preds = [pred['class_ids'][0] for pred in predictions]

In [25]:
print(classification_report(y_test,final_preds))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1502339
           1       0.00      0.00      0.00     25024

   micro avg       0.98      0.98      0.98   1527363
   macro avg       0.49      0.50      0.50   1527363
weighted avg       0.97      0.98      0.98   1527363



In [26]:
Counter(final_preds)

Counter({0: 1527363})

So the LinearClassifier clearly failed in this task, with these inputs. Lets try a DNNClassifier:

In [27]:
# Create embedded columns
embedded_origin_airport = tf.feature_column.embedding_column(
    origin_airport,dimension = 4)
embedded_destination_airport = tf.feature_column.embedding_column(
    destination_airport,dimension = 4)

In [28]:
# Create a new list for the feature columns
feat_cols = [embedded_origin_airport, embedded_destination_airport,
             distance, origin_avgtemp, origin_maxtemp, origin_mintemp,
             destination_avgtemp, destination_maxtemp, destination_mintemp]

In [29]:
input_func = tf.estimator.inputs.pandas_input_fn(
    x_train,y_train,batch_size=10,num_epochs=1000,shuffle=True)

In [30]:
dnn_model = tf.estimator.DNNClassifier(
    hidden_units=[9,9,9,9],feature_columns=feat_cols,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpl1w304s0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff3c16f7278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [31]:
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpl1w304s0/model.ckpt.
INFO:tensorflow:loss = 8.180362, step = 1
INFO:tensorflow:global_step/sec: 263.651
INFO:tensorflow:loss = 3.9235096, step = 101 (0.380 sec)
INFO:tensorflow:global_step/sec: 423.264
INFO:tensorflow:loss = 0.13816127, step = 201 (0.238 sec)
INFO:tensorflow:global_step/sec: 411.798
INFO:tensorflow:loss = 0.21365617, step = 301 (0.244 sec)
INFO:tensorflow:global_step/sec: 374.419
INFO:tensorflow:loss = 0.13298939, step = 401 (0.266 sec)
INFO:tensorflow:global_step/sec: 381.229
INFO:tensorflow:loss = 0.14135225, step = 501 (0.262 sec)
INFO:tensorflow:global_step/sec: 350.914
INFO:tensorflow:loss = 0.2775678, step = 601 (0.285 sec)
INFO:tensorflow:global_step/sec: 388.629
INFO:tensorfl

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7ff3c16f7048>

In [32]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test,y=y_test,batch_size=10,num_epochs=1,shuffle=False)

In [33]:
pred_gen = dnn_model.predict(input_fn=eval_input_func)

In [34]:
predictions = list(pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpl1w304s0/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [35]:
final_preds = [pred['class_ids'][0] for pred in predictions]

In [36]:
print(classification_report(y_test,final_preds))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1502339
           1       0.00      0.00      0.00     25024

   micro avg       0.98      0.98      0.98   1527363
   macro avg       0.49      0.50      0.50   1527363
weighted avg       0.97      0.98      0.98   1527363



In [37]:
Counter(final_preds)

Counter({0: 1527363})