<a href="https://colab.research.google.com/github/dangexpert/FlightDelayPrediction/blob/master/FlightDelayPrediction_LogRegModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [0]:
# Make Google Drive visible inside the Colab runtime; the dataset CSV is read
# from the mounted Drive further down.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [0]:
# Load the raw flight records from the mounted Drive.
# The path is absolute (Drive is mounted at /content/drive above) so the read
# does not silently depend on the kernel's current working directory.
flight = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Flight_data.csv')

# Work on a copy so the untouched raw frame stays available for comparison.
flight_df = flight.copy()

In [0]:
# Print column names, dtypes, non-null counts and memory usage of the working copy.
flight_df.info()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
flight_df.describe()

### Data Preprocessing

***1. Encode categorical data: OP_UNIQUE_CARRIER (one-hot) and DEP_TIME_BLK (label-encoded). ORIGIN and DEST are currently excluded from the feature set.***

In [0]:
# Columns used for modeling. ARR_DELAY_NEW is kept here because the binary
# label is derived from it further down. ORIGIN and DEST are currently left out.
#
# .copy() makes `features` an independent frame rather than a slice of
# flight_df, so the column re-encoding in the next cell cannot trigger
# pandas' SettingWithCopyWarning or write through to flight_df.
features = flight_df[['MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAXI_IN', 'TAXI_OUT', 'DEP_TIME_BLK', 'AIR_TIME', 'ARR_DELAY_NEW']].copy()

In [0]:
labelen = LabelEncoder()

# Label-encode the departure time block into integer codes.
# The encoded array is attached with .assign(), which
#   * returns a new frame instead of writing into a frame that was sliced
#     from flight_df (no SettingWithCopyWarning), and
#   * assigns the values positionally; wrapping them in a fresh pd.Series
#     would align its 0..n-1 index against `features`' index and produce NaN
#     wherever the two do not match.
features = features.assign(
    DEP_TIME_BLK=labelen.fit_transform(features['DEP_TIME_BLK'])
)

# Dummy / one-hot encode the carrier; drop_first drops the first category's
# column. ORIGIN and DEST are currently neither encoded nor used.
df = pd.get_dummies(features[['OP_UNIQUE_CARRIER']], drop_first=True)

# Modeling frame: the selected features plus the carrier dummies, minus the
# raw carrier string column. Built without in-place mutation so re-running the
# cell always starts from `features`.
flight_features = pd.concat([features, df], axis=1).drop('OP_UNIQUE_CARRIER', axis=1)

In [0]:
# #correlation

# df = features[['QUARTER', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN',\
#                 'DEST', 'DEP_TIME_BLK', 'DEP_DELAY_NEW', 'TAXI_OUT', 'TAXI_IN',\
#                 'ARR_DELAY_NEW', 'AIR_TIME', 'DISTANCE', 'DISTANCE_GROUP']]
# df.corr()

In [0]:
# #one hot encoding 
# OP_UNIQUE_CARRIER = np.array(flight_df.OP_UNIQUE_CARRIER).reshape(-1, 1)
# ORIGIN = np.array(flight_df.ORIGIN).reshape(-1, 1)
# DEST = np.array(flight_df.DEST).reshape(-1, 1)

# onehoten = OneHotEncoder(sparse=False)

# OP_CARRIER_oh = onehoten.fit_transform(OP_UNIQUE_CARRIER)
# ORIGIN_oh = onehoten.fit_transform(ORIGIN)
# DEST_oh = onehoten.fit_transform(DEST)

# OP_CARRIER_oh = OP_CARRIER_oh[:, 1:]
# ORIGIN_oh = ORIGIN_oh[:, 1:]
# DEST_oh = DEST_oh[:, 1:]


### Modeling

In [0]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
#%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.python.data import Dataset

import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

tf.logging.set_verbosity(tf.logging.ERROR)

flight_features = flight_features.reindex(np.random.permutation(flight_features.index))

In [0]:
def preprocess_features(flight_features, target_column='ARR_DELAY_NEW'):
  """Return the model input features for the given flight frame.

  The label (`delay_or_not`) is computed directly from `ARR_DELAY_NEW`, so
  leaving that column among the inputs leaks the answer to the classifier.
  It is therefore removed here.

  Args:
    flight_features: DataFrame of flight records.
    target_column: Name of the column the label is derived from; it is
      excluded from the returned features. A frame that does not contain the
      column is returned with all of its columns.

  Returns:
    A new DataFrame with every column of `flight_features` except
    `target_column`. The input frame is not modified.
  """
  # drop() returns a new frame; errors='ignore' tolerates a missing column.
  processed_features = flight_features.drop(
      columns=[target_column], errors='ignore').copy()
  return processed_features

def preprocess_targets(california_housing_dataframe):
  """Build the binary delay label for the rows of the frame passed in.

  Args:
    california_housing_dataframe: DataFrame with an `ARR_DELAY_NEW` column.
      The parameter name is a leftover and is kept only so that existing
      keyword callers keep working; the frame holds flight records.

  Returns:
    A DataFrame with the same index as the input and a single integer column
    `delay_or_not`: 1 where `ARR_DELAY_NEW` is not equal to 0, otherwise 0.
  """
  arrival_delay = california_housing_dataframe['ARR_DELAY_NEW']

  output_targets = pd.DataFrame()
  # Vectorised equivalent of `1 if x != 0 else 0` applied per row.
  # NOTE(review): NaN != 0 is True, so rows with a missing arrival delay are
  # labelled 1 -- confirm that this is intended.
  output_targets["delay_or_not"] = (arrival_delay != 0).astype("int64")
  return output_targets

In [0]:
# Split the (already shuffled) frame: the first TRAINING_SIZE rows are used for
# training and every remaining row for validation. On the 707,103-row dataset
# this notebook was written against that is roughly a 70% / 30% split.
# The validation slice is derived from the same boundary instead of a second
# hard-coded row count, so the two sets stay disjoint and together cover every
# row even if the CSV changes size.
TRAINING_SIZE = 500000

# Labels are computed once for the whole frame and sliced at the same boundary
# as the features, which keeps examples and targets row-aligned.
all_targets = preprocess_targets(flight_features)

training_examples = preprocess_features(flight_features.iloc[:TRAINING_SIZE])
training_targets = all_targets.iloc[:TRAINING_SIZE]

validation_examples = preprocess_features(flight_features.iloc[TRAINING_SIZE:])
validation_targets = all_targets.iloc[TRAINING_SIZE:]

# Sanity checks: features and labels line up, and no row is lost or duplicated.
assert len(training_examples) == len(training_targets)
assert len(validation_examples) == len(validation_targets)
assert len(training_examples) + len(validation_examples) == len(flight_features)

# Double-check that we've done the right thing.
print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

In [0]:
def construct_feature_columns(input_features):
  """Create one numeric TensorFlow feature column per input feature.

  Args:
    input_features: Iterable of feature names. When a DataFrame is passed,
      iterating it yields its column labels.

  Returns:
    A set of `tf.feature_column.numeric_column` objects, one per name.
  """
  return {
      tf.feature_column.numeric_column(feature_name)
      for feature_name in input_features
  }
  

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Input function feeding (features, labels) batches to an estimator.

    Args:
      features: pandas DataFrame of input features.
      targets: Labels aligned row-for-row with `features`.
      batch_size: Number of examples per batch.
      shuffle: Whether to shuffle the examples.
      num_epochs: Number of passes over the data; None repeats indefinitely.

    Returns:
      A `(features, labels)` tuple for the next batch, where `features` is a
      dict keyed by column name.
    """
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, label) examples.
    ds = Dataset.from_tensor_slices((features, targets))

    # Shuffle individual examples BEFORE batching and repeating. Shuffling
    # after batch().repeat() only reorders whole, fixed batches and lets
    # batches from different epochs mix; shuffling first gives every batch a
    # fresh mix of examples and keeps epoch boundaries intact.
    # The buffer holds 10000 examples.
    if shuffle:
        ds = ds.shuffle(10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [0]:
def train_linear_classifier_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Train a linear classifier and plot its log loss over training periods.

  Training is split into 10 periods. After each period the log loss on the
  training and validation sets is computed, the training loss is printed, and
  at the end both loss curves are plotted.

  Args:
    learning_rate: Learning rate for the gradient descent optimizer.
    steps: Total number of training steps, divided evenly over the periods
      (any remainder is dropped; at least one step is run per period).
    batch_size: Batch size used by the training input function.
    training_examples: DataFrame of training features; its columns define the
      model's feature columns.
    training_targets: DataFrame with a `delay_or_not` column of training labels.
    validation_examples: DataFrame of validation features.
    validation_targets: DataFrame with a `delay_or_not` column of validation
      labels.

  Returns:
    The trained `tf.estimator.LinearClassifier`.
  """

  periods = 10
  # Use floor division: on Python 3 `steps / periods` is a float, and the
  # estimator's `steps` argument is a step count. max(1, ...) keeps the
  # estimator from being asked to train for zero steps when steps < periods.
  steps_per_period = max(1, int(steps // periods))

  # Create a linear classifier object; gradients are clipped to a norm of 5.0.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_classifier = tf.estimator.LinearClassifier(
      feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer
  )

  # Create input functions. The two prediction functions make a single,
  # unshuffled pass so predictions stay aligned with the target rows.
  training_input_fn = lambda: my_input_fn(training_examples,
                                          training_targets["delay_or_not"],
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                  training_targets["delay_or_not"],
                                                  num_epochs=1,
                                                  shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                    validation_targets["delay_or_not"],
                                                    num_epochs=1,
                                                    shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("LogLoss (on training data):")
  training_log_losses = []
  validation_log_losses = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_classifier.train(
        input_fn=training_input_fn,
        steps=steps_per_period
    )
    # Take a break and compute predictions.
    training_probabilities = linear_classifier.predict(input_fn=predict_training_input_fn)
    training_probabilities = np.array([item['probabilities'] for item in training_probabilities])

    validation_probabilities = linear_classifier.predict(input_fn=predict_validation_input_fn)
    validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])

    training_log_loss = metrics.log_loss(training_targets, training_probabilities)
    validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_log_loss))
    # Add the loss metrics from this period to our list.
    training_log_losses.append(training_log_loss)
    validation_log_losses.append(validation_log_loss)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("LogLoss")
  plt.xlabel("Periods")
  plt.title("LogLoss vs. Periods")
  plt.plot(training_log_losses, label="training")
  plt.plot(validation_log_losses, label="validation")
  plt.legend()
  # Lay out the figure once everything has been drawn.
  plt.tight_layout()

  return linear_classifier

In [0]:
# Train the classifier on the training split; the validation split is only
# used for the per-period loss curve.
linear_classifier = train_linear_classifier_model(
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets,
    learning_rate=0.001,
    steps=500,
    batch_size=20)


def predict_validation_input_fn():
  """Feed the validation split once, in order, for evaluation."""
  return my_input_fn(validation_examples,
                     validation_targets["delay_or_not"],
                     shuffle=False,
                     num_epochs=1)


# Evaluate the trained model on the validation split.
evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)

validation_auc = evaluation_metrics['auc']
validation_accuracy = evaluation_metrics['accuracy']
print("AUC on the validation set: %0.2f" % validation_auc)
print("Accuracy on the validation set: %0.2f" % validation_accuracy)