<a href="https://colab.research.google.com/github/davro76/final-project/blob/main/delay_capstone_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries and Fly Data

In [11]:
# import required librairies and dependencies
import pandas as pd
from os import listdir
from os import getcwd
import datetime as dt
import plotly.express as px
import plotly.graph_objs as go
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns
import glob
pd.set_option('display.max_columns',100)
import plotly.io as pio
pio.renderers.default = "colab"
%matplotlib inline

In [12]:
# Make Plotly work in our Jupyter Notebook
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Use Plotly locally
cf.go_offline()

In [13]:
# Set the Environment
# Ignore Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import pandas as pd

In [16]:
# string variable "file_path" which contains the file path to the csv file
file_path = '/content/drive/MyDrive/airline_final'

# using the "pd.read_csv()" method to read the contents of the csv file located at "file_path" and stores it in the DataFrame "df"
airline_final = pd.read_csv(file_path)

# using the "drop()" method to remove the column with the label "Unnamed: 0"
airline_final = airline_final.drop('Unnamed: 0', axis=1)


# Display the first two rows of the df DataFrame
airline_final.head(2)


Unnamed: 0,mkt_ccode,dep_airport,cancel_code,arr_airport,day_week_name,month__name,route,mkt_fl_no,date,dep_delay_time_actual,arr_delay_time_actual,mins_late_delay_code_e_carrier,mins_late_delay_code_f_weather,mins_late_delay_code_g_nas,mins_late_delay_code_h_security,mins_late_delay_code_i_late_arr_flight,delay
0,DL,BNA,0,MSP,Mon,Jan,BNA-MSP,3975,2022-01-24,36,50,36,0,14,0,0,1
1,DL,STL,0,MSP,Sun,Jan,STL-MSP,3714,2022-01-23,32,42,0,0,10,0,32,1


# Data Vizualization 

### Which routes are most prone to delays?

In [20]:
import pandas as pd
import plotly.express as px

# Use your dataset as a DataFrame
df = airline_final

# Calculate the mean delay for each route
mean_route_delays = df.groupby('route')['arr_delay_time_actual'].mean().reset_index().round(3)

# Sort the routes by mean delay in descending order and select the top 10
top_delayed_routes = mean_route_delays.sort_values(by='arr_delay_time_actual', ascending=False).head(10)

# Create a bar chart using Plotly Express
fig = px.bar(top_delayed_routes, x='route', y='arr_delay_time_actual',color = 'route',title='Top 10 Routes with Highest Mean Delays')
fig.update_layout(xaxis_title='Route', yaxis_title='Mean Delay (minutes)')
fig.show()


### What are the primary causes of departure delay time?

In [18]:
import pandas as pd
import plotly.express as px

# Use your dataset as a DataFrame
df = airline_final

def generate_pie_chart(day):
    # Filter the dataset based on the given day
    filtered_df = df[df['day_week_name'] == day]

    # Calculate the mean delays for each cause and round to three decimal places
    causes = ['mins_late_delay_code_e_carrier', 'mins_late_delay_code_f_weather', 'mins_late_delay_code_g_nas', 'mins_late_delay_code_h_security', 'mins_late_delay_code_i_late_arr_flight']
    mean_delays = filtered_df[causes].mean().round(3)

    # Create a new DataFrame for the pie chart
    delay_data = pd.DataFrame({'cause': causes, 'mean_delays': mean_delays}).sort_values(by='mean_delays', ascending=False)

    # Create a pie chart using Plotly Express
    fig = px.pie(delay_data, values='mean_delays', names='cause', title=f'Primary Causes of Flight Delays (Mean Delays) on {day}')
    fig.show()

# Generate the pie chart for a specific day, e.g., 'Mon' for Monday
generate_pie_chart('Tue')


### Airports with the Highest Average Delays

In [19]:
import pandas as pd
import plotly.express as px

# Read the CSV file
df = pd.read_csv('airline_geo_df.csv')

# Add the 'average_delay' column to the DataFrame
df['average_delay'] = airline_final['arr_delay_time_actual']

# Sort the DataFrame by the 'average_delay' column
df1 = df.sort_values(by='average_delay', ascending=False).head(25)

# Create the map
fig = px.scatter_mapbox(df1,
                        lon=df1['longitude'],
                        lat=df1['latitude'],
                        zoom=1,
                        color=df1['dept_airport'],
                        size=df1['average_delay'],
                        title='Top 25 Airports with the Highest Average Delays')
fig.update_layout(mapbox_style='open-street-map')
fig.update_layout(margin={'r': 0, 't': 50, 'l': 0, 'b': 10})
fig.show()


FileNotFoundError: ignored

### Arrival Delay Time by Airline Across the Months

In [21]:
import pandas as pd
import plotly.express as px

# Use the provided dataset as a DataFrame
df = airline_final

# Convert the 'date' column to a datetime object
df['date'] = pd.to_datetime(df['date'])

# Group the dataset by month and mkt_ccode, and calculate the mean of arr_delay_time_actual
monthly_mean_delays = df.groupby([df['date'].dt.month_name(), 'mkt_ccode'])['arr_delay_time_actual'].mean().reset_index()

# Rename the columns for clarity
monthly_mean_delays.columns = ['month', 'mkt_ccode', 'mean_arr_delay']

# Create a custom mapping for the month names
month_order = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
               'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# Sort the DataFrame according to the custom month mapping
monthly_mean_delays['month_order'] = monthly_mean_delays['month'].map(month_order)
monthly_mean_delays = monthly_mean_delays.sort_values(['month_order', 'mkt_ccode']).drop('month_order', axis=1)

# Define a custom color sequence
color_sequence = px.colors.qualitative.Pastel

# Create a line chart using Plotly Express with the custom color sequence
fig = px.line(monthly_mean_delays, x='month', y='mean_arr_delay', color='mkt_ccode', title='Arrival Delay Time by Month and Airline', color_discrete_sequence=color_sequence)
fig.update_layout(xaxis_title='Month', yaxis_title='Mean Arrival Delay Time (minutes)')
fig.show()


# Machine Learning Model

In [22]:
airline_final.columns

Index(['mkt_ccode', 'dep_airport', 'cancel_code', 'arr_airport',
       'day_week_name', 'month__name', 'route', 'mkt_fl_no', 'date',
       'dep_delay_time_actual', 'arr_delay_time_actual',
       'mins_late_delay_code_e_carrier', 'mins_late_delay_code_f_weather',
       'mins_late_delay_code_g_nas', 'mins_late_delay_code_h_security',
       'mins_late_delay_code_i_late_arr_flight', 'delay'],
      dtype='object')

In [26]:
features_selection= ['mkt_ccode', 'dep_airport','day_week_name', 'month__name','mkt_fl_no','delay']
NN_data = airline_final[features_selection]
NN_data.head(2)

Unnamed: 0,mkt_ccode,dep_airport,day_week_name,month__name,mkt_fl_no,delay
0,DL,BNA,Mon,Jan,3975,1
1,DL,STL,Sun,Jan,3714,1


In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Load the dataset
data = NN_data

df = pd.DataFrame(data)

# Preprocess the data
le = LabelEncoder()
df['mkt_ccode'] = le.fit_transform(df['mkt_ccode'])
df['dep_airport'] = le.fit_transform(df['dep_airport'])
df['month__name'] = le.fit_transform(df['month__name'])
df['day_week_name'] = le.fit_transform(df['day_week_name'])

# Split the data into training and testing sets
X = df.drop('delay', axis=1)
y = df['delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test accuracy: ", test_acc)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy:  0.6482509970664978
