In [1]:
import pandas as pd
import requests
from io import StringIO

url = "https://group1-project4-flightdata2023.s3.us-east-2.amazonaws.com/2022.csv"

response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Read the CSV content into a Pandas DataFrame
    pandas_df = pd.read_csv(StringIO(response.text))
    # Now 'pandas_df' is a Pandas DataFrame
    print(pandas_df.head())
else:
    print(f"Failed to fetch the CSV file. Status Code: {response.status_code}")


      FL_DATE AIRLINE_CODE  DOT_CODE  FL_NUMBER ORIGIN     ORIGIN_CITY DEST  \
0  2022-05-03           OH     20397       5433    DCA  Washington, DC  ALB   
1  2022-05-04           OH     20397       5433    DCA  Washington, DC  ALB   
2  2022-05-01           OH     20397       5434    CLT   Charlotte, NC  TLH   
3  2022-05-02           OH     20397       5434    CLT   Charlotte, NC  TLH   
4  2022-05-03           OH     20397       5434    CLT   Charlotte, NC  TLH   

         DEST_CITY  CRS_DEP_TIME  DEP_TIME  ...  AIR_TIME  DISTANCE  \
0       Albany, NY          1010    1018.0  ...      61.0     318.0   
1       Albany, NY          1010    1002.0  ...      61.0     318.0   
2  Tallahassee, FL          2046    2039.0  ...      57.0     386.0   
3  Tallahassee, FL          2046    2036.0  ...      60.0     386.0   
4  Tallahassee, FL          2046    2035.0  ...      63.0     386.0   

   DELAY_DUE_CARRIER  DELAY_DUE_WEATHER  DELAY_DUE_NAS  DELAY_DUE_SECURITY  \
0                NaN

In [16]:
pandas_df.head()

Unnamed: 0,FL_DATE,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,CRS_ARR_TIME,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,FL_YEAR,FL_MONTH,FL_DAY
0,2022-05-03,OH,20397,5433,DCA,"Washington, DC",ALB,"Albany, NY",1010,1152,0.0,Z,0.0,318.0,2022,5,3
1,2022-05-04,OH,20397,5433,DCA,"Washington, DC",ALB,"Albany, NY",1010,1152,0.0,Z,0.0,318.0,2022,5,4
2,2022-05-01,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,1
3,2022-05-02,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,2
4,2022-05-03,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,3


In [3]:
pandas_df.isnull().sum()

FL_DATE                          0
AIRLINE_CODE                     0
DOT_CODE                         0
FL_NUMBER                        0
ORIGIN                           0
ORIGIN_CITY                      0
DEST                             0
DEST_CITY                        0
CRS_DEP_TIME                     0
DEP_TIME                    177269
DEP_DELAY                   177348
TAXI_OUT                    180522
WHEELS_OFF                  180522
WHEELS_ON                   182955
TAXI_IN                     182955
CRS_ARR_TIME                     0
ARR_TIME                    182952
ARR_DELAY                   197113
CANCELLED                        0
CANCELLATION_CODE                0
DIVERTED                         0
CRS_ELAPSED_TIME                 0
ELAPSED_TIME                197113
AIR_TIME                    197113
DISTANCE                         0
DELAY_DUE_CARRIER          5352327
DELAY_DUE_WEATHER          5352327
DELAY_DUE_NAS              5352327
DELAY_DUE_SECURITY  

In [4]:
columns_to_drop = ["DEP_TIME", "DEP_DELAY", "TAXI_OUT", "CRS_ELAPSED_TIME", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN", "ARR_TIME", "ARR_DELAY", "ELAPSED_TIME", "AIR_TIME","DELAY_DUE_CARRIER", "DELAY_DUE_WEATHER", "DELAY_DUE_NAS", "DELAY_DUE_SECURITY", "DELAY_DUE_LATE_AIRCRAFT"]
pandas_df = pandas_df.drop(columns=columns_to_drop)

In [5]:
pandas_df.head()

Unnamed: 0,FL_DATE,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,CRS_ARR_TIME,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,FL_YEAR,FL_MONTH,FL_DAY
0,2022-05-03,OH,20397,5433,DCA,"Washington, DC",ALB,"Albany, NY",1010,1152,0.0,Z,0.0,318.0,2022,5,3
1,2022-05-04,OH,20397,5433,DCA,"Washington, DC",ALB,"Albany, NY",1010,1152,0.0,Z,0.0,318.0,2022,5,4
2,2022-05-01,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,1
3,2022-05-02,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,2
4,2022-05-03,OH,20397,5434,CLT,"Charlotte, NC",TLH,"Tallahassee, FL",2046,2227,0.0,Z,0.0,386.0,2022,5,3


In [6]:
pandas_df.isnull().sum()

FL_DATE              0
AIRLINE_CODE         0
DOT_CODE             0
FL_NUMBER            0
ORIGIN               0
ORIGIN_CITY          0
DEST                 0
DEST_CITY            0
CRS_DEP_TIME         0
CRS_ARR_TIME         0
CANCELLED            0
CANCELLATION_CODE    0
DIVERTED             0
DISTANCE             0
FL_YEAR              0
FL_MONTH             0
FL_DAY               0
dtype: int64

In [7]:
# Split the data into X (features) and y (target)
# The y variable should focus on the target column
y = pandas_df['CANCELLED']

# The X variable should include all features except the target
X = pandas_df.drop(columns=["CANCELLATION_CODE", "AIRLINE_CODE", "ORIGIN", "ORIGIN_CITY", "DEST", "DEST_CITY", "FL_DATE"])


In [8]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
scaler = StandardScaler()
x_scaler = scaler.fit(X_train)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9, max_iter=1000)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

In [11]:
#Generate training predictions
training_predictions = lr_model.predict(X_train_scaled)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test_scaled)

In [12]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[4911173       0]
 [      0  135670]]


In [13]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[1636666       0]
 [      0   45616]]


In [14]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   4911173
         1.0       1.00      1.00      1.00    135670

    accuracy                           1.00   5046843
   macro avg       1.00      1.00      1.00   5046843
weighted avg       1.00      1.00      1.00   5046843



In [15]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   1636666
         1.0       1.00      1.00      1.00     45616

    accuracy                           1.00   1682282
   macro avg       1.00      1.00      1.00   1682282
weighted avg       1.00      1.00      1.00   1682282

