In [21]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('./data/flights.csv')

df.fillna(0, inplace=True)

In [23]:
# Create a dictionary for OriginAirportID to OriginAirportName
origin_airport_map = df.set_index('OriginAirportID')['OriginAirportName'].to_dict()

# Create a dictionary for DestAirportID to DestAirportName
dest_airport_map = df.set_index('DestAirportID')['DestAirportName'].to_dict()

# Combine both dictionaries into one
airport_map = {**origin_airport_map, **dest_airport_map}

print(airport_map)

In [24]:
# Remove duplicate records from airport_map
unique_airport_map = {k: v for k, v in airport_map.items()}

print(unique_airport_map)

In [25]:
import csv

# Define the file path
file_path = './data/airports.csv'

# Write the unique_airport_map dictionary to a CSV file
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['AirportID', 'AirportName'])
    for airport_id, airport_name in unique_airport_map.items():
        writer.writerow([airport_id, airport_name])

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib

# Read the CSV file
df = pd.read_csv('./data/flights.csv')

df.fillna(0, inplace=True)

# Create a new column 'Delayed' which is 1 if either DepDel15 or ArrDel15 is greater than 0, else 0
df['Delayed'] = ((df['DepDel15'] > 0) | (df['ArrDel15'] > 0)).astype(int)

# Select features and target variable
X = df[['DayOfWeek', 'OriginAirportID', 'DestAirportID']]
y = df['Delayed']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, columns=['OriginAirportID', 'DestAirportID'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Function to predict delay probability for a given day and airport pair
def predict_delay(day_of_week, origin, dest):
    input_data = pd.DataFrame({'DayOfWeek': [day_of_week], 'Origin': [origin], 'Dest': [dest]})
    input_data = pd.get_dummies(input_data, columns=['Origin', 'Dest'], drop_first=True)
    input_data = input_data.reindex(columns=X.columns, fill_value=0)
    probability = model.predict_proba(input_data)[0][1]
    return probability

# Example usage
day_of_week = 7  # Wednesday
origin = 12478
dest = 14771
delay_probability = predict_delay(day_of_week, origin, dest)
print(f'Probability of delay: {delay_probability:.2f}')

# Save the model to a file
joblib.dump(model, 'model/delayedFlights.pkl')
joblib.dump(X, 'model/delayedFlightsColumns.pkl')



Accuracy: 0.75
Probability of delay: 0.24


['model/delayedFlightsColumns.pkl']