In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

# Specify the file path of the flight delay dataset
dataset_file = "flights.csv"

# Load the flight delay dataset
data = pd.read_csv(dataset_file)

# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values
data = data.dropna()

# Shuffle the dataset
data = shuffle(data, random_state=42)

# Select relevant features for prediction
features = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
            'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DISTANCE']

# Create feature matrix X and target variable y
X = data[features]
y = data['DEPARTURE_DELAY']

# Convert categorical variables into dummy/indicator variables
X = pd.get_dummies(X)

# Set the minimum threshold for the number of samples required
min_samples_threshold = 100

# Check if the dataset has enough samples for training
if len(X) >= min_samples_threshold:

    # Initialize the Random Forest classifier
    rf = RandomForestClassifier(random_state=42)

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    # Perform grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)

    # Get the best model with optimized hyperparameters
    best_rf = grid_search.best_estimator_

    # Make predictions on the entire dataset
    y_pred = best_rf.predict(X)

    # Generate classification report
    report = classification_report(y, y_pred)
    print("Classification Report:")
    print(report)

    # Get feature importances
    importances = pd.DataFrame({'Feature': X.columns, 'Importance': best_rf.feature_importances_})
    importances = importances.sort_values('Importance', ascending=False).reset_index(drop=True)

    # Display the top 5 most important features
    print("Top 5 Most Important Features:")
    print(importances.head(5))

    # Example: Generate predictions on new data
    new_data = pd.DataFrame({
        'MONTH': [7],
        'DAY': [15],
        'DAY_OF_WEEK': [4],
        'AIRLINE': ['AA'],
        'ORIGIN_AIRPORT': ['JFK'],
        'DESTINATION_AIRPORT': ['LAX'],
        'SCHEDULED_DEPARTURE': [1230],
        'SCHEDULED_ARRIVAL': [1600],
        'DISTANCE': [2475]
    })

    # Convert categorical variables into dummy/indicator variables for the new data
    new_data_encoded = pd.get_dummies(new_data)

    # Make predictions on the new data
    new_predictions = best_rf.predict(new_data_encoded)
    print("Predictions on New Data:")
    print(new_predictions)

else:
    print("Error: The dataset does not have enough samples for training.")

  data = pd.read_csv(dataset_file)


YEAR                         0
MONTH                        0
DAY                          0
DAY_OF_WEEK                  0
AIRLINE                      0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
TAXI_OUT                 89047
WHEELS_OFF               89047
SCHEDULED_TIME               6
ELAPSED_TIME            105071
AIR_TIME                105071
DISTANCE                     0
WHEELS_ON                92513
TAXI_IN                  92513
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             92513
ARRIVAL_DELAY           105071
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON    5729195
AIR_SYSTEM_DELAY       4755640
SECURITY_DELAY         4755640
AIRLINE_DELAY          4755640
LATE_AIRCRAFT_DELAY    4755640
WEATHER_DELAY          4755640
dtype: int64
Error: The dataset does no