In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import time
from sklearn.model_selection import GridSearchCV
import pickle

In [None]:
# Load the flight data from a CSV file in the working directory into `df`.
df = pd.read_csv('us-flight-cleaned-data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

### We are going to predict whether the flight is likely to be delayed or not, i.e., a boolean response.

### Q.) What are the possible models that could be used for this?

- Logistic regression: Since our target variable is binary classification, the first model that comes to mind is logistic regression.
- Linear/Polynomial regression: To see whether this model can be used or not, we need to plot graphs between the independent and target variables to see the pattern that is being followed.
- Random Forest
- Gradient Boosting
- Support Vector Machines (SVM): SVMs can be used for binary classification, so we can try this. SVMs are useful in handling high-dimensional data, but our data is not high dimensional.
- K-Nearest Neighbors (KNN): KNN is useful for small datasets, so not sure how it'll behave here, but we can use it for classification (and regression as well).
- Time Series Models: For predicting flight delays based on historical data, time series models like ARIMA (AutoRegressive Integrated Moving Average) or SARIMA (Seasonal ARIMA) can be useful.
- Neural Networks: Deep learning models, particularly recurrent neural networks (RNNs) and long short-term memory (LSTM) networks, can capture sequential patterns in flight data, which can be useful for predicting delays.

In [None]:
# List the available columns before choosing features.
df.columns

In [None]:
# Disabled exploratory step: scatter plots of selected columns against `Delay`.
# Kept for reference; uncomment to inspect the relationships visually.
# cols = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'AirTime']
# for col in cols:
#     sns.scatterplot(y=df[col], x=df.Delay)
#     plt.show()

### Let's convert categorical columns to numeric values (some of the models require all data to be in numerical form, so to keep the process consistent, we'll convert all data to numerical form).

In [None]:
# Look at the rows again to spot the categorical columns that need encoding.
df.head()

In [None]:
# Number of rows per origin airport (`Origin`).
df.Origin.value_counts()

In [None]:
# Number of rows per origin city.
df.OriginCityName.value_counts()

In [None]:
# Number of rows per origin state.
df.OriginStateName.value_counts()

In [None]:
# Number of rows per destination airport (`Dest`).
df.Dest.value_counts()

In [None]:
# Number of rows per destination city.
df.DestCityName.value_counts()

In [None]:
# Number of rows per destination state.
df.DestStateName.value_counts()

In [None]:
# Number of rows per airline.
df.AirlineName.value_counts()

### As seen in EDA, reporting airlines, origin airports, origin cities, origin states, destination airports, destination cities and destination states, all follow a pattern in relation to the number of delayed flights, therefore we can use target encoding for all these columns.

In [None]:
def _delay_rate_by(column):
    """Return {category: mean of `IsDelayed`} for the given categorical column of `df`."""
    return df.groupby(column)['IsDelayed'].mean().to_dict()


# Target-encoding lookup tables: each category is mapped to the mean of `IsDelayed`
# over the rows that carry it.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test splits done in later cells, so test rows contribute to the encoding.
AirlineNameNumericalMap = _delay_rate_by('AirlineName')

OriginAirportNumericalMap = _delay_rate_by('Origin')
OriginCityNumericalMap = _delay_rate_by('OriginCityName')
OriginStateNumericalMap = _delay_rate_by('OriginStateName')

DestAirportNumericalMap = _delay_rate_by('Dest')
DestCityNumericalMap = _delay_rate_by('DestCityName')
DestStateNumericalMap = _delay_rate_by('DestStateName')

In [None]:
# Inspect one of the lookup tables: origin airport -> mean of `IsDelayed`.
OriginAirportNumericalMap

In [None]:
# (new encoded column, source categorical column, category -> delay-rate table),
# listed in the order the encoded columns are appended to `df`.
_encoding_plan = [
    ('AirlineNameEncoded', 'AirlineName', AirlineNameNumericalMap),
    ('OriginAirportEncoded', 'Origin', OriginAirportNumericalMap),
    ('OriginCityEncoded', 'OriginCityName', OriginCityNumericalMap),
    ('OriginStateEncoded', 'OriginStateName', OriginStateNumericalMap),
    ('DestAirportEncoded', 'Dest', DestAirportNumericalMap),
    ('DestCityEncoded', 'DestCityName', DestCityNumericalMap),
    ('DestStateEncoded', 'DestStateName', DestStateNumericalMap),
]

# Replace every category by its looked-up value, writing the result to a new column.
for _encoded_col, _source_col, _rate_table in _encoding_plan:
    df[_encoded_col] = df[_source_col].map(_rate_table)

In [None]:
# Check that the new *Encoded columns were added.
df.head()

Since Reporting_Airline, AirlineName, Origin, OriginCityName, OriginStateName, Dest, DestCityName and DestStateName are not required now, we'll drop these.

In [None]:
# Remove the original categorical columns from `df` in place.
# Re-running this cell raises KeyError because the columns are already gone.
df.drop(
    labels=[
        'Reporting_Airline',
        'AirlineName',
        'Origin',
        'OriginCityName',
        'OriginStateName',
        'Dest',
        'DestCityName',
        'DestStateName',
    ],
    axis='columns',
    inplace=True,
)

In [None]:
# Confirm the categorical columns are gone.
df.head()

In [None]:
# Disabled: optionally persist the encoded frame to 'model_ready_data.csv' for reuse.
# df.to_csv('model_ready_data.csv', index = False)

### All the columns have been converted into numerical data

In [None]:
# Columns remaining after encoding and dropping.
df.columns

## Getting features and target data

In [None]:
# Two targets are kept: the delay column (used by the regression models) and the
# binary IsDelayed flag (used by the classifiers). Every other column is a feature.
y_delay_time, y_is_delayed = df['Delay'], df['IsDelayed']
X_not_scaled = df.drop(['Delay', 'IsDelayed'], axis=1)

## Creating min-max scaled dataframe of features

In [None]:
# Feature frame shared by both scalers (same columns as the unscaled feature set).
X_to_be_scaled = df.drop(['Delay', 'IsDelayed'], axis=1)

# NOTE(review): the scaler is fitted on all rows, before the train/test splits
# performed in later cells.
min_max_scaler = MinMaxScaler().fit(X_to_be_scaled)
X_min_max_scaled = min_max_scaler.transform(X_to_be_scaled)

## Creating standardly scaled dataframe of features

In [None]:
# Z-score scaling of the same feature frame used for min-max scaling.
# NOTE(review): fitted on all rows, before the train/test splits in later cells.
standard_scaler = StandardScaler().fit(X_to_be_scaled)
X_standard_scaled = standard_scaler.transform(X_to_be_scaled)

## Defining helper functions

In [None]:
def print_model_performance(y_true, y_pred):
    """Print the accuracy of `y_pred` measured against `y_true`.

    Both arguments are forwarded unchanged to `accuracy_score`; nothing is returned.
    """
    print("Accuracy:", accuracy_score(y_true, y_pred))

In [None]:
def train_model(model, param_grid, X_train, X_test, y_train, y_test, pickile_file_name):
    """Grid-search `model`, report timing and test accuracy, and pickle the search.

    Parameters
    ----------
    model
        Estimator handed to `GridSearchCV` as `estimator`.
    param_grid
        Parameter grid handed to `GridSearchCV` as `param_grid`.
    X_train, y_train
        Data the grid search is fitted on (5-fold cross-validation, accuracy scoring).
    X_test, y_test
        Held-out data scored once with the fitted search.
    pickile_file_name
        File name stem; the fitted search object is written to '<stem>.pkl'.

    Returns
    -------
    None. The elapsed fit time and the test accuracy are printed.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

    start_time = time.time()
    grid_search.fit(X_train, y_train)
    end_time = time.time()

    # Compute the elapsed time locally. It used to be read from an undefined name,
    # which raised NameError on a fresh kernel or printed a stale value left by
    # an earlier cell.
    execution_time = end_time - start_time
    print("Training time: {:.6f} seconds".format(execution_time))

    accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", accuracy)

    # Persist the whole fitted search object, not only the best estimator.
    with open(f'{pickile_file_name}.pkl', 'wb') as file:
        pickle.dump(grid_search, file)

## Linear Regression

Since linear regression is used to predict continuous values, we'll be using the delay (gives the time in minutes by which a flight is delayed) column as the target variable and we will use it to make the prediction for the IsDelayed column based on a threshold value.

### Without scaling

In [None]:
# Linear regression on the unscaled features, with the delay column as target.
linear_reg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_delay_time,
    test_size=0.2,
    random_state=42,
)

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
linear_reg.fit(X_train, y_train)
predictions = linear_reg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# A positive delay, actual or predicted, counts as "delayed".
y_true = y_test > 0
y_pred = predictions > 0

print_model_performance(y_true, y_pred)

### With MinMax Scaling

In [None]:
# Linear regression on the min-max scaled features, with the delay column as target.
linear_reg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    X_min_max_scaled,
    y_delay_time,
    test_size=0.2,
    random_state=42,
)

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
linear_reg.fit(X_train, y_train)
predictions = linear_reg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# A positive delay, actual or predicted, counts as "delayed".
y_true = y_test > 0
y_pred = predictions > 0

print_model_performance(y_true, y_pred)

### Standardization (Z-score normalization)

In [None]:
# Linear regression on the standardized features, with the delay column as target.
linear_reg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    X_standard_scaled,
    y_delay_time,
    test_size=0.2,
    random_state=42,
)

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
linear_reg.fit(X_train, y_train)
predictions = linear_reg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# A positive delay, actual or predicted, counts as "delayed".
y_true = y_test > 0
y_pred = predictions > 0

print_model_performance(y_true, y_pred)

### Hyperparameter tuning

Plain linear regression has no hyperparameters to tune. We could use regularization techniques (Ridge/Lasso) to improve model performance, but the model is already performing so badly that there is no point investing much time in it. There are better models available that we can use.

## Polynomial Regression

### Without scaling

In [None]:
# Degree-2 polynomial regression on the unscaled features, with the delay column as target.
X_train, X_test, y_train, y_test = train_test_split(X_not_scaled, y_delay_time, test_size=0.2, random_state=42)

degree = 2
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
polyreg.fit(X_train, y_train)
poly_predictions = polyreg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# Threshold this model's own output. The cell used to threshold `predictions`,
# a leftover from the linear regression cells, so the accuracy shown was not
# the polynomial model's.
y_pred = poly_predictions > 0
y_true = y_test > 0

print_model_performance(y_true, y_pred)

### With MinMax Scaling

In [None]:
# Degree-2 polynomial regression on the min-max scaled features, with the delay column as target.
X_train, X_test, y_train, y_test = train_test_split(X_min_max_scaled, y_delay_time, test_size=0.2, random_state=42)

degree = 2
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
polyreg.fit(X_train, y_train)
poly_predictions = polyreg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# Threshold this model's own output. The cell used to threshold `predictions`,
# a leftover from the linear regression cells, so the accuracy shown was not
# the polynomial model's.
y_pred = poly_predictions > 0
y_true = y_test > 0

print_model_performance(y_true, y_pred)

### Standardization (Z-score normalization)

In [None]:
# Degree-2 polynomial regression on the standardized features, with the delay column as target.
X_train, X_test, y_train, y_test = train_test_split(X_standard_scaled, y_delay_time, test_size=0.2, random_state=42)

degree = 2
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# The timed span covers fitting plus prediction on the test split.
start_time = time.time()
polyreg.fit(X_train, y_train)
poly_predictions = polyreg.predict(X_test)
end_time = time.time()

execution_time = end_time - start_time
print("Training time: {:.6f} seconds".format(execution_time))

# Threshold this model's own output. The cell used to threshold `predictions`,
# a leftover from the linear regression cells, so the accuracy shown was not
# the polynomial model's.
y_pred = poly_predictions > 0
y_true = y_test > 0

print_model_performance(y_true, y_pred)

### Hyperparameter tuning

Similar to linear regression, polynomial regression is also performing quite badly, so we'll be focussing more on other models. Again, we can try to tune the hyperparameter - degree of polynomial or use regularization techniques, but the accuracy is already so low, it's a good idea to move on to other models.

## Logistic Regression

### Hyperparameter tuning

We'll be using grid search for hyperparameter tuning of models. The inbuilt method GridSearchCV() of sklearn also performs cross-validation internally to get the best model performance, hence we won't be doing cross validation explicitly.

In [None]:
# Logistic regression search space, given as a list of sub-grids so that only
# valid penalty/solver pairs are tried. 'newton-cg', 'lbfgs' and 'sag' support
# only the 'l2' penalty here; 'liblinear' and 'saga' support both 'l1' and 'l2'.
# A single Cartesian grid would pair 'l1' with the first three solvers, and every
# such fit would fail and waste cross-validation time.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'sag'],
    },
    {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
    },
]

Hyperparameters' meaning:
- penalty: Type of regularization used in the model ('l1' (Lasso regression), 'l2' (Ridge regression), 'none' (no regularization))
- C: Inverse of regularization strength. Smaller values specify stronger regularization. Regularization helps prevent overfitting by penalizing large coefficients.
- solver: Algorithm to use in the optimization problem. The choice of solver can impact the convergence and speed of the optimization.

### Without scaling

In [None]:
# Grid-search logistic regression on the unscaled features; the fitted search is
# pickled to 'logistic_regression_no_scaling.pkl'.
pickile_file_name = 'logistic_regression_no_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=LogisticRegression(),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

### With MinMax Scaling

In [None]:
# Grid-search logistic regression on the min-max scaled features; the fitted
# search is pickled to 'logistic_regression_min_max_scaling.pkl'.
pickile_file_name = 'logistic_regression_min_max_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_min_max_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=LogisticRegression(),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

### Standardization (Z-score normalization)

In [None]:
# Grid-search logistic regression on the standardized features; the fitted
# search is pickled to 'logistic_regression_standard_scaling.pkl'.
pickile_file_name = 'logistic_regression_standard_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_standard_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=LogisticRegression(),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

## Random Forest

### Hyperparameter tuning

The random forest model uses decision trees at its base, and since it's not a distance-based model, scaling should not impact the model performance; hence we measure this model's performance without scaling only.

Hyperparameters' meaning:
- n_estimators: The number of trees in the forest.
- max_depth: The maximum depth of each tree in the forest. Deeper trees can model more complex patterns in the data, but they are more likely to overfit.
- min_samples_split: The minimum number of samples required to split an internal node. It specifies the smallest number of samples a node can have to be split further.
- min_samples_leaf: The minimum number of samples required to be at a leaf node. It specifies the smallest number of samples a leaf node can have.

In [None]:
# Random forest search space: 3 * 4 * 3 * 3 = 108 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Grid-search a random forest on the unscaled features; the fitted search is
# pickled to 'random_forest.pkl'.
pickile_file_name = 'random_forest'
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

## Gradient Boosting

Just like random forest, the gradient boosting model also uses decision trees at its base, and since it's not a distance-based model, scaling should not impact the model performance; hence we measure this model's performance without scaling only.

### Hyperparameter tuning

In [None]:
# Gradient boosting search space: 3 * 3 * 3 * 3 * 2 = 162 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2],
)

Hyperparameters' meaning:
- n_estimators: The number of boosting stages to be run. This is the number of trees added to the model.
- learning_rate: Shrinks the contribution of each tree.
- max_depth: The maximum depth of the individual trees.
- min_samples_split: The minimum number of samples required to split an internal node.
- min_samples_leaf: The minimum number of samples required to be at a leaf node.

In [None]:
# Grid-search gradient boosting on the unscaled features; the fitted search is
# pickled to 'gradient_boosting.pkl'.
pickile_file_name = 'gradient_boosting'
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

## Support Vector Machines

### Hyperparameter tuning

In [None]:
# SVM search space, given as a list of sub-grids. `gamma` is a kernel coefficient
# for the 'rbf' and 'poly' kernels and is not used by the 'linear' kernel, so
# crossing it with 'linear' would fit every linear candidate four times.
# This keeps the same distinct models: 3 linear + 24 rbf/poly candidates instead of 36.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1, 1],
    },
]

Hyperparameters' meaning:
- C: Regularization parameter. It trades off correct classification of training examples against maximization of the decision function's margin. Smaller C encourages a larger margin and a simpler decision function, but may misclassify some points. Larger C penalizes classification mistakes and aims for a more complex decision function that fits the training data better.
- kernel: Specifies the kernel type used in the algorithm. Common choices include:
    - 'linear': Linear kernel (works well for linearly separable data).
    - 'poly': Polynomial kernel.
    - 'rbf' (Radial basis function): Gaussian kernel.
    - 'sigmoid': Sigmoid kernel.
- gamma: Kernel coefficient for 'poly', 'rbf', and 'sigmoid'. Higher values of gamma make the model fit the training data more precisely, potentially leading to overfitting.

### Without scaling

In [None]:
# Grid-search an SVM on the unscaled features; the fitted search is pickled to
# 'svm_no_scaling.pkl'.
pickile_file_name = 'svm_no_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=SVC(random_state=42),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

### With MinMax Scaling

In [None]:
# Grid-search an SVM on the min-max scaled features; the fitted search is pickled
# to 'svm_min_max_scaling.pkl'.
pickile_file_name = 'svm_min_max_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_min_max_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=SVC(random_state=42),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

### Standardization (Z-score normalization)

In [None]:
# Grid-search an SVM on the standardized features; the fitted search is pickled
# to 'svm_standard_scaling.pkl'.
pickile_file_name = 'svm_standard_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_standard_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=SVC(random_state=42),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

## K-Nearest Neighbors (KNN)

### Hyperparameter tuning

In [None]:
# KNN search space: 4 * 2 * 4 * 2 = 64 candidate settings.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

Hyperparameters' meaning:
- n_neighbors: The number of neighbors to consider when making predictions.
- weights: The weight function used in prediction. It can be set to:
    - 'uniform': All neighbors have equal weight.
    - 'distance': Closer neighbors have a greater influence than neighbors that are farther away.
    - A custom function: You can define a custom function that assigns weights to neighbors based on their distance.
- algorithm: The algorithm used to compute the nearest neighbors. Options include 'auto', 'ball_tree', 'kd_tree', and 'brute'. The choice of algorithm can impact the speed and memory usage of the KNN model, especially for large datasets.
- p: The power parameter for the Minkowski distance metric. When p=1, it corresponds to the Manhattan distance (L1 norm). When p=2, it corresponds to the Euclidean distance (L2 norm). For other values of p, it represents the generalization of Minkowski distance.

### Without scaling

In [None]:
# Grid-search KNN on the unscaled features; the fitted search is pickled to
# 'knn_no_scaling.pkl'.
pickile_file_name = 'knn_no_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_not_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=KNeighborsClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

### With MinMax Scaling

In [None]:
# Grid-search KNN on the min-max scaled features; the fitted search is pickled to
# 'knn_min_max_scaling.pkl'.
X_train, X_test, y_train, y_test = train_test_split(X_min_max_scaled, y_is_delayed, test_size=0.2, random_state=42)

pickile_file_name = 'knn_min_max_scaling'
# The closing parenthesis of this call was missing, which made the cell a SyntaxError.
train_model(KNeighborsClassifier(), param_grid, X_train, X_test, y_train, y_test, pickile_file_name)

### Standardization (Z-score normalization)

In [None]:
# Grid-search KNN on the standardized features; the fitted search is pickled to
# 'knn_standard_scaling.pkl'.
pickile_file_name = 'knn_standard_scaling'
X_train, X_test, y_train, y_test = train_test_split(
    X_standard_scaled,
    y_is_delayed,
    test_size=0.2,
    random_state=42,
)
train_model(
    model=KNeighborsClassifier(),
    param_grid=param_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    pickile_file_name=pickile_file_name,
)

## TO-DO: Can compare performance of time series models and deep learning models as well.

In [None]:
# Final look at the model-ready frame.
df.head()