In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np

In [25]:
df = pd.read_csv("C:/Users/jasmi/Downloads/Jasmithareddy_data_cleaned_preprocessed.csv")

In [32]:
df["Flight_Cancelled"] = df["Flight_Cancelled"].astype(int)  # or use .map() if needed

# Step 3: Separate Features and Target
X = df.drop("Flight_Cancelled", axis=1)   # Target: Cancelled (0 or 1)
y = df["Flight_Cancelled"]

# Step 5: Split the Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Preprocessing Pipelines
# - OneHotEncode categorical columns
# - Scale numerical columns
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

# Step 7: Create the Modeling Pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])


In [33]:
results = []

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results.append({
        "Model": name,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })


In [34]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="F1 Score", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
2,Random Forest,0.9829,0.9927,0.9807,0.9866
1,Decision Tree,0.9658,0.9667,0.9807,0.9736
3,Support Vector Machine,0.722,0.7605,0.8285,0.7931
0,Logistic Regression,0.722,0.7617,0.8261,0.7926


In [35]:
print("Classification Report for Random Forest:")
print(classification_report(y_test, pipeline.predict(X_test)))

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.63      0.53      0.58       230
           1       0.76      0.83      0.79       414

    accuracy                           0.72       644
   macro avg       0.70      0.68      0.68       644
weighted avg       0.71      0.72      0.72       644



In [None]:
** Interpretation
-> Random Forest clearly outperforms all other models on every metric.

-> Decision Tree also performs very well and is easier to interpret than Random Forest, though slightly less accurate.

-> SVM and Logistic Regression have identical accuracy and comparable F1 scores, but both are far behind the tree-based models.

** Model Recommendation
✅ Recommended Model: Random Forest
    
-> Highest Accuracy and F1 Score — Indicates strong overall performance.

-> Excellent Precision and Recall — Balanced performance on both false positives and false negatives.

-> Robustness — Less prone to overfitting compared to a single Decision Tree.
