In [10]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [12]:
# Step 2: Load the Dataset
# NOTE: this is a machine-specific absolute path; replace it with the location
# of your own copy of the CSV before running.
df = pd.read_csv("C:/Users/jasmi/Downloads/Jasmithareddy_data_cleaned_preprocessed.csv")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the summary.
df.info()

   Flight ID    Airline  Flight_Distance Origin_Airport Destination_Airport  \
0  7319483.0  Airline D            475.0      Airport 3           Airport 2   
1  4791965.0  Airline E            538.0      Airport 5           Airport 4   
2  2991718.0  Airline C            565.0      Airport 1           Airport 2   
3  4220106.0  Airline E            658.0      Airport 5           Airport 3   
4  2263008.0  Airline E            566.0      Airport 2           Airport 2   

   Scheduled_Departure_Time  Day_of_Week  Month Airplane_Type  Weather_Score  \
0                       4.0          6.0    1.0        Type C       0.225122   
1                      12.0          1.0    6.0        Type B       0.060346   
2                      17.0          3.0    9.0        Type C       0.093920   
3                       1.0          1.0    8.0        Type B       0.656750   
4                      19.0          7.0   12.0        Type E       0.505211   

   Previous_Flight_Delay_Minutes  Airline_Ra

In [13]:
# Cast the label to an integer dtype so it is handled as discrete classes.
# NOTE(review): astype(int) truncates fractional values and raises on NaN;
# presumably the column only holds 0/1 at this point - confirm against the
# cleaning step, or switch to an explicit .map() if it does not.
df["Flight_Cancelled"] = df["Flight_Cancelled"].astype(int)

# Step 3: Separate Features and Target
X = df.drop(columns="Flight_Cancelled")
y = df["Flight_Cancelled"]
# NOTE(review): the "Flight ID" column shown by df.head() looks like a row
# identifier; it is currently scaled and fed to the model like any other
# numeric feature. Consider dropping it from X - TODO confirm.

# Step 4: Identify Categorical and Numerical Columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# "number" matches every numeric dtype (int32, float32, nullable ints, ...),
# not only the 64-bit ones. Any column that ends up in neither list is dropped
# by ColumnTransformer (its default is remainder="drop"), so a narrow dtype
# filter would silently remove features from the model.
numerical_cols = X.select_dtypes(include="number").columns.tolist()

# Step 5: Split the Data into Training and Test Sets
# stratify=y keeps the class ratio the same in both partitions; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Preprocessing Pipelines
# - OneHotEncode categorical columns (categories unseen during fit are
#   ignored at transform time instead of raising)
# - Scale numerical columns
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

# Step 7: Create the Modeling Pipeline
# Wrapping the preprocessor in the pipeline means it is fitted on X_train only
# when model.fit() is called below.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Step 8: Train the Logistic Regression Model
model.fit(X_train, y_train)

# Step 9: Make Predictions
y_pred = model.predict(X_test)

In [14]:
# Step 10: report test-set metrics for the fitted pipeline.
# Scalar scores are printed first, in a fixed order, one per line.
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for metric_label, metric_fn in scalar_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

# Per-class breakdown of precision, recall, F1 and support.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

Accuracy: 0.7220496894409938
Precision: 0.7616926503340757
Recall: 0.8260869565217391
F1 Score: 0.7925840092699884

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.53      0.58       230
           1       0.76      0.83      0.79       414

    accuracy                           0.72       644
   macro avg       0.70      0.68      0.69       644
weighted avg       0.71      0.72      0.72       644



In [17]:
# Export the preprocessed feature matrix (plus target) for the whole dataset.
#
# `preprocessor` is the same object that sits inside the fitted `model`
# pipeline (with no `memory` cache configured, Pipeline fits its steps in
# place). Calling fit_transform on it directly would overwrite the statistics
# learned from X_train and silently change what `model.predict` does from then
# on, so an unfitted clone is fitted here instead.
# NOTE: scaling/encoding below is learned from ALL rows, so this file should
# not be used for held-out evaluation without re-fitting on a training split.
export_preprocessor = clone(preprocessor)
X_transformed = export_preprocessor.fit_transform(X)

# ColumnTransformer can return a SciPy sparse matrix when the one-hot block
# makes the output sparse enough; densify so pandas receives a plain 2-D array.
if hasattr(X_transformed, "toarray"):
    X_transformed = X_transformed.toarray()

# Get feature names after transformation
encoded_feature_names = export_preprocessor.get_feature_names_out()

# Create DataFrame from the dense array
X_transformed_df = pd.DataFrame(X_transformed, columns=encoded_feature_names)

# Add target column (.values sidesteps index alignment: the new frame has a
# fresh default index that need not match y's)
X_transformed_df["Flight_Cancelled"] = y.values

# Save to CSV
X_transformed_df.to_csv("C:/Users/jasmi/Downloads/hfd.csv", index=False)
