# British Airways Data Science Simulation – Forage

This notebook presents my solution to the **British Airways Data Science Virtual Experience Program** on Forage. It walks through data preprocessing, feature engineering, model training, and evaluation.

---


## 📌 Objective

Use customer booking data to:
- Explore factors influencing booking behavior
- Engineer predictive features
- Build a machine learning model to predict booking completion


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


In [None]:
# Load Data
# The dataset is not bundled with this notebook (licensing restrictions from Forage),
# so point the path below at your own local copy of the CSV.
df = pd.read_csv(
    "customer_booking.csv",  # Replace with your local path
    encoding="ISO-8859-1",
)
df.head()


In [None]:
# ---------------------
# Data Preprocessing
# ---------------------

# Feature Engineering
# Weekend flag: compare on the lower-cased first three letters of the day name so
# that both full names ('Saturday') and abbreviations ('Sat') are recognised.
# Matching only the full names silently produces an all-zero column whenever the
# data stores abbreviated days.
# NOTE(review): confirm the actual spelling with df['flight_day'].unique().
day_prefix = df['flight_day'].astype(str).str.strip().str[:3].str.lower()
df['is_weekend_flight'] = day_prefix.isin(['sat', 'sun']).astype(int)

# Ratio of length_of_stay to flight_duration; the small epsilon in the
# denominator avoids division by zero.
df['trip_duration_ratio'] = df['length_of_stay'] / (df['flight_duration'] + 1e-6)

def time_of_day(hour):
    """Map an hour value to a coarse time-of-day label.

    The half-open ranges are:
      * [5, 12)  -> 'Morning'
      * [12, 17) -> 'Afternoon'
      * [17, 21) -> 'Evening'
    Any value outside those ranges (including NaN, for which every
    comparison is False) is labelled 'Night'.
    """
    # (inclusive start, exclusive end, label), checked in order.
    buckets = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in buckets:
        if start <= hour < end:
            return label
    return 'Night'

# Bucket each flight_hour value into a coarse time-of-day label.
df['time_of_day'] = df['flight_hour'].apply(time_of_day)

# Drop high cardinality columns if necessary.
# errors='ignore' keeps this cell idempotent: because `df` is overwritten in
# place, re-running the cell after 'route' has already been removed would
# otherwise raise a KeyError.
df = df.drop(columns=['route'], errors='ignore')

# One-hot encode categorical features; drop_first=True omits the first level of
# each encoded column.
categorical_cols = ['sales_channel', 'trip_type', 'flight_day', 'booking_origin', 'time_of_day']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
# ---------------------
# Train/Test Split
# ---------------------

# Separate the target column from the feature matrix.
X = df_encoded.drop(columns=['booking_complete'])
y = df_encoded['booking_complete']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of the target the same in the train and test partitions, so the
# held-out metrics are not skewed by an unlucky random draw.
# NOTE(review): stratification requires at least two rows per class in y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# ---------------------
# Train Random Forest
# ---------------------

# Hyper-parameters are gathered in one place; random_state fixes the seed so the
# fitted forest is reproducible across runs.
rf_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)


In [None]:
# ---------------------
# Evaluation
# ---------------------

# Hard labels feed the classification report; the second predict_proba column
# (index 1) is the score passed to ROC AUC.
y_pred = clf.predict(X_test)
test_proba = clf.predict_proba(X_test)
y_proba = test_proba[:, 1]

print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)
test_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", test_auc)

# Cross-validation: 5-fold ROC AUC computed on the full X / y.
cv_scores = cross_val_score(clf, X, y, scoring='roc_auc', cv=5)
mean_cv_auc = cv_scores.mean()
print(f"Cross-Validated ROC AUC: {mean_cv_auc:.4f}")


In [None]:
# Confusion Matrix
# Built from the held-out test labels and the model's hard predictions, then
# drawn as an annotated heatmap on an explicit Axes.
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# ---------------------
# Feature Importance
# ---------------------

# Pair each fitted importance with its column name and rank from most to least
# important.
importances = clf.feature_importances_
features = X.columns
feat_imp = pd.Series(data=importances, index=features).sort_values(ascending=False)

# Plot top 15 features
top_feat = feat_imp.iloc[:15]
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_feat, y=top_feat.index, ax=ax)
ax.set_title("Top 15 Feature Importances")
fig.tight_layout()
plt.show()


In [None]:
# ---------------------
# Export Feature Importances (Optional for Slide)
# ---------------------
# Write the 15 highest-ranked importances to a CSV in the working directory.
top_15 = feat_imp.iloc[:15]
top_15.to_csv("top_features.csv")
