In [None]:
!pip install -q --upgrade s3fs pyarrow scikit-learn xgboost joblib


In [None]:
import xgboost as xgb
import pandas as pd
import s3fs
import pyarrow as pa

# Smoke test: reaching these lines means the imports above succeeded;
# report the xgboost version that was actually loaded.
print("xgboost version:", xgb.__version__)
print("All core libraries working fine.")


In [None]:
import pyarrow.dataset as ds

# S3 filesystem handle; no credentials are passed explicitly here.
fs = s3fs.S3FileSystem()

# Treat everything under the ml-2024/ prefix as a single parquet dataset.
dataset = ds.dataset(
    "ys-flight-data-gold/ml-2024/",
    format="parquet",
    filesystem=fs,
)

# Read the whole dataset (all columns) into a pandas DataFrame.
df = dataset.to_table().to_pandas()

print("Shape:", df.shape)
df.head()


In [None]:
# Quick schema check on the loaded frame: dimensions first, then per-column dtypes.
print("rows,cols:", df.shape)
display(df.dtypes)   # rich display of the dtype Series


In [None]:
import pandas as pd

# Columns fed to the model, and the binary label to predict.
# NOTE(review): 'dep_delay_minutes' may be closely tied to how 'is_delayed' is
# defined -- confirm it is legitimately known at prediction time (possible leakage).
features = [
    'month', 'day_of_week', 'season',
    'carrier', 'operating_carrier',
    'origin', 'destination',
    'distance', 'dep_delay_minutes',
]
target = 'is_delayed'

# Build the modelling frame from the raw data and impute the two numeric
# columns: missing distance -> column median, missing departure delay -> 0.
# NOTE(review): the median is computed over all rows, i.e. before any
# train/test split happens.
df_model = df[[*features, target]].assign(
    distance=lambda frame: frame['distance'].fillna(frame['distance'].median()),
    dep_delay_minutes=lambda frame: frame['dep_delay_minutes'].fillna(0),
)

# Remaining null counts per column.
print("Nulls:\n", df_model.isna().sum())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical feature columns and keep the
# {original value -> code} mapping per column so it can be saved with the model.
cat_cols = ['season', 'carrier', 'operating_carrier', 'origin', 'destination']
encoders = {}

# The encoders are fitted on the raw columns of `df`, not on `df_model`, so the
# cell is idempotent: re-running it would otherwise re-encode the already
# encoded integers (as strings "0", "1", "10", ...) and silently corrupt both
# the features and the saved mapping.
assert len(df_model) == len(df), "df_model must have the same rows as df"

for col in cat_cols:
    le = LabelEncoder()
    # astype(str) makes every value a string first, so missing values end up
    # as their own string category instead of breaking the encoder.
    df_model[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))

print("Encoding complete!")
df_model.head()


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the feature matrix.
y = df_model[target]
X = df_model[[name for name in df_model.columns if name != target]]

# 80/20 split with a fixed seed, stratified on the label so both parts keep
# the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Target ratio:", y_train.mean())


In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

from sklearn.model_selection import train_test_split

# Carve a validation set out of the training data and use it only for early
# stopping. Monitoring early stopping on the test set would tune the number of
# boosting rounds on the same data the final metrics are reported on, making
# those metrics optimistic.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# handle imbalance: weight positives by the negative/positive count ratio
scale_pos_weight = (len(y_fit) - y_fit.sum()) / y_fit.sum()

dtrain = xgb.DMatrix(X_fit, label=y_fit)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
# Held-out test set: not seen during training or early stopping.
dval = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'verbosity': 1
}

# Early stopping watches the last entry of the watchlist (the validation split).
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist, early_stopping_rounds=20)


In [None]:
# Score the held-out set with the trees up to the best early-stopping round.
# xgb.train returns the booster from the LAST round, not the best one, so the
# range is restricted explicitly. If no best iteration was recorded, (0, 0)
# means "use all trees".
best_iteration = getattr(bst, "best_iteration", None)
iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

y_prob = bst.predict(dval, iteration_range=iteration_range)
# Hard labels at a 0.5 probability threshold.
y_pred = (y_prob > 0.5).astype(int)

print("AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
import pandas as pd

# Gain-based importance per feature as reported by the booster, largest first.
gain_by_feature = bst.get_score(importance_type='gain')
fi = pd.Series(gain_by_feature).sort_values(ascending=False)

# Show the top ten and write the full ranking to the working directory.
print(fi.head(10))
fi.to_csv("feature_importance.csv")


In [None]:
import joblib, boto3, os

# Local staging directory for the artifacts (created if missing).
local_dir = "/home/ec2-user/SageMaker/artifacts"
os.makedirs(local_dir, exist_ok=True)

# Write the trained booster and the category->code mappings to disk.
model_path = f"{local_dir}/xgb_flight_delay.json"
encoders_path = f"{local_dir}/label_encoders.pkl"
bst.save_model(model_path)
joblib.dump(encoders, encoders_path)

s3 = boto3.client("s3")
bucket = "ys-flight-data-gold"

# (local file, S3 key) pairs; feature_importance.csv is read from the
# current working directory rather than local_dir.
uploads = [
    (model_path, "models/xgb_flight_delay.json"),
    (encoders_path, "models/label_encoders.pkl"),
    ("feature_importance.csv", "models/feature_importance.csv"),
]
for local_path, key in uploads:
    s3.upload_file(local_path, bucket, key)

print(f"Model + encoders saved to s3://{bucket}/models/")
