##### Copyright 2018 The TensorFlow Authors.

In [39]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## **Introduction to Colab and Python**

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l01c01_introduction_to_colab_and_python.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l01c01_introduction_to_colab_and_python.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Numpy and lists
Python has lists built into the language.
However, we will work with arrays from a library called NumPy instead.
NumPy gives you lots of support functions that are useful when doing Machine Learning.

Here, you will also see an import statement. This statement makes the entire NumPy package available, and we can access its symbols through the abbreviated `np` prefix (for example, `np.array`).

## Colab Specifics

Colab is a virtual machine you can access directly. To run commands at the VM's terminal, prefix the line with an exclamation point (!).


In [40]:
# Demonstrates shell access from a notebook cell: lines that start with "!"
# are handed to the VM's shell instead of being run as Python.
print("\nDoing $ls on filesystem")
# Long-format listing of the current working directory.
!ls -l
# Show which directory the notebook is running in.
!pwd


Doing $ls on filesystem
total 13892
-rw-r--r-- 1 root root    14694 Nov  7 20:38 Correlations_heatmap.xlsx
drwxr-xr-x 2 root root     4096 Nov  7 20:37 Loan
-rw-r--r-- 1 root root     2982 Nov  7 20:39 main.py
-rw-r--r-- 1 root root    24064 Nov  7 20:38 metaData.xlsx
drwxr-xr-x 1 root root     4096 Nov  7 20:37 sample_data
-rw-r--r-- 1 root root   665500 Nov  7 20:38 sample_submission.csv
-rw-r--r-- 1 root root   483779 Nov  7 20:58 submission.csv
-rw-r--r-- 1 root root   485176 Nov  7 21:11 submission_nn.csv
-rw-r--r-- 1 root root  2440853 Nov  7 20:38 test.csv
-rw-r--r-- 1 root root 10085920 Nov  7 20:39 train.csv
/content


In [41]:
print("Install numpy")  # Just for test, numpy is actually preinstalled in all Colab instances
!pip install numpy

Install numpy


In [42]:
pip install tensorflow



In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Blended (hold-out stacking) ensemble for the "Default 12 Flag" target:
#   1. load the train/test CSVs and keep numeric/bool features only,
#   2. impute and scale using statistics learned from the training data,
#   3. fit three base classifiers on a training split,
#   4. fit an XGBoost meta-model on the base models' hold-out predictions,
#   5. write the test-set probabilities to submission_stack_xgb.csv.

SEED = 42  # one seed shared by every stochastic step below
ID_COL = "ID"
TARGET_COL = "Default 12 Flag"

# Load data
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
y = train[TARGET_COL]
test_ids = test[ID_COL]
train = train.drop(columns=[ID_COL, TARGET_COL])
test = test.drop(columns=[ID_COL])

# Remove object columns (e.g., dates)
train = train.select_dtypes(include=["number", "bool"])
test = test.select_dtypes(include=["number", "bool"])

# The scaler and models are fitted on train's feature set, so test must expose
# exactly the same columns in the same order. Fail early with a clear message
# instead of a harder-to-read error from inside the scaler.
missing_in_test = train.columns.difference(test.columns)
if len(missing_in_test) > 0:
    raise ValueError(
        f"test.csv lacks numeric feature columns present in train.csv: {list(missing_in_test)}"
    )
test = test[train.columns]

# Fill missing values
# Impute BOTH frames with the train medians: the test set must be transformed
# with statistics learned from the training data, not with its own.
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)
test = test.fillna(train_medians)

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(train)
X_test = scaler.transform(test)

# Train-validation split
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Define base models
base_models = [
    ("rf", RandomForestClassifier(n_estimators=100, random_state=SEED)),
    ("gb", GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=SEED)),
    ("svc", SVC(probability=True, kernel="rbf", C=1.0, gamma="scale", random_state=SEED)),
]

# Train base models and collect predictions
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes a binary 0/1 target - TODO confirm against the data).
val_preds = []
test_preds = []
for _name, model in base_models:
    model.fit(X_tr, y_tr)
    val_preds.append(model.predict_proba(X_val)[:, 1])
    test_preds.append(model.predict_proba(X_test)[:, 1])

# Stack predictions as features (one column per base model)
X_meta_val = np.column_stack(val_preds)
X_meta_test = np.column_stack(test_preds)

# Meta model: XGBoost
# NOTE(review): use_label_encoder is kept from the original call; newer xgboost
# releases may ignore or warn about it - confirm before removing.
meta_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    use_label_encoder=False,
    eval_metric="auc",
    random_state=SEED,
)

# Evaluate on validation
# The meta-model is trained on the hold-out rows, so scoring its own fitted
# predictions on those rows would report an in-sample (optimistic) AUC.
# cross_val_predict fits clones on 4/5 of the rows and predicts the remaining
# 1/5, so every row in val_final_pred comes from a model that never saw it.
val_final_pred = cross_val_predict(
    meta_model, X_meta_val, y_val, cv=5, method="predict_proba"
)[:, 1]
print(f"📈 Stacked Ensemble AUC (XGBoost Meta): {roc_auc_score(y_val, val_final_pred):.4f}")

# Fit the final meta-model on all hold-out rows for the test-set prediction.
meta_model.fit(X_meta_val, y_val)

# Final prediction
test_final_pred = meta_model.predict_proba(X_meta_test)[:, 1]
submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: test_final_pred})
submission.to_csv("submission_stack_xgb.csv", index=False)
print("✅ submission_stack_xgb.csv saved.")
