<a href="https://colab.research.google.com/github/dhananjayd99/Python-/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l01c01_introduction_to_colab_and_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load data
train = pd.read_csv("/content/train.csv", on_bad_lines="skip", low_memory=False)
test = pd.read_csv("/content/test.csv")

# Keep test ID
test_ids = test["ID"]

# Extract target and drop ID and target from train
y = train["Default 12 Flag"]
train.drop(["ID", "Default 12 Flag"], axis=1, inplace=True, errors="ignore")
test.drop(["ID"], axis=1, inplace=True, errors="ignore")

# Drop non-numeric columns (you can extract date features later if needed)
combined = pd.concat([train, test], axis=0)
for col in combined.select_dtypes(include="object").columns:
    combined[col] = combined[col].astype(str)
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])

# Impute missing values
imputer = SimpleImputer(strategy="median")
combined = pd.DataFrame(imputer.fit_transform(combined), columns=combined.columns)

# Split back
X = combined.iloc[:len(train)]
X_test = combined.iloc[len(train):]

# Cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=4,
        min_samples_leaf=20,
        random_state=42
    )
    model.fit(X_tr, y_tr)

    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / kf.n_splits

    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    print(f"ðŸ“ˆ Fold {fold+1} AUC: {fold_auc:.4f}")

# Final AUC
print(f"\nâœ… Full Cross-Validation AUC: {roc_auc_score(y, oof_preds):.4f}")

# Save submission
submission = pd.DataFrame({
    "ID": test_ids,
    "Default 12 Flag": test_preds
})
submission.to_csv("submission_cv_gb.csv", index=False)
print("âœ… submission_cv_gb.csv saved.")


FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from scipy.stats import uniform, randint
import warnings

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. Data Loading ---

# Load the datasets
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    sample_submission = pd.read_csv('sample_submission.csv')
except FileNotFoundError as e:
    print(f"Error: One of the files was not found. Please ensure all files are available. {e}")
    exit()

print("âœ… Data loaded successfully.")

# --- 2. Preprocessing and Feature Engineering Function ---

def preprocess_data(df):
    """Performs feature engineering and converts types."""

    df['Application Date'] = pd.to_datetime(df['Application Date'])
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')

    # Feature Engineering: Age and Days since application
    current_year = 2019
    df['Age'] = current_year - df['Date of Birth'].dt.year
    df['Days_Since_Application'] = (datetime(2020, 1, 1) - df['Application Date']).dt.days

    # Drop original date/time columns
    df = df.drop(columns=['Application Date', 'Application Time', 'Date of Birth'], errors='ignore')

    # Store ID and drop the column from features
    df_id = df.pop('ID') if 'ID' in df.columns else None

    # Handle 'JIS Address Code'
    if 'JIS Address Code' in df.columns:
        df['JIS Address Code'] = df['JIS Address Code'].astype(str).str.replace(r'\.0$', '', regex=True)

    # Log Transform Skewed Numerical Features
    for col in ['Total Annual Income', 'Amount of Unsecured Loans',
                'Declared Amount of Unsecured Loans', 'Rent Burden Amount']:
        if col in df.columns:
            df[col] = np.log1p(df[col])

    # Convert all object/string columns to 'category' dtype
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype('category')

    return df, df_id

# Apply preprocessing
train_df_processed, _ = preprocess_data(train_df.copy())
test_df_processed, test_id = preprocess_data(test_df.copy())

# --- 3. Align and Prepare Data for Modeling (HGB) ---

TARGET = 'Default 12 Flag'
y = train_df_processed[TARGET]
X = train_df_processed.drop(columns=[TARGET])

# Combine for alignment
full_data = pd.concat([X, test_df_processed], keys=['train', 'test'])

# Identify categorical columns
categorical_cols = [col for col in full_data.columns if full_data[col].dtype.name == 'category']

# CRITICAL FIX: Frequency Encoding for High-Cardinality features
HIGH_CARDINALITY_THRESHOLD = 255
low_card_cols = []
high_card_cols = []

for col in categorical_cols:
    if len(full_data.loc['train', col].cat.categories) > HIGH_CARDINALITY_THRESHOLD:
        high_card_cols.append(col)

        # Frequency Encoding
        freq_map = full_data[col].value_counts(normalize=True).to_dict()
        full_data[f'{col}_Freq'] = full_data[col].map(freq_map)

        # Drop the original high-cardinality column
        full_data = full_data.drop(columns=[col])
    else:
        low_card_cols.append(col)
        # For low-cardinality, impute 'Missing' as a category
        full_data[col] = full_data[col].cat.add_categories('Missing').fillna('Missing')
        full_data[col] = full_data[col].astype('category')


# Impute remaining numerical NaNs
numerical_cols = full_data.select_dtypes(include=np.number).columns
for col in numerical_cols:
    full_data[col] = full_data[col].fillna(full_data.loc['train', col].median())

# Separate back into training and testing sets
X_aligned = full_data.loc['train']
test_aligned = full_data.loc['test']

# --- 4. Model Training and Tuning (HistGradientBoostingClassifier) ---

# Split data into 80% train / 20% validation for tuning and final evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X_aligned, y, test_size=0.2, random_state=42, stratify=y
)

# Identify the indices of the remaining low-cardinality categorical features
categorical_features_indices_final = [
    i for i, col in enumerate(X_train.columns)
    if X_train[col].dtype.name == 'category'
]

# Define the model base
hgb_base = HistGradientBoostingClassifier(
    max_iter=1500,
    early_stopping=True,
    n_iter_no_change=150,
    tol=1e-7,
    random_state=42,
    categorical_features=categorical_features_indices_final
)

# Define the parameter search space
param_dist = {
    'learning_rate': uniform(loc=0.01, scale=0.10), # Search between 0.01 and 0.11
    'max_depth': randint(low=5, high=15),          # Search depths between 5 and 14
    'max_leaf_nodes': randint(low=10, high=50),     # Search leaf nodes between 10 and 49
    'l2_regularization': uniform(loc=0.1, scale=1.0) # Search between 0.1 and 1.1
}

# Use AUC as the scoring metric
auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=hgb_base,
    param_distributions=param_dist,
    n_iter=20, # Number of different parameter combinations to try
    scoring=auc_scorer,
    cv=3, # 3-fold cross-validation
    random_state=42,
    verbose=0,
    n_jobs=1
)

# Train the tuner (using X_train/y_train for CV folds)
random_search.fit(X_train, y_train)

# Get the best model
best_hgb = random_search.best_estimator_

# Evaluate the best model on the held-out validation set
val_probabilities = best_hgb.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_probabilities)

# --- 5. Prediction and Submission ---

# Predict probabilities on the test set
test_probabilities = best_hgb.predict_proba(test_aligned)[:, 1]

# Create submission file
submission_df = pd.DataFrame({'ID': test_id, 'Default 12 Flag': test_probabilities})

# Save to CSV
submission_df.to_csv('submission_hgb_tuned.csv', index=False)

print("\n--- Execution Summary (Tuned HGB Model) ---")
print(f"Model: HistGradientBoostingClassifier (Tuned)")
print(f"Best Hyperparameters: {random_search.best_params_}")
print(f"AUC on Validation Set: {val_auc:.4f}")
print("Submission file 'submission_hgb_tuned.csv' has been generated.")

ParserError: Error tokenizing data. C error: Expected 31 fields in line 41493, saw 54
