In [18]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# 2. Load data
data = pd.read_csv('application_train.csv')

# 3. Preprocessing
# Fill missing values with median
#data = data.fillna(data.median())

# Encode categorical variables
cat_cols = data.select_dtypes(include='object').columns
le = LabelEncoder()

for col in cat_cols:
    data[col] = le.fit_transform(data[col].astype(str))

# 4. Features and target
X = data.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data['TARGET']

# 5. Split into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train Random Forest model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# 7. Predict and evaluate
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Validation AUC (Random Forest): {auc:.4f}")


Validation AUC (Random Forest): 0.7367


##### Four. Device engineering ingenuity

shared setup

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Load data
data_raw = pd.read_csv('application_train.csv')

# Label encoding for object columns
def evaluate(df, name):
    df = df.copy()
    
    # Check that 'TARGET' and 'SK_ID_CURR' exist
    if 'TARGET' not in df.columns or 'SK_ID_CURR' not in df.columns:
        raise ValueError("DataFrame must contain 'TARGET' and 'SK_ID_CURR' columns.")
    
    # Drop rows where TARGET is missing
    df = df[df['TARGET'].notnull()]

    # Fill missing values
    df = df.fillna(df.median(numeric_only=True))

    # Encode categorical features
    df = encode_categoricals(df)

    # Features and target
    X = df.drop(columns=['TARGET', 'SK_ID_CURR'])
    y = df['TARGET']

    # Check for any non-numeric columns (should not be present)
    if X.select_dtypes(include='object').shape[1] > 0:
        raise TypeError("Non-numeric columns found after encoding.")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train model
    model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)

    print(f"{name} AUC: {auc:.4f}")
    return model, auc



Pattern 1: Drop Highly Missing Columns

In [20]:
df1 = data_raw.copy()
missing_thresh = 0.4
to_drop = df1.columns[df1.isnull().mean() > missing_thresh]
df1 = df1.drop(columns=to_drop)

model1, auc1 = evaluate(df1, "Pattern 1 - Drop missing")


Pattern 1 - Drop missing AUC: 0.7350


Pattern 2: Add Ratio Features (Income/Credit)

In [21]:
df2 = data_raw.copy()
df2['INCOME_CREDIT_RATIO'] = df2['AMT_INCOME_TOTAL'] / df2['AMT_CREDIT']
df2['ANNUITY_INCOME_RATIO'] = df2['AMT_ANNUITY'] / df2['AMT_INCOME_TOTAL']

model2, auc2 = evaluate(df2, "Pattern 2 - Ratio features")


Pattern 2 - Ratio features AUC: 0.7372


Pattern 3: Create Age and Employment Features

In [22]:
df3 = data_raw.copy()
df3['AGE_YEARS'] = -df3['DAYS_BIRTH'] / 365
df3['YEARS_EMPLOYED'] = df3['DAYS_EMPLOYED'].replace(365243, np.nan) / -365
df3['EMPLOYED_TO_AGE'] = df3['YEARS_EMPLOYED'] / df3['AGE_YEARS']

model3, auc3 = evaluate(df3, "Pattern 3 - Age & Employment")


Pattern 3 - Age & Employment AUC: 0.7340


Pattern 4: External Source Average + Product

In [23]:
df4 = data_raw.copy()
ext = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
df4['EXT_MEAN'] = df4[ext].mean(axis=1)
df4['EXT_PRODUCT'] = df4[ext].prod(axis=1)

model4, auc4 = evaluate(df4, "Pattern 4 - EXT Source engineered")


Pattern 4 - EXT Source engineered AUC: 0.7413


Pattern 5: Bin Age and Income

In [24]:
df5 = data_raw.copy()
df5['AGE_BIN'] = pd.qcut(-df5['DAYS_BIRTH'], 5, labels=False)
df5['INCOME_BIN'] = pd.qcut(df5['AMT_INCOME_TOTAL'], 5, labels=False)

model5, auc5 = evaluate(df5, "Pattern 5 - Binned age & income")


Pattern 5 - Binned age & income AUC: 0.7368


#### Final Model & Submission

In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

# 1. Load the single dataset
data = pd.read_csv('application_train.csv')

# 2. Feature Engineering: EXT_SOURCE statistics
def add_ext_features(df):
    ext = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    df['EXT_MEAN'] = df[ext].mean(axis=1)
    df['EXT_PRODUCT'] = df[ext].prod(axis=1)
    return df

data = add_ext_features(data)

# 3. Preprocess: fill NA and label encode
def preprocess(df):
    df = df.copy()
    df = df.fillna(df.median(numeric_only=True))
    cat_cols = df.select_dtypes(include='object').columns
    le = LabelEncoder()
    for col in cat_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

data = preprocess(data)

# 4. Prepare X and y, drop ID
X = data.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data['TARGET']
ids = data['SK_ID_CURR']

# 5. Split into train and test (80/20)
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, ids, test_size=0.2, random_state=42
)

# 6. Train final model
model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# 7. Predict on test split
y_pred = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print(f"Validation AUC: {auc:.4f}")

# 8. Format as "submission" (mock)
submission = pd.DataFrame({
    'SK_ID_CURR': id_test,
    'TARGET': y_pred
})

submission.to_csv('rf_submission_mock.csv', index=False)
print("Mock submission file saved as rf_submission_mock.csv")


Validation AUC: 0.7413
Mock submission file saved as rf_submission_mock.csv
