In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# NOTE(review): login() presumably asks for Kaggle credentials interactively, which
# would stall an unattended "Run All" - confirm against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'cu-2025-scoring' competition data and keep whatever
# competition_download() returns (presumably the local directory that holds the
# files - confirm against the kagglehub documentation).
cu_2025_scoring_path = kagglehub.competition_download('cu-2025-scoring')

print('Data source import complete.')


# Kaggle Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for input_file in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

## Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Resolve the data directory. If the kagglehub download cell was run, read from the
# location it returned, because outside Kaggle the files are not mounted under
# /kaggle/input. If that cell was deleted, fall back to the standard Kaggle mount.
DATA_DIR = globals().get('cu_2025_scoring_path') or '/kaggle/input/cu-2025-scoring'

# Training rows (with the target), rows to score, and the submission template.
train = pd.read_parquet(os.path.join(DATA_DIR, 'train.parquet'))
test = pd.read_parquet(os.path.join(DATA_DIR, 'test.parquet'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

## Checking train

In [None]:
# (rows, columns) of the training frame.
print(train.shape)

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics for the training columns.
train.describe()

In [None]:
# Relative frequency of each value of the target column 'a6_flg'.
print(train['a6_flg'].value_counts(normalize=True))


In [None]:
# Summary statistics of the target column.
train['a6_flg'].describe()

In [None]:
# Percentage of missing values per training column, restricted to columns that
# have at least one missing value and ordered from most to least incomplete.
missing_percent = (
    train.isna()
    .mean()
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

In [None]:
# The 20 training columns with the highest share of missing values.
missing_percent.head(20)

In [None]:
# Model inputs are the columns whose name starts with 'feature_'.
features = list(filter(lambda name: name.startswith('feature_'), train.columns))
train[features].describe()

In [None]:
# Number of training rows per 'product' value.
train['product'].value_counts()

In [None]:
# Number of training rows per 'month_dt' value (ordered by count, not by date).
train['month_dt'].value_counts()

## Checking test

In [None]:
# Summary statistics of the same feature columns on the test frame, for comparison with train.
test[features].describe()

In [None]:
# Number of test rows per 'product' value.
test['product'].value_counts()

In [None]:
# Number of test rows per 'month_dt' value (ordered by count, not by date).
test['month_dt'].value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the target column; the title is set on the Axes that hist() returns.
train['a6_flg'].hist().set_title('a6_flg distribution')
plt.show()

In [None]:
# Bar chart of training row counts per product.
product_counts = train['product'].value_counts()
product_counts.plot.bar(title='Products distribution')
plt.show()

In [None]:
# Bar chart of training row counts per month.
# value_counts() orders by frequency, which scrambles the time axis; sort_index()
# puts the bars back in 'month_dt' order so a trend over time is readable.
# NOTE(review): assumes 'month_dt' values sort chronologically (datetime or ISO-like strings) - confirm.
train['month_dt'].value_counts().sort_index().plot(kind='bar')
plt.title('Months distribution')
plt.show()

## Missing values analysis

In [None]:
# Plain-text listing of the 20 training columns with the highest share of missing values.
print(missing_percent.head(20))

In [None]:
# Histogram (30 bins) of the per-column missing percentages, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
missing_percent.plot.hist(bins=30, ax=ax)
ax.set_title('Distribution of missing value percentage')
ax.set_xlabel('% missing')
plt.show()

In [None]:
# Columns whose missing share exceeds 70 %, 50 % and 30 % respectively.
high_missing_70, high_missing_50, high_missing_30 = (
    missing_percent[missing_percent > cutoff] for cutoff in (70, 50, 30)
)

# Report the column names for each threshold.
for cutoff, subset in (
    (70, high_missing_70),
    (50, high_missing_50),
    (30, high_missing_30),
):
    print(f"Features with >{cutoff}% missing:", list(subset.index))

In [None]:
# Percentage of missing values per test column; show the 20 most incomplete ones.
missing_percent_test = test.isna().mean().mul(100)
print(missing_percent_test.sort_values(ascending=False).iloc[:20])

In [None]:
# Columns that are more than 50 % missing in BOTH the training and the test frame.
test_high_missing = missing_percent_test.index[missing_percent_test > 50]
common_high_missing = set(high_missing_50.index) & set(test_high_missing)
print("High-missing features in both train and test:", common_high_missing)

## Missing Value Feature Impact Analysis

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [None]:
# Names of the training columns that are more than 50 % missing.
high_missing_50_list = high_missing_50.index.tolist()


In [None]:
# Keep only the high-missing columns that contain at least one observed value:
# a median cannot be computed for a column that is entirely NaN.
not_all_nan_features = [f for f in high_missing_50_list if train[f].notnull().any()]
X_high_missing = train[not_all_nan_features]

# Median-impute the selected columns. fit_transform() returns a bare array, so the
# original row index is re-attached explicitly; without it the frame silently gets a
# fresh 0..n-1 index and is no longer label-aligned with train['a6_flg'].
imputer = SimpleImputer(strategy='median')
X_high_missing_filled = pd.DataFrame(
    imputer.fit_transform(X_high_missing),
    columns=not_all_nan_features,
    index=train.index,
)

In [None]:
# Target vector for the high-missing-feature experiment.
y = train['a6_flg']

In [None]:
# Seeded 80/20 hold-out split of the imputed high-missing features, stratified on the target.
# NOTE(review): the imputer was fitted on all rows before this split, so the
# validation fold contributed to the medians and the AUC reported below is slightly optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X_high_missing_filled, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Seeded 100-tree random forest, trained on all available cores.
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [None]:
# Pair every feature name with the forest's importance score and print the top 20.
feat_importance = clf.feature_importances_
importance_df = pd.DataFrame(
    list(zip(not_all_nan_features, feat_importance)),
    columns=['feature', 'importance'],
)
top_importances = importance_df.sort_values(by='importance', ascending=False).iloc[:20]
print(top_importances)

In [None]:
# Score the hold-out rows; column 1 of predict_proba is used as the positive-class score.
val_proba = clf.predict_proba(X_val)
y_pred = val_proba[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred)
print("Validation ROC-AUC (only high-missing features):", roc_auc)

## Dataset processing

In [None]:
# Drop every feature that is more than 70 % missing in train; keep the rest in their original order.
remove_features = missing_percent.index[missing_percent > 70].tolist()
dropped = set(remove_features)
features_to_keep = [name for name in features if name not in dropped]

In [None]:
# Working frames restricted to the retained features plus the bookkeeping columns
# (the target exists only in train). The explicit .copy() makes them independent
# objects: the following cells add and overwrite columns on them, which would
# otherwise raise SettingWithCopyWarning on pandas versions without copy-on-write.
train_filtered = train[features_to_keep + ['a6_flg', 'month_dt', 'product']].copy()
test_filtered = test[features_to_keep + ['month_dt', 'product']].copy()

In [None]:
def add_missing_indicators(frame, columns):
    """Return ``frame`` with a 0/1 ``<col>_missing`` column appended for each of ``columns``.

    All indicators are built in one vectorised step and attached with a single
    concat. Inserting them one at a time fragments the frame (pandas emits a
    PerformanceWarning once many columns are added) and, on a frame sliced from
    another one, triggers SettingWithCopyWarning.

    Indicator columns left over from an earlier run of this cell are replaced
    rather than duplicated, so the cell can be re-executed safely.
    """
    indicators = frame[columns].isnull().astype(int).add_suffix('_missing')
    base = frame.drop(columns=indicators.columns, errors='ignore')
    return pd.concat([base, indicators], axis=1)


# Must run before imputation: afterwards no value is null and every flag would be 0.
train_filtered = add_missing_indicators(train_filtered, features_to_keep)
test_filtered = add_missing_indicators(test_filtered, features_to_keep)

In [None]:
# Learn per-feature medians on the training rows only, then fill both frames with them.
imputer = SimpleImputer(strategy='median').fit(train_filtered[features_to_keep])
train_filtered[features_to_keep] = imputer.transform(train_filtered[features_to_keep])
test_filtered[features_to_keep] = imputer.transform(test_filtered[features_to_keep])

In [None]:
# One-hot encode 'product' in both frames with identical settings.
train_final, test_final = (
    pd.get_dummies(frame, columns=['product'])
    for frame in (train_filtered, test_filtered)
)

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# The target is excluded up front so it is never written into the test frame.
feature_columns = train_final.columns.drop('a6_flg')

# Dummy columns for products that occur in train but not in test (kept for inspection).
missing_cols = set(feature_columns) - set(test_final.columns)

# reindex() adds the absent columns filled with 0, orders everything like train and
# discards columns that exist only in test (e.g. products never seen in training).
test_final = test_final.reindex(columns=feature_columns, fill_value=0)

In [None]:
# train_final['month_num'] = train_final['month_dt'].rank(method='dense').astype(int)
# test_final['month_num'] = test_final['month_dt'].rank(method='dense').astype(int)
# train_final = train_final.drop('month_dt', axis=1)
# test_final = test_final.drop('month_dt', axis=1)

In [None]:
# Final model inputs: 'month_dt' is not used as a feature, and the target is split off.
# These assignments replace the X_train / y_train created in the high-missing-feature experiment.
y_train = train_final['a6_flg']
X_train = train_final.drop(columns=['a6_flg', 'month_dt'])
X_test = test_final.drop(columns='month_dt')

# Baseline

## Catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Seeded 80/20 stratified hold-out split for the CatBoost baseline.
# X_val / y_val overwrite the variables of the same name from the random-forest experiment.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [None]:
# Baseline CatBoost settings: 700 iterations at learning rate 0.03 with depth-6 trees,
# AUC as the evaluation metric, a log line every 100 iterations, fixed seed.
cb_params = {
    'iterations': 700,
    'learning_rate': 0.03,
    'depth': 6,
    'eval_metric': 'AUC',
    'verbose': 100,
    'random_state': 42,
}
cb = CatBoostClassifier(**cb_params)

In [None]:
# Train on the training fold; the hold-out fold is passed as eval_set with use_best_model=True.
# NOTE(review): the same hold-out fold is scored in the next cell, so that AUC
# comes from data that also steered model selection and is likely optimistic.
cb.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

In [None]:
# Hold-out ROC-AUC of the CatBoost baseline; column 1 of predict_proba is the score.
val_proba = cb.predict_proba(X_val)
val_pred = val_proba[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=val_pred)
print('CatBoost ROC-AUC on validation:', roc_auc)

In [None]:
# Scores for the test rows, taken from column 1 of predict_proba.
test_pred = cb.predict_proba(X_test)[:, 1]

In [None]:
# Fill the submission template with the predictions and write it to the working directory.
# NOTE(review): the values are assigned positionally - this assumes sample_submission
# lists rows in the same order as test.parquet; verify against the competition's id column.
sample_submission['a6_flg'] = test_pred
sample_submission.to_csv('submission.csv', index=False)