In [3]:
!pip install boruta optuna

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl.metadata (7.7 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading Boruta-0.3-py3-none-any.whl (56 kB)
   ---------------------------------------- 0.0/56.6 kB ? eta -:--:--
   ---------------------------------------- 56.6/56.6 kB 987.0 kB/s eta 0:00:00
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
   ---------------------------------------- 0.0/380.1 kB ? eta -:--:--
   ------------------------------ --------- 286.7/380.1 kB 8.6 MB/s eta 0:00:01
   ---------------------------------------- 380.1/380.1 kB 5.9 MB/s eta 0:00:00
Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
   

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Paths to the datasets
x_train_path = 'x_train.txt'
y_train_path = 'y_train.txt'
x_test_path = 'x_test.txt'

# Tunables for this baseline
CORR_THRESHOLD = 0.95   # drop a feature when |corr| with an earlier feature exceeds this
TOP_N = 1000            # number of test rows to export

# Load the data
# Assuming data files are headerless and whitespace-delimited.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
x_train = pd.read_csv(x_train_path, header=None, sep=r'\s+')
y_train = pd.read_csv(y_train_path, header=None, sep=r'\s+')[0]
x_test = pd.read_csv(x_test_path, header=None, sep=r'\s+')

# Step 2: Feature Selection
# Absolute correlation matrix; keep only the strict upper triangle so each
# feature pair is inspected once and the diagonal is ignored.
cor_matrix = x_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
# A column is dropped when it is highly correlated with any earlier column
# (NaN cells from the masked lower triangle compare as False).
to_drop = upper_tri.columns[(upper_tri > CORR_THRESHOLD).any(axis=0)].tolist()
x_train_reduced = x_train.drop(to_drop, axis=1)
x_test_reduced = x_test.drop(to_drop, axis=1)

# Print the number of features reduced
print(f"Reduced from {x_train.shape[1]} to {x_train_reduced.shape[1]} features.")

# Step 3: Model Training
# Hold out 20% of the training data for validation
X_train, X_val, Y_train, Y_val = train_test_split(x_train_reduced, y_train, test_size=0.2, random_state=0)

# Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
preds = model.predict_proba(X_val)[:, 1]
print(f"Validation ROC-AUC: {roc_auc_score(Y_val, preds)}")

# Step 4: Evaluation
# Predict positive-class probability on the test data
test_preds = model.predict_proba(x_test_reduced)[:, 1]

# Select the TOP_N customers most likely to accept the offer.
# argsort is ascending, so the tail holds the highest scores
# (row positions, written in ascending-score order).
top_customers = np.argsort(test_preds)[-TOP_N:]

# Save indices of the selected customers
np.savetxt('m123456_obs.txt', top_customers, fmt='%d')


Reduced from 500 to 499 features.
Validation ROC-AUC: 0.488581511702899


In [25]:
!pip install boruta



In [29]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from scipy.stats import binom

class BorutaPy(BaseEstimator, TransformerMixin):
    """Placeholder feature selector exposing a Boruta-like interface.

    NOTE: this class does NOT run the Boruta algorithm. ``fit`` marks every
    feature as selected, so ``transform`` returns its input columns unchanged.
    The constructor arguments are stored unmodified and are not otherwise
    used by this placeholder.

    Attributes set by ``fit``:
        random_state_: ``np.random.RandomState`` built from ``random_state``.
        support_: boolean mask with one entry per training feature (all True).
    """

    def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=None, verbose=0):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.perc = perc
        self.alpha = alpha
        self.two_step = two_step
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        """Validate the inputs and mark every feature as selected.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like whose first dimension equals ``n_samples``.

        Returns:
            self

        Raises:
            ValueError: if ``X`` is not 2-D or the sample counts differ.
        """
        X, y = self._validate_input(X, y)
        self.random_state_ = np.random.RandomState(self.random_state)

        # Placeholder for the actual Boruta algorithm (shadow features and
        # statistical testing are not implemented): every feature is kept.
        self.support_ = np.ones(X.shape[1], dtype=bool)
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by ``fit``.

        Args:
            X: 2-D array-like with the same number of columns seen in ``fit``
                (ndarray, DataFrame or nested list).

        Returns:
            ``np.ndarray`` containing only the selected columns.

        Raises:
            AttributeError: if called before ``fit``.
            ValueError: if ``X`` is not 2-D or its column count differs from
                the one seen during ``fit``.
        """
        if not hasattr(self, "support_"):
            raise AttributeError(
                "This BorutaPy instance is not fitted yet; call fit() before transform()."
            )
        # fit() accepts any array-like, so transform() must do the same:
        # positional `X[:, mask]` indexing only works on an ndarray.
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self.support_.shape[0]:
            raise ValueError(
                f"X must be 2-D with {self.support_.shape[0]} columns; "
                f"got array of shape {X.shape}."
            )
        return X[:, self.support_]

    def _validate_input(self, X, y):
        """Coerce ``X`` and ``y`` to ndarrays and check that they are compatible.

        Returns:
            Tuple ``(X, y)`` as ``np.ndarray`` objects.

        Raises:
            ValueError: if ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features); got {X.ndim}-D input.")
        if X.shape[0] != y.shape[0]:
            raise ValueError("Mismatched number of samples.")
        return X, y

# Load the data (headerless, whitespace-delimited text files).
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
x_train = pd.read_csv('x_train.txt', header=None, sep=r'\s+')
y_train = pd.read_csv('y_train.txt', header=None, sep=r'\s+')[0]
x_test = pd.read_csv('x_test.txt', header=None, sep=r'\s+')

# Fail early on inconsistent inputs instead of deep inside model fitting.
assert len(x_train) == len(y_train), "x_train and y_train differ in number of rows"
assert x_train.shape[1] == x_test.shape[1], "x_train and x_test differ in number of columns"

# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Apply the BorutaPy selector defined above in this cell.
# NOTE: that class is a placeholder whose fit() keeps every feature, so the
# "filtered" matrices below contain all original columns.
boruta_selector = BorutaPy(xgb_model, n_estimators='auto', random_state=42)
boruta_selector.fit(x_train.values, y_train.values)

# Filter the selected features for the training and test set
x_train_filtered = boruta_selector.transform(x_train.values)
x_test_filtered = boruta_selector.transform(x_test.values)

# Optuna for hyperparameter optimization
def objective(trial):
    """Optuna objective: mean stratified 5-fold ROC-AUC of an XGBoost classifier.

    Samples one hyperparameter set from ``trial``, fits a fresh classifier on
    each training fold of the module-level ``x_train_filtered`` / ``y_train``
    and scores it on the held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 5 validation folds.

    Note:
        The validation fold is used both as the early-stopping ``eval_set``
        and for scoring, so the returned value is somewhat optimistic.
    """
    params = {
        # suggest_float(..., log=True) is the replacement for the deprecated
        # suggest_loguniform; parameter names and ranges are unchanged.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.1]),
        'n_estimators': 1000,
        'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6, 7, 8, 9]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eval_metric': 'logloss',
        # Early stopping is configured on the estimator (XGBoost >= 1.6);
        # passing it to fit() is no longer accepted by current releases.
        # `use_label_encoder` is omitted: it is obsolete in those releases.
        'early_stopping_rounds': 50,
    }
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    roc_aucs = []
    for train_index, val_index in strat_k_fold.split(x_train_filtered, y_train):
        X_train, X_val = x_train_filtered[train_index], x_train_filtered[val_index]
        Y_train, Y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        # New estimator per fold so no fitted state is carried between folds.
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        roc_aucs.append(roc_auc_score(Y_val, preds))

    return float(np.mean(roc_aucs))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
best_params = study.best_params

# Train the final model.
# study.best_params only holds the values sampled through trial.suggest_*;
# the constants fixed inside objective() are not part of it. Re-apply them so
# the final model matches the tuned configuration instead of silently falling
# back to library defaults (e.g. the default number of trees).
# NOTE(review): the CV runs used early stopping, so 1000 rounds is the upper
# bound that was evaluated - confirm this is the intended final tree count.
final_params = {
    'n_estimators': 1000,
    'random_state': 42,
    'eval_metric': 'logloss',
    **best_params,
}
best_model = xgb.XGBClassifier(**final_params)
best_model.fit(x_train_filtered, y_train)

# Predict positive-class probability on the test set
test_preds = best_model.predict_proba(x_test_filtered)[:, 1]

# Select the top 1000 customers likely to accept the offer.
# argsort is ascending, so the tail holds the highest scores.
TOP_N = 1000
top_customers = np.argsort(test_preds)[-TOP_N:]

# Save indices of the selected customers
np.savetxt('1233456_obs.txt', top_customers, fmt='%d')

print("Optimization complete. Best ROC-AUC:", study.best_value)
print("Best hyperparameters:", best_params)


[I 2024-05-14 19:46:00,737] A new study created in memory with name: no-name-244e091c-baca-46df-9b0b-2d16335f3ad8
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[I 2024-05-14 19:47:41,078] Trial 0 finished with value: 0.7105378793867174 and parameters: {'lambda': 0.017504759865604645, 'alpha': 0.06620111148731506, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 69}. Best is trial 0 with value: 0.7105378793867174.
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[I 2024-05-14 19:48:04,738] Trial 1 finished with value: 0.6333236390801563 and parameters: {'lambda': 0.6899879009542048, 'alpha': 1.9510410638727897, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 273}. Best is trial 0 with value: 0.7105378793867174.
  'lambda': trial.suggest_loguni

Optimization complete. Best ROC-AUC: 0.7345967580126322
Best hyperparameters: {'lambda': 0.10136924939707018, 'alpha': 0.01797208332194037, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 18}


In [30]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from scipy.stats import binom

from boruta_py import BorutaPy

# Load the data (headerless, whitespace-delimited text files).
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
x_train = pd.read_csv('x_train.txt', header=None, sep=r'\s+')
y_train = pd.read_csv('y_train.txt', header=None, sep=r'\s+')[0]
x_test = pd.read_csv('x_test.txt', header=None, sep=r'\s+')

# Fail early on inconsistent inputs instead of deep inside model fitting.
assert len(x_train) == len(y_train), "x_train and y_train differ in number of rows"
assert x_train.shape[1] == x_test.shape[1], "x_train and x_test differ in number of columns"

# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Apply Boruta feature selection using the BorutaPy class imported from boruta_py
boruta_selector = BorutaPy(xgb_model, n_estimators='auto', random_state=42)
boruta_selector.fit(x_train.values, y_train.values)

# Filter the selected features for the training and test set
x_train_filtered = boruta_selector.transform(x_train.values)
x_test_filtered = boruta_selector.transform(x_test.values)

# Optuna for hyperparameter optimization
def objective(trial):
    """Optuna objective: mean stratified 5-fold ROC-AUC of an XGBoost classifier.

    Samples one hyperparameter set from ``trial``, fits a fresh classifier on
    each training fold of the module-level ``x_train_filtered`` / ``y_train``
    and scores it on the held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 5 validation folds.

    Note:
        The validation fold is used both as the early-stopping ``eval_set``
        and for scoring, so the returned value is somewhat optimistic.
    """
    params = {
        # suggest_float(..., log=True) is the replacement for the deprecated
        # suggest_loguniform; parameter names and ranges are unchanged.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.1]),
        'n_estimators': 1000,
        'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6, 7, 8, 9]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eval_metric': 'logloss',
        # Early stopping is configured on the estimator (XGBoost >= 1.6);
        # passing it to fit() is no longer accepted by current releases.
        # `use_label_encoder` is omitted: it is obsolete in those releases.
        'early_stopping_rounds': 50,
    }
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    roc_aucs = []
    for train_index, val_index in strat_k_fold.split(x_train_filtered, y_train):
        X_train, X_val = x_train_filtered[train_index], x_train_filtered[val_index]
        Y_train, Y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        # New estimator per fold so no fitted state is carried between folds.
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        roc_aucs.append(roc_auc_score(Y_val, preds))

    return float(np.mean(roc_aucs))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
best_params = study.best_params

# Train the final model.
# study.best_params only holds the values sampled through trial.suggest_*;
# the constants fixed inside objective() are not part of it. Re-apply them so
# the final model matches the tuned configuration instead of silently falling
# back to library defaults (e.g. the default number of trees).
# NOTE(review): the CV runs used early stopping, so 1000 rounds is the upper
# bound that was evaluated - confirm this is the intended final tree count.
final_params = {
    'n_estimators': 1000,
    'random_state': 42,
    'eval_metric': 'logloss',
    **best_params,
}
best_model = xgb.XGBClassifier(**final_params)
best_model.fit(x_train_filtered, y_train)

# Predict positive-class probability on the test set
test_preds = best_model.predict_proba(x_test_filtered)[:, 1]

# Select the top 1000 customers likely to accept the offer.
# argsort is ascending, so the tail holds the highest scores.
TOP_N = 1000
top_customers = np.argsort(test_preds)[-TOP_N:]

# Save indices of the selected customers
np.savetxt('1233456555_obs.txt', top_customers, fmt='%d')

print("Optimization complete. Best ROC-AUC:", study.best_value)
print("Best hyperparameters:", best_params)


[I 2024-05-14 21:01:12,626] A new study created in memory with name: no-name-2f121830-bcbb-4898-990f-661c78f67c17
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[I 2024-05-14 21:01:14,252] Trial 0 finished with value: 0.651274489232357 and parameters: {'lambda': 0.0023666504980943814, 'alpha': 9.652819242173145, 'colsample_bytree': 0.7, 'subsample': 0.9, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 251}. Best is trial 0 with value: 0.651274489232357.
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
[I 2024-05-14 21:01:15,497] Trial 1 finished with value: 0.6428276566434266 and parameters: {'lambda': 0.01996851675660698, 'alpha': 1.2262117365125846, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 213}. Best is trial 0 with value: 0.651274489232357.
  'lambda': trial.suggest_logunifo

Optimization complete. Best ROC-AUC: 0.7484199986399945
Best hyperparameters: {'lambda': 6.4149865204130565, 'alpha': 0.24487047801380527, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 7, 'min_child_weight': 4}
