### Import Libs

In [25]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
from scipy.stats import shapiro, kruskal, chi2
from scipy.stats import zscore

import warnings
warnings.filterwarnings('ignore')

### Read data

In [26]:
path = os.getcwd()

In [27]:
train = pd.read_csv(path + '/train.csv',index_col=0)
X_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]
X_test = pd.read_csv(path + '/test.csv',index_col=0)
id_test = X_test.index #for the submission file

### EDA

In [28]:
# data types of the features
X_train.dtypes.value_counts()

int64      258
float64    111
Name: count, dtype: int64

In [29]:
# Check how many int columns are binary
int_columns = X_train.select_dtypes(include='int')

# Check which integer columns are binary (i.e., contain only values {0, 1}), and which are not
binary_columns = [col for col in int_columns.columns if set(int_columns[col].unique()).issubset({0, 1})]
non_binary_columns = [col for col in int_columns.columns if col not in binary_columns]

# Display binary integer columns
print(f"Number of Binary columns: {len(binary_columns)} \n Binary columns: {binary_columns} \n Number of Non Binary columns: {len(non_binary_columns)} \n Non Binary columns: {non_binary_columns}")

Number of Binary columns: 100 
 Binary columns: ['ind_var1_0', 'ind_var1', 'ind_var2_0', 'ind_var2', 'ind_var5_0', 'ind_var5', 'ind_var6_0', 'ind_var6', 'ind_var8_0', 'ind_var8', 'ind_var12_0', 'ind_var12', 'ind_var13_0', 'ind_var13_corto_0', 'ind_var13_corto', 'ind_var13_largo_0', 'ind_var13_largo', 'ind_var13_medio_0', 'ind_var13_medio', 'ind_var13', 'ind_var14_0', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var18_0', 'ind_var18', 'ind_var19', 'ind_var20_0', 'ind_var20', 'ind_var24_0', 'ind_var24', 'ind_var25_cte', 'ind_var26_0', 'ind_var26_cte', 'ind_var26', 'ind_var25_0', 'ind_var25', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var29_0', 'ind_var29', 'ind_var30_0', 'ind_var30', 'ind_var31_0', 'ind_var31', 'ind_var32_cte', 'ind_var32_0', 'ind_var32', 'ind_var33_0', 'ind_var33', 'ind_var34_0', 'ind_var34', 'ind_var37_cte', 'ind_var37_0', 'ind_var37', 'ind_var39_0', 'ind_var40_0', 'ind_var40', 'ind_var41_0', 'ind_var41', 'ind_var39', 'ind_var44_0', 'ind_var44', 'ind

### Feature Engineering

In [30]:
# remove constant columns
constant = []
for col in X_train.columns:
    if X_train[col].std() == 0:
        constant.append(col)

X_train.drop(constant, axis=1, inplace=True)
X_test.drop(constant, axis=1, inplace=True)

print(f'removed {len(constant)} constant columns')

removed 34 constant columns


In [31]:
# remove duplicated columns
duplicated_columns = []
cols = X_train.columns
for i in range(len(cols)-1):
    v = X_train[cols[i]].values
    for j in range(i+1,len(cols)):
        if np.array_equal(v,X_train[cols[j]].values):
            duplicated_columns.append(cols[j])

X_train.drop(duplicated_columns, axis=1, inplace=True)


print(f'removed {len(duplicated_columns)} duplicated columns')

removed 29 duplicated columns


In [32]:
# Function that applies statistical tests to identify important variables
def select_relevant_features(X_train, y_train, score_threshold=0.05):
    """
    Selects relevant features from X_train based on statistical tests and a score threshold.
    Uses Chi-square test for binary features and an appropriate test for continuous features 
    based on parametric or non-parametric distribution check.

    Parameters:
    X_train (pd.DataFrame): Feature matrix with int or float columns.
    y_train (pd.Series or np.array): Target variable array.
    score_threshold (float): Threshold for p-value or score relevance (default is 0.05 for p-value).

    Returns:
    List[str]: Names of relevant features based on the threshold.
    """
    # Separate binary (int) and continuous (float) features
    int_columns = X_train.select_dtypes(include='int')
    continuous_features = X_train.select_dtypes(include='float')
    
    # Step 2: Identify binary and non-binary integer columns
    binary_columns = [col for col in int_columns.columns if set(int_columns[col].unique()).issubset({0, 1})]
    non_binary_columns = [col for col in int_columns.columns if col not in binary_columns]
    
    # Chi-square test for binary features
    if binary_columns:
        selector_chi2 = SelectKBest(score_func=chi2, k='all')
        selector_chi2.fit(X_train[binary_columns], y_train)
        chi2_pvalues = pd.Series(selector_chi2.pvalues_, index=binary_columns)
    else:
        chi2_pvalues = pd.Series(dtype=float)

    # Parametric/Non-parametric check for continuous and non-binary integer features
    def is_parametric(series):
        # Shapiro-Wilk test for normality (use for sample sizes < 2000)
        stat, p_value = shapiro(series)
        return p_value > 0.05  # True if data is likely normal

    # Apply the check and select test accordingly
    f_pvalues = pd.Series(dtype=float)  # Initialize an empty series for storing p-values
    
    if not continuous_features.empty or non_binary_columns:
        combined_features = continuous_features.join(X_train[non_binary_columns])
        parametric_features = [col for col in combined_features.columns if is_parametric(combined_features[col])]
        non_parametric_features = [col for col in combined_features.columns if col not in parametric_features]

        # Parametric test (ANOVA F-test) for normally distributed features
        if parametric_features:
            selector_f = SelectKBest(score_func=f_classif, k='all')
            selector_f.fit(X_train[parametric_features], y_train)
            f_pvalues = f_pvalues.append(pd.Series(selector_f.pvalues_, index=parametric_features))

        # Non-parametric test (Kruskal-Wallis) for non-normally distributed features
        for col in non_parametric_features:
            _, p_value = kruskal(*[combined_features[col][y_train == cls] for cls in y_train.unique()])
            f_pvalues[col] = p_value

    # Combine p-values
    all_pvalues = pd.concat([chi2_pvalues, f_pvalues])
    
    # Filter features based on the score threshold (p-value in this case)
    relevant_features = all_pvalues[all_pvalues < score_threshold].index.tolist()

    print(f'non parametric: {len(non_parametric_features)}, \n parametric: {len(parametric_features)} ')
    
    return relevant_features

In [33]:
# apply the function
relevant_features = select_relevant_features(X_train, y_train, score_threshold=0.05)

print(f'relevant features count: {len(relevant_features)} \n relevant_features: {relevant_features}')

non parametric: 250, 
 parametric: 0 
relevant features count: 106 
 relevant_features: ['imp_op_var40_efect_ult1', 'imp_op_var40_efect_ult3', 'imp_op_var41_efect_ult1', 'imp_op_var41_efect_ult3', 'imp_op_var39_efect_ult1', 'imp_op_var39_efect_ult3', 'imp_sal_var16_ult1', 'saldo_var1', 'saldo_var5', 'saldo_var8', 'saldo_var12', 'saldo_var13_corto', 'saldo_var13_largo', 'saldo_var13', 'saldo_var14', 'saldo_var20', 'saldo_var24', 'saldo_var26', 'saldo_var25', 'saldo_var30', 'saldo_var31', 'saldo_var40', 'saldo_var42', 'delta_imp_aport_var13_1y3', 'delta_num_aport_var13_1y3', 'imp_aport_var13_hace3', 'imp_aport_var13_ult1', 'imp_trans_var37_ult1', 'saldo_medio_var5_hace2', 'saldo_medio_var5_hace3', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3', 'saldo_medio_var8_hace2', 'saldo_medio_var8_hace3', 'saldo_medio_var8_ult1', 'saldo_medio_var8_ult3', 'saldo_medio_var12_hace2', 'saldo_medio_var12_hace3', 'saldo_medio_var12_ult1', 'saldo_medio_var12_ult3', 'saldo_medio_var13_corto_hace2', 'sal

In [34]:
X_train = X_train.loc[:,relevant_features]
X_test = X_test.loc[:,relevant_features]

In [36]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Model Implementation

In [37]:
# XGBoost for Classification
xgb_model = XGBClassifier(eval_metric="auc")

# Grid Search to find the best parameters, without regularization
param_grid = {
    'max_depth': [1, 3, 5, 7],  
    'learning_rate': [0.01, 0.03, 0.1],  
    'n_estimators': [150, 350, 500, 800],  
}

# Initialize KFold
kf_splits = KFold(n_splits=6, shuffle=True, random_state=42)

# Grid search with KFold validation, and adequate scoring given the evaluation method of the problem
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring='roc_auc', cv=kf_splits, verbose=1, n_jobs=-1)

In [38]:
# Check scores when dividing training set into train and validation (before grid search)
scores = cross_val_score(xgb_model, X_train, y_train, cv=kf_splits, scoring="roc_auc")

# Print the AUC scores for each fold
print("AUC scores for each fold:", scores)
print("Mean AUC score:", np.mean(scores))

AUC scores for each fold: [0.82507157 0.79261619 0.83343576 0.81898829 0.83677116 0.82495463]
Mean AUC score: 0.8219729328982321


In [39]:
grid_search.fit(X_train, y_train)

Fitting 6 folds for each of 48 candidates, totalling 288 fits


In [40]:
best_model = grid_search.best_estimator_

In [41]:
best_model

In [42]:
y_pred_train = best_model.predict_proba(X_train)[:,1]

In [43]:
roc_auc_score(y_train,y_pred_train)

0.8557967186208679

Now let's add regularization to the grid search

In [44]:
reg = {
    'min_child_weight': [3, 5, 10],  
    'subsample': [0.9],  
    'colsample_bytree': [0.85],  
    'colsample_bylevel': [0.7, 0.9],  
    'min_child_weight': [1,3],  
    'reg_alpha': [0.01, 0.1],  
    'reg_lambda': [0.5, 1.0],  
    'gamma': [0.1, 0.3] 
}
params_reg = {key: [value] for key, value in grid_search.best_params_.items()}
params_reg.update(reg)

# grid search, adding regularization parameters
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params_reg, 
                           scoring='roc_auc', cv=kf_splits, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 6 folds for each of 32 candidates, totalling 192 fits


In [45]:
best_model2 = grid_search.best_estimator_
best_model2

In [46]:
y_pred_train2 = best_model2.predict_proba(X_train)[:,1]
roc_auc_score(y_train,y_pred_train2)

0.8558809800356338

In [47]:
y_pred_test= best_model2.predict_proba(X_test)[:,1]

In [48]:
final = pd.DataFrame({"ID":id_test, "TARGET":y_pred_test})
final.to_csv("final.csv", index=False)