In [22]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

In [23]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [24]:
# load data
train_data = pd.read_csv('../data/preprocessed_train.csv')
test_data = pd.read_csv('../data/preprocessed_test.csv')

# train_data = pd.read_csv('../data/preprocessed_train_CorrDrop.csv')
# test_data = pd.read_csv('../data/preprocessed_test_CorrDrop.csv')

In [25]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [26]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [27]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
# low_class = train_data[train_labels == 0]
# medium_class = train_data[train_labels == 1]
# high_class = train_data[train_labels == 2]

# # Get the number of samples in each class
# low_class_count = len(low_class)
# medium_class_count = len(medium_class)
# high_class_count = len(high_class)

# # Set the number of samples to be selected from each class
# num_samples = min(low_class_count, medium_class_count, high_class_count)
# num_samples_m = int(num_samples * 1.0)

# # Randomly sample data from each class
# low_class_sample = low_class.sample(n=num_samples, random_state=seed)
# medium_class_sample = medium_class.sample(n=num_samples_m, random_state=seed)
# high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# # Concatenate the sampled data
# train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# # Separate the features and target variable
# X_sampled = train_data_sampled
# y_sampled = train_labels.loc[train_data_sampled.index]

# # Display the count of unique values in the target variable
# print(y_sampled.value_counts())

In [28]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.1, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
# X_train, y_train = X_sampled, y_sampled
# X_valid, y_valid = X_sampled, y_sampled
X_train, y_train = train_data, train_labels
X_valid, y_valid = train_data, train_labels

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (112569, 20)
X_valid shape: (112569, 20)
y_train shape: (112569,)
y_valid shape: (112569,)


## XGBoost

In [32]:
# initialize the models
# params = {
#     'n_estimators': 200,
#     'learning_rate': 0.2,
#     'max_depth': 10,
#     'min_child_weight': 3,
#     'subsample': 0.6,
#     'random_state': seed
# }

from sklearn.utils import class_weight
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=train_labels
)

xgb = XGBClassifier(
    n_estimators=200,
    random_state=seed,
    class_weights=[3, 1, 3]
)

# fit the model
xgb.fit(X_train, y_train)

# make predictions
train_preds = xgb.predict(X_train)
valid_preds = xgb.predict(X_valid)

# calculate the f1 score
train_f1 = f1_score(y_train, train_preds, average='macro')
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# calculate the accuracy
train_accuracy = accuracy_score(y_train, train_preds)
valid_accuracy = accuracy_score(y_valid, valid_preds)

# display the f1 score and accuracy
print(f"Train F1 Score: {train_f1}")
print(f"Validation F1 Score: {valid_f1}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {valid_accuracy}")

Parameters: { "class_weights" } are not used.



Train F1 Score: 0.5258120047187965
Validation F1 Score: 0.5258120047187965
Train Accuracy: 0.6916113672503087
Validation Accuracy: 0.6916113672503087


In [9]:
# make predictinos on test data
test_preds = xgb.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('../data/output/xgboost.csv', index=False)

## Hyperparameter Optimization

In [9]:
cv_params = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
}

csv = GridSearchCV(
    estimator=XGBClassifier(random_state=seed),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1, 
    verbose=2
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ................learning_rate=0.1, n_estimators=100; total time=   7.9s
[CV] END ................learning_rate=0.1, n_estimators=100; total time=   7.8s
[CV] END ................learning_rate=0.1, n_estimators=100; total time=   7.8s
[CV] END ................learning_rate=0.1, n_estimators=100; total time=   7.9s
[CV] END ................learning_rate=0.1, n_estimators=100; total time=   8.2s
[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   8.9s
[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   8.8s
[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   8.9s
[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   9.0s
[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   9.1s
[CV] END ................learning_rate=0.1, n_estimators=200; total time=  14.4s
[CV] END ................learning_rate=0.1, n_es

In [10]:
cv_params = {
    'max_depth': [ 2, 3, 5,  7],
    'min_child_weight': [1, 2, 3, 4]
}

fixed_params = {
    'n_estimators': 400,
    'learning_rate': 0.5, 
    'random_state': seed
}

csv = GridSearchCV(
    estimator=XGBClassifier(**fixed_params),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1, 
    verbose=2
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ....................max_depth=2, min_child_weight=1; total time=  13.9s
[CV] END ....................max_depth=2, min_child_weight=1; total time=  14.0s
[CV] END ....................max_depth=2, min_child_weight=2; total time=  14.0s
[CV] END ....................max_depth=2, min_child_weight=1; total time=  14.2s
[CV] END ....................max_depth=2, min_child_weight=3; total time=  14.0s
[CV] END ....................max_depth=2, min_child_weight=3; total time=  14.0s
[CV] END ....................max_depth=2, min_child_weight=1; total time=  14.3s
[CV] END ....................max_depth=2, min_child_weight=2; total time=  14.1s
[CV] END ....................max_depth=2, min_child_weight=4; total time=  14.0s
[CV] END ....................max_depth=2, min_child_weight=2; total time=  14.3s
[CV] END ....................max_depth=2, min_child_weight=1; total time=  14.4s
[CV] END ....................max_depth=2, min_ch

In [11]:
cv_params = {
    'subsample': [0.6, 0.8, 0.9, 1.0],
    'max_delta_step': [0, 1, 2, 3, 4]
}

fixed_params = {
    'n_estimators': 400,
    'learning_rate': 0.5,
    'max_depth': 7,
    'min_child_weight': 1,
    'random_state': seed
}

csv = GridSearchCV(
    estimator=XGBClassifier(**fixed_params),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1, 
    verbose=2
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END ......................max_delta_step=0, subsample=9; total time=   0.2s
[CV] END ......................max_delta_step=0, subsample=9; total time=   0.2s
[CV] END ......................max_delta_step=0, subsample=9; total time=   0.3s
[CV] END ......................max_delta_step=0, subsample=9; total time=   0.3s
[CV] END ......................max_delta_step=0, subsample=9; total time=   0.3s
[CV] END ......................max_delta_step=0, subsample=0; total time=  11.8s
[CV] END ......................max_delta_step=0, subsample=0; total time=  12.2s
[CV] END ......................max_delta_step=0, subsample=0; total time=  12.3s
[CV] END ......................max_delta_step=0, subsample=0; total time=  12.4s
[CV] END ......................max_delta_step=0, subsample=0; total time=  12.5s
[CV] END ......................max_delta_step=1, subsample=9; total time=   0.3s
[CV] END ......................max_delta_step=1

25 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/xgboost/sklearn.py", line 1531, in fit
    self._Booster = train(
                    ^^^^^^
  File "/export/home/darpan/anaconda3/envs/env_1

{'max_delta_step': 0, 'subsample': 0.6}


In [12]:
final_params = {
    'n_estimators': 400,
    'learning_rate': 0.5,
    'max_depth': 7,
    'min_child_weight': 1,
    'subsample': 0.6,
    'max_delta_step': 0,
    'random_state': seed,
    'class_weight': {0: 0.2, 1: 0.6, 2: 0.2}
}

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(**final_params)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions on the training set
train_preds = xgb_model.predict(X_train)
train_f1 = f1_score(y_train, train_preds, average='macro')
train_accuracy = accuracy_score(y_train, train_preds)

# Make predictions on the validation set
valid_preds = xgb_model.predict(X_valid)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')
valid_accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 scores
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")

Parameters: { "class_weight" } are not used.



Training F1 Score: 0.9196
Validation F1 Score: 0.3822
Training Accuracy: 0.9327
Validation Accuracy: 0.5550


In [13]:
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Initialize the model
xgb = XGBClassifier(**final_params)

f1 = []
acc = []

for (train, test), i  in zip(cv.split(X_train, y_train), range(n_splits)):
    xgb.fit(X_train.iloc[train], y_train.iloc[train])

    train_preds = xgb.predict(X_train.iloc[train])
    valid_preds = xgb.predict(X_train.iloc[test])

    train_accuracy = accuracy_score(y_train.iloc[train], train_preds)
    train_f1 = f1_score(y_train.iloc[train], train_preds, average='macro')

    valid_accuracy = accuracy_score(y_train.iloc[test], valid_preds)
    valid_f1 = f1_score(y_train.iloc[test], valid_preds, average='macro')

    f1.append(valid_f1)
    acc.append(valid_accuracy)

    print(f"Fold {i + 1}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Training F1 Score: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}")
    print(f"Validation F1 Score: {valid_f1:.4f}") 

print(f"Mean F1 Score: {np.mean(f1):.4f}")
print(f"Mean Accuracy: {np.mean(acc):.4f}")

Parameters: { "class_weight" } are not used.



Fold 1
Training Accuracy: 0.9493
Training F1 Score: 0.9402
Validation Accuracy: 0.5505
Validation F1 Score: 0.3765


Parameters: { "class_weight" } are not used.



Fold 2
Training Accuracy: 0.9483
Training F1 Score: 0.9389
Validation Accuracy: 0.5525
Validation F1 Score: 0.3845


Parameters: { "class_weight" } are not used.



Fold 3
Training Accuracy: 0.9482
Training F1 Score: 0.9388
Validation Accuracy: 0.5586
Validation F1 Score: 0.3933


Parameters: { "class_weight" } are not used.



Fold 4
Training Accuracy: 0.9482
Training F1 Score: 0.9388
Validation Accuracy: 0.5586
Validation F1 Score: 0.3972


Parameters: { "class_weight" } are not used.



Fold 5
Training Accuracy: 0.9467
Training F1 Score: 0.9369
Validation Accuracy: 0.5592
Validation F1 Score: 0.3887


Parameters: { "class_weight" } are not used.



Fold 6
Training Accuracy: 0.9486
Training F1 Score: 0.9393
Validation Accuracy: 0.5577
Validation F1 Score: 0.3856


Parameters: { "class_weight" } are not used.



Fold 7
Training Accuracy: 0.9488
Training F1 Score: 0.9395
Validation Accuracy: 0.5523
Validation F1 Score: 0.3861


Parameters: { "class_weight" } are not used.



Fold 8
Training Accuracy: 0.9481
Training F1 Score: 0.9386
Validation Accuracy: 0.5532
Validation F1 Score: 0.3887


Parameters: { "class_weight" } are not used.



Fold 9
Training Accuracy: 0.9496
Training F1 Score: 0.9406
Validation Accuracy: 0.5530
Validation F1 Score: 0.3807


Parameters: { "class_weight" } are not used.



KeyboardInterrupt: 

In [24]:
# test data
test_preds = xgb.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('../data/output/xgboost.csv', index=False)

In [25]:
# unique values in the test predictions
print(submission['Target'].value_counts())

Target
medium    7063
low       4511
high      4347
Name: count, dtype: int64


- for num_samples_m = 1.2 * num_samples, on trian               
Params: -               
final_params = {
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': 0.8,
    'max_delta_step': 1,
    'random_state': seed
}

- for num_samples_m = * num_samples, on train               
Params: -                   
final_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': 1.0,
    'max_delta_step': 0,
    'random_state': seed
}

- for num_samples_m = 0.85 * num_samples, on train               
Params: -          (got 0.400 score)          
final_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'max_delta_step': 1,
    'random_state': seed
}

- for num_samples_m = num_samples, on train , CorrDrop data        
Params: -               
final_params = {
    'n_estimators': 400,
    'learning_rate': 0.01,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.6,
    'max_delta_step': 1,
    'random_state': seed
}