In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

In [2]:
# Fix the seed of every global RNG used in this notebook (stdlib `random`
# and NumPy) so that sampling and model fitting are repeatable.
seed = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

In [3]:
# Read the preprocessed train and test tables from the data directory.
train_path = '../data/preprocessed_train.csv'
test_path = '../data/preprocessed_test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [4]:
# Move 'UID' out of the columns and into the index of both frames.
# The test frame's index is read back later to build the submission file.
# NOTE: this mutates the frames in place, so the cell cannot be re-run
# without reloading the data first.
for frame in (train_data, test_data):
    frame.set_index('UID', inplace=True)

In [5]:
# Ordinal encoding of the 'Target' labels: low < medium < high.
target_mapping = dict(low=0, medium=1, high=2)

# Encode the labels into their own Series; any label missing from the
# mapping would become NaN here.
train_labels = train_data['Target'].map(target_mapping)

# Keep only the feature columns in the training frame.
train_data = train_data.drop(columns=['Target'])

In [6]:
# Balance the training set by undersampling every class down to the size of
# the smallest one (22514 rows per class in the recorded output below).

# Split the feature rows by encoded label: 0 = low, 1 = medium, 2 = high.
low_class, medium_class, high_class = (
    train_data[train_labels == code] for code in (0, 1, 2)
)

# Size of each class.
low_class_count, medium_class_count, high_class_count = map(
    len, (low_class, medium_class, high_class)
)

# Every class is cut down to the smallest class size.
num_samples = min(low_class_count, medium_class_count, high_class_count)

# Draw the same number of rows from each class with a fixed random state.
low_class_sample, medium_class_sample, high_class_sample = (
    frame.sample(n=num_samples, random_state=seed)
    for frame in (low_class, medium_class, high_class)
)

# Stack the three samples (low, then medium, then high).
train_data_sampled = pd.concat(
    [low_class_sample, medium_class_sample, high_class_sample]
)

# Features and the labels that belong to the sampled rows (matched by index).
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Sanity check: each class should appear the same number of times.
print(y_sampled.value_counts())

Target
0    22514
1    22514
2    22514
Name: count, dtype: int64


In [7]:
# Hold out 10% of the balanced sample for validation.
# (An earlier variant split the unbalanced train_data/train_labels with
# test_size=0.05 instead.)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.1,
    random_state=seed,
)

# Report the size of each part of the split.
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (60787, 20)
X_valid shape: (6755, 20)
y_train shape: (60787,)
y_valid shape: (6755,)


## XGBoost

In [8]:
# Baseline XGBoost model: 200 estimators, everything else left at defaults.
# Earlier trial kept for reference: n_estimators=200, learning_rate=0.2,
# max_depth=10, min_child_weight=3, subsample=0.6.
xgb = XGBClassifier(n_estimators=200, random_state=seed)
xgb.fit(X_train, y_train)

# Predict on both parts of the split.
train_preds = xgb.predict(X_train)
valid_preds = xgb.predict(X_valid)

# Micro-averaged F1. NOTE: for this single-label multiclass task the value
# coincides with accuracy (see the identical numbers in the output), whereas
# the tuning cells further down score with macro-F1.
train_f1, valid_f1 = (
    f1_score(truth, preds, average='micro')
    for truth, preds in ((y_train, train_preds), (y_valid, valid_preds))
)

# Plain accuracy on the same predictions.
train_accuracy, valid_accuracy = (
    accuracy_score(truth, preds)
    for truth, preds in ((y_train, train_preds), (y_valid, valid_preds))
)

# Report the metrics.
print(f"Train F1 Score: {train_f1}")
print(f"Validation F1 Score: {valid_f1}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {valid_accuracy}")

Train F1 Score: 0.7131788046786319
Validation F1 Score: 0.43375277572168763
Train Accuracy: 0.7131788046786319
Validation Accuracy: 0.43375277572168763


In [9]:
# Make predictions on the test data (encoded class labels).
test_preds = xgb.predict(test_data)

# Convert predictions back to the original target labels.
# Invert the lookup only while its keys are still the label strings: an
# unconditional inversion would flip the dict back to label -> code when this
# cell is run a second time, and every mapped prediction would become NaN.
# After this step `target_mapping` is always code -> label.
if all(isinstance(key, str) for key in target_mapping):
    target_mapping = {code: label for label, code in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# Build the submission table: one row per test UID, paired with the
# predictions by position (predictions are in the same order as test_data).
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# Write the submission file; the output directory must already exist.
submission.to_csv('../data/output/xgboost.csv', index=False)

## Hyperparameter Optimization

In [None]:
# Stage 1 of the search: n_estimators x learning_rate, scored by macro-F1
# with 5-fold cross-validation on the training split.
cv_params = dict(
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.1, 0.3, 0.5],
)

csv = GridSearchCV(
    XGBClassifier(random_state=seed),
    cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,  # use all available cores
)
csv.fit(X_train, y_train)

# Show the best parameter combination found.
best_params = csv.best_params_
print(best_params)

In [None]:
# Stage 2 of the search: max_depth x min_child_weight, with n_estimators and
# learning_rate held fixed.
fixed_params = dict(n_estimators=100, learning_rate=0.1, random_state=seed)

cv_params = dict(
    max_depth=list(range(2, 8)),         # 2..7
    min_child_weight=list(range(1, 5)),  # 1..4
)

# Same scoring and CV setup as the other search cells: macro-F1, 5 folds.
csv = GridSearchCV(
    XGBClassifier(**fixed_params),
    cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
csv.fit(X_train, y_train)

# Show the best parameter combination found.
best_params = csv.best_params_
print(best_params)

In [None]:
# Stage 3 of the search: subsample x max_delta_step, with the tree-shape and
# boosting settings held fixed.
cv_params = {
    # subsample is a row fraction and must lie in (0, 1]; the grid previously
    # read `0,9`, which expanded to the two invalid candidates 0 and 9.
    'subsample': [0.6, 0.8, 0.9, 1.0],
    'max_delta_step': [0, 1, 2, 3, 4]
}

# Parameters kept constant during this stage.
fixed_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 3,
    'random_state': seed
}

# Exhaustive search scored by macro-F1 with 5-fold cross-validation.
csv = GridSearchCV(
    estimator=XGBClassifier(**fixed_params),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

In [None]:
# Hyperparameters for the final model. These values are hard-coded here
# rather than read from the search objects above.
final_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=3,
    subsample=0.6,
    max_delta_step=1,
    random_state=seed,
)

# Build and fit the classifier in one step (fit returns the estimator).
xgb_model = XGBClassifier(**final_params).fit(X_train, y_train)

# Predict on the training and validation splits.
train_preds = xgb_model.predict(X_train)
valid_preds = xgb_model.predict(X_valid)

# Macro-averaged F1 for each split.
train_f1 = f1_score(y_train, train_preds, average='macro')
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Accuracy for each split.
train_accuracy = accuracy_score(y_train, train_preds)
valid_accuracy = accuracy_score(y_valid, valid_preds)

# Report the metrics rounded to four decimals.
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")