In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

# decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# set seed, 42
seed = 23
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
train_data = pd.read_csv('../data/preprocessed_train.csv')
test_data = pd.read_csv('../data/preprocessed_test.csv')

# train_data = pd.read_csv('../data/preprocessed_train_CorrDrop.csv')
# test_data = pd.read_csv('../data/preprocessed_test_CorrDrop.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [6]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
low_class = train_data[train_labels == 0]
medium_class = train_data[train_labels == 1]
high_class = train_data[train_labels == 2]

# Get the number of samples in each class
low_class_count = len(low_class)
medium_class_count = len(medium_class)
high_class_count = len(high_class)

# Set the number of samples to be selected from each class
num_samples = min(low_class_count, medium_class_count, high_class_count)
num_samples_m = int(num_samples*1.0)

# Randomly sample data from each class
low_class_sample = low_class.sample(n=num_samples, random_state=seed)
medium_class_sample = medium_class.sample(n=num_samples_m, random_state=seed)
high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# Concatenate the sampled data
train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# Separate the features and target variable
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Display the count of unique values in the target variable
print(y_sampled.value_counts())

Target
0    22514
1    22514
2    22514
Name: count, dtype: int64


In [7]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.1, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
# X_train, y_train = X_sampled, y_sampled
# X_valid, y_valid = X_sampled, y_sampled

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (101312, 20)
X_valid shape: (11257, 20)
y_train shape: (101312,)
y_valid shape: (11257,)


## bagging

In [8]:
# decision tree
# clf = DecisionTreeClassifier(random_state=seed)
clf = LGBMClassifier(
    random_state=seed,
    n_jobs=1
)
# clf = RandomForestClassifier(
#     n_estimators=200, 
#     max_depth=10,
#     random_state=seed,
# )


# bagging_clf = BaggingClassifier(estimator=clf, n_estimators=100, random_state=seed, n_jobs=-1, bootstrap_features=True, oob_score=True)
bagging_clf = BaggingClassifier(estimator=clf, n_estimators=100, random_state=seed, n_jobs=-1, bootstrap_features=True)

# fit the model
bagging_clf.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1587
[LightGBM] [Info] Number of data points in the train set: 101312, number of used features: 20
[LightGBM] [Info] Start training from score -1.612225
[LightGBM] [Info] Start training from score -0.510549
[LightGBM] [Info] Start training from score -1.607485
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3377
[LightGBM] [Info] Number of data points in the train set: 101312, number of used features: 20
[LightGBM] [Info] Start training from score -1.617137
[LightGBM] [Info] Start training from score -0.508397
[LightGBM] [Info] Star

In [9]:
# cross validation
n_splits=10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

f1 = []
acc = []

for (train, test), i in zip(cv.split(X_train, y_train), range(n_splits)):
    bagging_clf.fit(X_train.iloc[train], y_train.iloc[train])
    
    train_preds = bagging_clf.predict(X_train.iloc[train])
    valid_preds = bagging_clf.predict(X_train.iloc[test])

    train_accuracy = accuracy_score(y_train.iloc[train], train_preds)
    train_f1 = f1_score(y_train.iloc[train], train_preds, average='macro')

    valid_accuracy = accuracy_score(y_train.iloc[test], valid_preds)
    valid_f1 = f1_score(y_train.iloc[test], valid_preds, average='macro')

    f1.append(valid_f1)
    acc.append(valid_accuracy)

    print(f"Fold {i+1}/{n_splits}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}, Validation F1: {valid_f1:.4f}")
    print("")
    

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2370
[LightGBM] [Info] Number of data points in the train set: 91180, number of used features: 20
[LightGBM] [Info] Start training from score -1.610261
[LightGBM] [Info] Start training from score -0.505121
[LightGBM] [Info] Start training from score -1.625914
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1747
[LightGBM] [Info] Number of data points in the train set: 91180, number of used features: 20
[LightGBM] [Info] Start training from score -1.609822
[LightGBM] [Info] Start training from score -0.511063
[LightGBM] [Info] Start 

KeyboardInterrupt: 

In [None]:
# print average f1 and accuracy
print(f"Average F1: {np.mean(f1):.4f}")
print(f"Average Accuracy: {np.mean(acc):.4f}")

Average F1: 0.4454
Average Accuracy: 0.4479


In [None]:
# test prediction
test_preds = bagging_clf.predict(test_data)

# map the target back to the original classes
target_mapping_rev = {0: 'low', 1: 'medium', 2: 'high'}
test_preds = pd.Series(test_preds).map(target_mapping_rev)

# Save the predictions to a CSV file
output = pd.DataFrame({'UID': test_data.index, 'Target': test_preds})
output.to_csv('../data/output/bagging_LGBM.csv', index=False)

In [None]:
# unique output
print(output['Target'].value_counts())

Target
medium    7246
low       4598
high      4077
Name: count, dtype: int64
