In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Reproducibility: one shared seed for NumPy's legacy global RNG and for the
# standard-library `random` module. `seed` is also passed to the models and
# to the CV splitter in later cells.
seed = 42
np.random.seed(seed)
random.seed(seed)

In [3]:
# Load the preprocessed train/test tables from the `_KNNim` files
# (presumably the KNN-imputed variant -- confirm with the preprocessing step).
# Alternative inputs that were used here previously:
#   ../data/preprocessed_train.csv        / ../data/preprocessed_test.csv
#   ../data/preprocessed_train_smote.csv  / ../data/preprocessed_test_smote.csv
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_train_KNNim.csv',
        '../data/preprocessed_test_KNNim.csv',
    )
)

In [4]:
# Use 'UID' as the row index of both frames, so it is kept as an identifier
# (the submission cell reads it back from test_data.index) and is not part of
# the feature columns.
#
# Reassigning behind an index-name guard keeps this cell safe to re-run: with
# inplace=True a second run raised KeyError because 'UID' was no longer a
# column. A frame that has no 'UID' at all still raises KeyError, as before.
if train_data.index.name != 'UID':
    train_data = train_data.set_index('UID')

if test_data.index.name != 'UID':
    test_data = test_data.set_index('UID')

In [5]:
# Integer encoding of the 'Target' label
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Encode the labels. Series.map turns every value that is missing from the
# mapping (including missing labels) into NaN, so fail loudly here instead of
# passing NaN labels on to the models.
train_labels = train_data['Target'].map(target_mapping)
unmapped = train_labels.isna()
if unmapped.any():
    raise ValueError(
        f"Unmapped 'Target' values: {train_data.loc[unmapped, 'Target'].unique().tolist()}"
    )

# Keep only the feature columns in the training frame
train_data = train_data.drop(columns='Target')

In [6]:
# apply pca to reduce the number of features
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)
# train_data = pd.DataFrame(pca.fit_transform(train_data), index=train_data.index)
# test_data = pd.DataFrame(pca.transform(test_data), index=test_data.index)

In [7]:
# Build a class-balanced training set with 22514 samples from each class
# (disabled: every statement in this cell is commented out)

# Separate the data based on the target classes
# low_class = train_data[train_labels == 0]
# medium_class = train_data[train_labels == 1]
# high_class = train_data[train_labels == 2]

# # Get the number of samples in each class
# low_class_count = len(low_class)
# medium_class_count = len(medium_class)
# high_class_count = len(high_class)

# # Set the number of samples to be selected from each class
# num_samples = min(low_class_count, medium_class_count, high_class_count)

# # Randomly sample data from each class
# low_class_sample = low_class.sample(n=num_samples, random_state=seed)
# medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
# high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# # Concatenate the sampled data
# train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# # Separate the features and target variable
# X_sampled = train_data_sampled
# y_sampled = train_labels.loc[train_data_sampled.index]

# # Display the count of unique values in the target variable
# print(y_sampled.value_counts())

In [8]:
# Train on the full training set. X_valid / y_valid alias the very same
# objects, so any "valid" score computed from them is a training-set score;
# held-out estimates come from the cross-validation cell.
# For a real hold-out split use, e.g.:
#   X_train, X_valid, y_train, y_valid = train_test_split(
#       train_data, train_labels, test_size=0.1, random_state=seed)
X_train = X_valid = train_data
y_train = y_valid = train_labels

# Report the shapes of the four partitions
print(
    f"X_train shape: {X_train.shape}",
    f"X_valid shape: {X_valid.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_valid shape: {y_valid.shape}",
    sep="\n",
)

X_train shape: (112569, 20)
X_valid shape: (112569, 20)
y_train shape: (112569,)
y_valid shape: (112569,)


In [9]:
# Inverse-frequency weight per encoded class: 1 / (number of rows carrying
# that label).
# NOTE(review): the model cells visible in this notebook pass a hard-coded
# class_weights=[3, 1, 3] rather than this dict.
class_weights = dict(train_labels.value_counts().rdiv(1))

## CatBoost

In [10]:
# CatBoost model: 500 iterations at learning rate 0.5, with per-class weights
# [3, 1, 3]. (An earlier configuration used n_estimators=1000 with the default
# learning rate and the same class weights.)
cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.5,
    random_seed=seed,
    class_weights=[3, 1, 3],
)

# Train, then predict on both partitions
cat.fit(X_train, y_train)
train_preds, valid_preds = cat.predict(X_train), cat.predict(X_valid)

# Accuracy and macro-averaged F1 for each partition
train_accuracy = accuracy_score(y_train, train_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')
valid_accuracy = accuracy_score(y_valid, valid_preds)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Report the scores
print(
    f"Train F1 Score: {train_f1}",
    f"Valid F1 Score: {valid_f1}",
    f"Train Accuracy: {train_accuracy}",
    f"Valid Accuracy: {valid_accuracy}",
    sep="\n",
)

0:	learn: 1.0816545	total: 82.5ms	remaining: 41.1s
1:	learn: 1.0745463	total: 110ms	remaining: 27.4s
2:	learn: 1.0694174	total: 137ms	remaining: 22.7s
3:	learn: 1.0652277	total: 164ms	remaining: 20.3s
4:	learn: 1.0630164	total: 189ms	remaining: 18.7s
5:	learn: 1.0609549	total: 217ms	remaining: 17.8s
6:	learn: 1.0597623	total: 243ms	remaining: 17.1s
7:	learn: 1.0590102	total: 267ms	remaining: 16.4s
8:	learn: 1.0577009	total: 291ms	remaining: 15.9s
9:	learn: 1.0561003	total: 316ms	remaining: 15.5s
10:	learn: 1.0549942	total: 342ms	remaining: 15.2s
11:	learn: 1.0541923	total: 367ms	remaining: 14.9s
12:	learn: 1.0536517	total: 391ms	remaining: 14.6s
13:	learn: 1.0530011	total: 412ms	remaining: 14.3s
14:	learn: 1.0525697	total: 435ms	remaining: 14.1s
15:	learn: 1.0519268	total: 458ms	remaining: 13.8s
16:	learn: 1.0509342	total: 483ms	remaining: 13.7s
17:	learn: 1.0502182	total: 507ms	remaining: 13.6s
18:	learn: 1.0494119	total: 531ms	remaining: 13.4s
19:	learn: 1.0486381	total: 553ms	remain

In [10]:
# Make predictions on the test data and flatten them to a 1D array
test_preds = cat.predict(test_data).ravel()

# Decode the integer predictions back to the original target labels.
# The inverse mapping gets its own name: overwriting target_mapping (as this
# cell did before) flipped the dict on every run, so a second run mapped
# every prediction to NaN and wrote those NaNs to the submission file.
inverse_target_mapping = {code: label for label, code in target_mapping.items()}
test_preds = pd.Series(test_preds).map(inverse_target_mapping)

# Build the submission table: one row per test UID with its predicted label.
# test_preds carries a default positional index, so rows pair up by position.
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# Write the submission file (the output directory must already exist)
submission.to_csv('../data/output/catboost.csv', index=False)

In [9]:
# Stratified, shuffled 10-fold cross-validation of the CatBoost model
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Model under evaluation (an earlier variant used the default iteration count
# and learning rate with the same class weights)
cat = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    random_seed=seed,
    class_weights=[3, 1, 3],
)

# Held-out macro-F1 / accuracy collected per fold
f1, acc = [], []

for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # Positional row indices -> the fold's fitting and held-out parts
    X_fit, y_fit = X_train.iloc[train], y_train.iloc[train]
    X_held, y_held = X_train.iloc[test], y_train.iloc[test]

    # The same estimator object is fit again on every fold
    cat.fit(X_fit, y_fit)

    train_preds = cat.predict(X_fit)
    valid_preds = cat.predict(X_held)

    train_accuracy = accuracy_score(y_fit, train_preds)
    valid_accuracy = accuracy_score(y_held, valid_preds)

    train_f1 = f1_score(y_fit, train_preds, average='macro')
    valid_f1 = f1_score(y_held, valid_preds, average='macro')

    # Only the held-out scores are kept for the summary
    f1.append(valid_f1)
    acc.append(valid_accuracy)

    print(f"Fold {i + 1} - Train F1 Score: {train_f1} - Valid F1 Score: {valid_f1}")
    print(f"Fold {i + 1} - Train Accuracy: {train_accuracy} - Valid Accuracy: {valid_accuracy}")
    

0:	learn: 1.0943803	total: 80ms	remaining: 2m 39s
1:	learn: 1.0904040	total: 106ms	remaining: 1m 46s
2:	learn: 1.0869367	total: 132ms	remaining: 1m 28s
3:	learn: 1.0842069	total: 158ms	remaining: 1m 19s
4:	learn: 1.0817139	total: 183ms	remaining: 1m 13s
5:	learn: 1.0799803	total: 207ms	remaining: 1m 8s
6:	learn: 1.0784039	total: 232ms	remaining: 1m 6s
7:	learn: 1.0769801	total: 257ms	remaining: 1m 3s
8:	learn: 1.0753774	total: 283ms	remaining: 1m 2s
9:	learn: 1.0741357	total: 307ms	remaining: 1m 1s
10:	learn: 1.0730172	total: 331ms	remaining: 59.8s
11:	learn: 1.0719459	total: 358ms	remaining: 59.3s
12:	learn: 1.0710112	total: 381ms	remaining: 58.2s
13:	learn: 1.0700917	total: 404ms	remaining: 57.3s
14:	learn: 1.0692747	total: 426ms	remaining: 56.4s
15:	learn: 1.0684400	total: 451ms	remaining: 55.9s
16:	learn: 1.0678594	total: 472ms	remaining: 55.1s
17:	learn: 1.0673262	total: 495ms	remaining: 54.5s
18:	learn: 1.0666713	total: 519ms	remaining: 54.1s
19:	learn: 1.0661155	total: 543ms	rem

In [10]:
# Average of the per-fold held-out macro-F1 scores collected in `f1`
mean_f1 = np.mean(f1)
print(f"Mean F1 Score: {mean_f1}")

Mean F1 Score: 0.42809372212357816


## Hyperparameter Tuning

In [11]:
# Search grid: number of trees x tree depth (4 x 3 = 12 candidates)
params = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[2, 3, 5],
)

# Base estimator; the grid supplies n_estimators and max_depth
cat = CatBoostClassifier(random_seed=seed, class_weights=[3, 1, 3])

# Exhaustive search scored by macro-F1 with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=cat,
    param_grid=params,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search
grid_search.fit(X_train, y_train)

# Show the best parameter combination found
print(grid_search.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Learning rate set to 0.384233
Learning rate set to 0.384233
Learning rate set to 0.213907
Learning rate set to 0.384233
Learning rate set to 0.272771
Learning rate set to 0.272771
Learning rate set to 0.177148
Learning rate set to 0.213907
Learning rate set to 0.213907
0:	learn: 1.0925063	total: 220ms	remaining: 1m 5s
Learning rate set to 0.272771
Learning rate set to 0.384233
0:	learn: 1.0906092	total: 323ms	remaining: 1m 4s
1:	learn: 1.0854649	total: 359ms	remaining: 35.5s
0:	learn: 1.0905735	total: 474ms	remaining: 1m 34s
2:	learn: 1.0828017	total: 433ms	remaining: 28.4s
0:	learn: 1.0944247	total: 209ms	remaining: 1m 44s
1:	learn: 1.0883406	total: 493ms	remaining: 1m 13s
Learning rate set to 0.272771
0:	learn: 1.0905686	total: 523ms	remaining: 1m 44s
0:	learn: 1.0936591	total: 462ms	remaining: 3m 4s
0:	learn: 1.0925434	total: 387ms	remaining: 1m 55s
2:	learn: 1.0861927	total: 591ms	remaining: 58.5s
Learning rate set to 0.2

In [13]:
# Search grid: L2 leaf regularisation strength only
params = dict(l2_leaf_reg=[1, 3, 5, 7])

# Base estimator with the tree count and depth fixed at 400 and 3
cat = CatBoostClassifier(
    n_estimators=400,
    max_depth=3,
    random_seed=seed,
    class_weights=[3, 1, 3],
)

# Exhaustive search scored by macro-F1 with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=cat,
    param_grid=params,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search
grid_search.fit(X_train, y_train)

# Show the best parameter combination found
print(grid_search.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
0:	learn: 1.0977685	total: 72.3ms	remaining: 28.8s
1:	learn: 1.0969568	total: 114ms	remaining: 22.6s
2:	learn: 1.0960745	total: 146ms	remaining: 19.3s
3:	learn: 1.0952740	total: 189ms	remaining: 18.7s
0:	learn: 1.0977683	total: 153ms	remaining: 1m 1s
4:	learn: 1.0944703	total: 246ms	remaining: 19.4s
0:	learn: 1.0977702	total: 187ms	remaining: 1m 14s
0:	learn: 1.0977735	total: 236ms	remaining: 1m 34s
1:	learn: 1.0969595	total: 257ms	remaining: 51.1s
5:	learn: 1.0937459	total: 446ms	remaining: 29.3s
1:	learn: 1.0969189	total: 401ms	remaining: 1m 19s
6:	learn: 1.0930380	total: 486ms	remaining: 27.3s
2:	learn: 1.0960784	total: 346ms	remaining: 45.8s
1:	learn: 1.0969395	total: 353ms	remaining: 1m 10s
2:	learn: 1.0960855	total: 458ms	remaining: 1m
2:	learn: 1.0961758	total: 433ms	remaining: 57.3s
0:	learn: 1.0977692	total: 193ms	remaining: 1m 17s
0:	learn: 1.0977770	total: 152ms	remaining: 1m
3:	learn: 1.0952785	total: 465ms	remaini