In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
train_data = pd.read_csv('../data/preprocessed_train.csv')
test_data = pd.read_csv('../data/preprocessed_test.csv')

# train_data = pd.read_csv('../data/preprocessed_train_smote.csv')
# test_data = pd.read_csv('../data/preprocessed_test_smote.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [6]:
# apply pca to reduce the number of features
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
train_data = pd.DataFrame(pca.fit_transform(train_data), index=train_data.index)
test_data = pd.DataFrame(pca.transform(test_data), index=test_data.index)

In [7]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
low_class = train_data[train_labels == 0]
medium_class = train_data[train_labels == 1]
high_class = train_data[train_labels == 2]

# Get the number of samples in each class
low_class_count = len(low_class)
medium_class_count = len(medium_class)
high_class_count = len(high_class)

# Set the number of samples to be selected from each class
num_samples = min(low_class_count, medium_class_count, high_class_count)

# Randomly sample data from each class
low_class_sample = low_class.sample(n=num_samples, random_state=seed)
medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# Concatenate the sampled data
train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# Separate the features and target variable
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Display the count of unique values in the target variable
print(y_sampled.value_counts())

Target
0    22514
1    22514
2    22514
Name: count, dtype: int64


In [8]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.05, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
# X_train, y_train = X_sampled, y_sampled
# X_valid, y_valid = X_sampled, y_sampled

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (67542, 10)
X_valid shape: (67542, 10)
y_train shape: (67542,)
y_valid shape: (67542,)


## CatBoost

In [9]:
# initialize the models
cat = CatBoostClassifier(
    n_estimators=1000,
    random_seed=seed
)

# fit the model
cat.fit(X_train, y_train)

# make predictions
train_preds = cat.predict(X_train)
valid_preds = cat.predict(X_valid)

# calculate the f1 score
train_f1 = f1_score(y_train, train_preds, average='micro')
valid_f1 = f1_score(y_valid, valid_preds, average='micro')

# calculate the accuracy
train_accuracy = accuracy_score(y_train, train_preds)
valid_accuracy = accuracy_score(y_valid, valid_preds)

# display the f1 score and accuracy
print(f"Train F1 Score: {train_f1}")
print(f"Valid F1 Score: {valid_f1}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Valid Accuracy: {valid_accuracy}")

Learning rate set to 0.098092
0:	learn: 1.0952545	total: 59.8ms	remaining: 59.7s
1:	learn: 1.0921501	total: 71.5ms	remaining: 35.7s
2:	learn: 1.0896045	total: 78.6ms	remaining: 26.1s
3:	learn: 1.0874363	total: 89.1ms	remaining: 22.2s
4:	learn: 1.0853020	total: 96.2ms	remaining: 19.1s
5:	learn: 1.0834684	total: 105ms	remaining: 17.4s
6:	learn: 1.0819164	total: 112ms	remaining: 16s
7:	learn: 1.0807320	total: 122ms	remaining: 15.1s
8:	learn: 1.0794570	total: 128ms	remaining: 14.1s
9:	learn: 1.0783169	total: 138ms	remaining: 13.6s
10:	learn: 1.0772506	total: 144ms	remaining: 13s
11:	learn: 1.0764142	total: 157ms	remaining: 12.9s
12:	learn: 1.0755327	total: 168ms	remaining: 12.7s
13:	learn: 1.0745390	total: 175ms	remaining: 12.3s
14:	learn: 1.0738946	total: 184ms	remaining: 12.1s
15:	learn: 1.0732412	total: 195ms	remaining: 12s
16:	learn: 1.0724376	total: 204ms	remaining: 11.8s
17:	learn: 1.0718074	total: 213ms	remaining: 11.6s
18:	learn: 1.0711584	total: 223ms	remaining: 11.5s
19:	learn: 1

In [10]:
# make predictinos on test data
test_preds = cat.predict(test_data)

# reshape to 1D array
test_preds = test_preds.ravel()

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('../data/output/catboost.csv', index=False)