In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
train_data = pd.read_csv('../data/preprocessed_train.csv')
test_data = pd.read_csv('../data/preprocessed_test.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [6]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
low_class = train_data[train_labels == 0]
medium_class = train_data[train_labels == 1]
high_class = train_data[train_labels == 2]

# Get the number of samples in each class
low_class_count = len(low_class)
medium_class_count = len(medium_class)
high_class_count = len(high_class)

# Set the number of samples to be selected from each class
num_samples = min(low_class_count, medium_class_count, high_class_count)

# Randomly sample data from each class
low_class_sample = low_class.sample(n=num_samples, random_state=seed)
medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# Concatenate the sampled data
train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# Separate the features and target variable
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Display the count of unique values in the target variable
print(y_sampled.value_counts())

Target
0    22514
1    22514
2    22514
Name: count, dtype: int64


In [7]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.05, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (60787, 20)
X_valid shape: (6755, 20)
y_train shape: (60787,)
y_valid shape: (6755,)


## CatBoost

In [8]:
# initialize the models
cat = CatBoostClassifier(
    random_seed=seed
)

# fit the model
# cat.fit(X_train, y_train)

## lightGBM

In [9]:
lgbm = LGBMClassifier(
    random_state=seed
)

# lgbm.fit(X_train, y_train)

## Random Forest

In [10]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=5,
    max_depth=10,
    bootstrap=False,
    random_state=seed,
    n_jobs=-1
)

# Fit the model
# rf.fit(X_train, y_train)

## XGBoost

In [11]:
xgb = XGBClassifier(
    n_estimators=200, 
    random_state=seed
)

# xgb.fit(X_train, y_train)

## Ensemble

In [12]:
# preds = pd.DataFrame({
#     'cat': test_preds_cat,
#     'lgbm': test_preds_lgbm,
#     'rf': test_preds_rf,
#     'xgb': test_preds_xgb
# })

# Initialize the Voting Classifier
voting = VotingClassifier(
    estimators=[
        ('cat', cat),
        ('lgbm', lgbm),
        ('rf', rf),
        ('xgb', xgb)
    ],
    voting='soft',
    n_jobs=-1
)

# Fit the model
voting.fit(X_train, y_train)

Learning rate set to 0.097566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.125319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 60787, number of used features: 20
0:	learn: 1.0949732	total: 150ms	remaining: 2m 29s
[LightGBM] [Info] Start training from score -1.097494
[LightGBM] [Info] Start training from score -1.099271
[LightGBM] [Info] Start training from score -1.099073
1:	learn: 1.0915023	total: 384ms	remaining: 3m 11s
2:	learn: 1.0887578	total: 598ms	remaining: 3m 18s
3:	learn: 1.0861141	total: 778ms	remaining: 3m 13s
4:	learn: 1.0836679	total: 929ms	remaining: 3m 4s
5:	learn: 1.0814459	total: 1.03s	remaining: 2m 50s
6:	learn: 1.0793128	total: 1.18s	remaining: 2m 48s
7:	learn: 1.0776821	total: 1.24s	remaining: 2m 34s
8:	learn: 1.0760443	total: 1.26s	remaining: 2m 18s
9:	learn: 1.0746964	total: 1.28s	remaining: 2m 7s
10:	learn: 

In [13]:
# make predictions on validation set
valid_preds = voting.predict(X_valid)

# Calculate the F1 score
f1 = f1_score(y_valid, valid_preds, average='weighted')

# Calculate the accuracy
accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 score and accuracy
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.4495567536303393
Accuracy: 0.45062916358253147


In [14]:
# make predictions on test set
test_preds = voting.predict(test_data)

In [15]:
# Save the predictions to a CSV file
output = pd.DataFrame({'UID': test_data.index, 'Target': test_preds})

output['Target'] = output['Target'].map({0: 'low', 1: 'medium', 2: 'high'})

output.to_csv('../data/output/ensemble.csv', index=False)