In [2]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [3]:
# set seed, 42, 23, 11
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
# train_data = pd.read_csv('../data/preprocessed_train.csv')
# test_data = pd.read_csv('../data/preprocessed_test.csv')

train_data = pd.read_csv('../data/preprocessed_train_KNNim.csv')
test_data = pd.read_csv('../data/preprocessed_test_KNNim.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [6]:
# cols_to_drop = ['AgricultureZoningCode', 'CropSpeciesVariety', 'LandUsageType',
#        'ValuationYear', 'StorageAndFacilityCount', 'WaterAccessPoints',
#        'MainIrrigationSystemCount', 'WaterAccessPointsCalc',
#        'NationalRegionCode']

# cols_to_drop = ['AgricultureZoningCode', 'AgriculturalPostalZone', 'LandUsageType',
#                 'Longitude',  'RawLocationId', 'ValuationYear', 'Latitude', 
#                  'NationalRegionCode']
# cols_to_drop = ['MainIrrigationSystemCount',
#                 'NationalRegionCode',
#                 'ValuationYear',
#                 'StorageAndFacilityCount',
#                 'LandUsageType',
#                 'AgricultureZoningCode']

# train_data.drop(cols_to_drop, axis=1, inplace=True)
# test_data.drop(cols_to_drop, axis=1, inplace=True)

In [7]:
# unique target values, and counts
print(train_labels.value_counts())

Target
1    67541
2    22514
0    22514
Name: count, dtype: int64


In [8]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
# low_class = train_data[train_labels == 0]
# medium_class = train_data[train_labels == 1]
# high_class = train_data[train_labels == 2]

# Get the number of samples in each class
# low_class_count = len(low_class)
# medium_class_count = len(medium_class)
# high_class_count = len(high_class)

# # Set the number of samples to be selected from each class
# num_samples = min(low_class_count, medium_class_count, high_class_count)

# # Randomly sample data from each class
# low_class_sample = low_class.sample(n=num_samples, random_state=seed)
# medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
# high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# # Concatenate the sampled data
# train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# # Separate the features and target variable
# X_sampled = train_data_sampled
# y_sampled = train_labels.loc[train_data_sampled.index]

# # Display the count of unique values in the target variable
# print(y_sampled.value_counts())

In [9]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.1, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
X_train, y_train = train_data, train_labels
X_valid, y_valid = train_data, train_labels

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (112569, 20)
X_valid shape: (112569, 20)
y_train shape: (112569,)
y_valid shape: (112569,)


## CatBoost

In [10]:
# initialize the models
cat = CatBoostClassifier(
    random_seed=seed,
    class_weights=[3, 1, 3],
    iterations=2000,
    learning_rate=0.1,
)

# cat = CatBoostClassifier(
#     class_weights=[3, 1, 3],
#     random_seed=seed,
#     n_estimators=1000, 
#     max_depth=3, 
#     l2_leaf_reg=5, 
# )

# fit the model
cat.fit(X_train, y_train)

# get f1
y_pred = cat.predict(X_valid)
cat_f1 = f1_score(y_valid, y_pred, average='macro')

0:	learn: 1.0943782	total: 81.6ms	remaining: 2m 43s
1:	learn: 1.0907815	total: 108ms	remaining: 1m 47s
2:	learn: 1.0875261	total: 132ms	remaining: 1m 28s
3:	learn: 1.0849568	total: 158ms	remaining: 1m 18s
4:	learn: 1.0826390	total: 183ms	remaining: 1m 13s
5:	learn: 1.0806267	total: 205ms	remaining: 1m 8s
6:	learn: 1.0794549	total: 228ms	remaining: 1m 4s
7:	learn: 1.0778955	total: 249ms	remaining: 1m 1s
8:	learn: 1.0764078	total: 270ms	remaining: 59.7s
9:	learn: 1.0747974	total: 293ms	remaining: 58.3s
10:	learn: 1.0735846	total: 317ms	remaining: 57.3s
11:	learn: 1.0723400	total: 339ms	remaining: 56.2s
12:	learn: 1.0714113	total: 361ms	remaining: 55.2s
13:	learn: 1.0705003	total: 382ms	remaining: 54.1s
14:	learn: 1.0699563	total: 402ms	remaining: 53.2s
15:	learn: 1.0690178	total: 424ms	remaining: 52.5s
16:	learn: 1.0680230	total: 444ms	remaining: 51.8s
17:	learn: 1.0674768	total: 464ms	remaining: 51.1s
18:	learn: 1.0668464	total: 487ms	remaining: 50.8s
19:	learn: 1.0661878	total: 509ms	r

## lightGBM

In [11]:
lgbm = LGBMClassifier(
    random_state=seed,
    # class weight
    class_weight='balanced',
    n_estimators=300,
    subsample=0.8,
    reg_lambda=0.1,
    reg_alpha=0.1,
    learning_rate=0.1,
    num_leaves=300,
    max_depth=10,
    min_child_samples=50,
    colsample_bytree=0.8,
)

lgbm.fit(X_train, y_train)

# get f1 score
y_pred = lgbm.predict(X_valid)
lgbm_f1 = f1_score(y_valid, y_pred, average='macro')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2909
[LightGBM] [Info] Number of data points in the train set: 112569, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


## Random Forest

In [12]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=5,
    max_depth=10,
    bootstrap=False,
    random_state=seed,
    n_jobs=-1, 
    class_weight='balanced',
)

# rf = RandomForestClassifier(
#     n_estimators=100,
#     random_state=seed,
#     class_weight='balanced',
#     n_jobs=-1,
#     max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     max_features=5, 
# )

# Fit the model
rf.fit(X_train, y_train)

# get f1
y_pred = rf.predict(X_valid)
rf_f1 = f1_score(y_valid, y_pred, average='macro')

## XGBoost

In [13]:
# sample_weight = np.array([3 if i == 2 else 1 for i in y_train])
# xgb = XGBClassifier(
#     n_estimators=300,
#     learning_rate=0.1,
#     max_depth=5, 
#     min_child_weight=2,
#     subsample=0.8,
#     max_delta_step=1,
#     random_state=seed,
# )

# xgb.fit(X_train, y_train, sample_weight=sample_weight)

# # get f1
# y_pred = xgb.predict(X_valid)
# xgb_f1 = f1_score(y_valid, y_pred, average='macro')
# print(f"XGB F1: {xgb_f1}")

## Ensemble

In [14]:
# preds = pd.DataFrame({
#     'cat': test_preds_cat,
#     'lgbm': test_preds_lgbm,
#     'rf': test_preds_rf,
#     'xgb': test_preds_xgb
# })

# Initialize the Voting Classifier
# voting = VotingClassifier(
#     estimators=[
#         ('cat', cat),
#         ('lgbm', lgbm),
#         ('rf', rf),
#         # ('xgb', xgb)
#     ],
#     voting='soft',
#     n_jobs=-1
# )

# weighted ensemble
voting = VotingClassifier(
    estimators=[
        ('cat', cat),
        ('lgbm', lgbm),
        ('rf', rf),
        # ('xgb', xgb)
    ],
    voting='soft',
    n_jobs=-1,
    weights=[cat_f1, lgbm_f1, rf_f1]
)

# Fit the model
voting.fit(X_train, y_train)

0:	learn: 1.0943782	total: 158ms	remaining: 5m 16s
1:	learn: 1.0907815	total: 201ms	remaining: 3m 20s
2:	learn: 1.0875261	total: 260ms	remaining: 2m 53s
3:	learn: 1.0849568	total: 285ms	remaining: 2m 22s
4:	learn: 1.0826390	total: 308ms	remaining: 2m 2s
5:	learn: 1.0806267	total: 345ms	remaining: 1m 54s
6:	learn: 1.0794549	total: 376ms	remaining: 1m 47s
7:	learn: 1.0778955	total: 431ms	remaining: 1m 47s
8:	learn: 1.0764078	total: 488ms	remaining: 1m 47s
9:	learn: 1.0747974	total: 534ms	remaining: 1m 46s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.173790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2909
10:	learn: 1.0735846	total: 577ms	remaining: 1m 44s
11:	learn: 1.0723400	total: 603ms	remaining: 1m 39s
[LightGBM] [Info] Number of data points in the train set: 112569, number of used features: 20
12:	learn: 1.0714113	total: 632ms	remaining: 1m 36s
[LightGBM] [Info] Start training from score -

In [15]:
# make predictions on validation set
valid_preds = voting.predict(X_valid)

# Calculate the F1 score
f1 = f1_score(y_valid, valid_preds, average='macro')

# Calculate the accuracy
accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 score and accuracy
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.6738635542402228
Accuracy: 0.6990734571684922


In [16]:
# make predictions on test set
test_preds = voting.predict(test_data)

In [17]:
# Save the predictions to a CSV file
output = pd.DataFrame({'UID': test_data.index, 'Target': test_preds})

output['Target'] = output['Target'].map({0: 'low', 1: 'medium', 2: 'high'})

output.to_csv('../data/output/ensemble_classWeights_KNN_it_ft.csv', index=False)

In [18]:
unique, counts = np.unique(test_preds, return_counts=True)
print(dict(zip(unique, counts)))

{0: 4201, 1: 7938, 2: 3782}


In [19]:
# cross validation
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# initialize the models
ensemble = VotingClassifier(
    estimators=[
        ('cat', cat),
        ('lgbm', lgbm),
        ('rf', rf),
        # ('xgb', xgb)
    ],
    voting='soft',
    n_jobs=-1
)

f1 = []
acc = []

for (train, test), i in zip(cv.split(X_train, y_train), range(n_splits)):
    ensemble.fit(X_train.iloc[train], y_train.iloc[train])
    
    train_preds = ensemble.predict(X_train.iloc[train])
    valid_preds = ensemble.predict(X_train.iloc[test])

    train_acc = accuracy_score(y_train.iloc[train], train_preds)
    valid_acc = accuracy_score(y_train.iloc[test], valid_preds)

    train_f1 = f1_score(y_train.iloc[train], train_preds, average='macro')
    valid_f1 = f1_score(y_train.iloc[test], valid_preds, average='macro')

    f1.append(valid_f1)
    acc.append(valid_acc)

    print(f"Fold {i + 1} - Train F1: {train_f1:.6f}, Train Acc: {train_acc:.6f}, Valid F1: {valid_f1:.6f}, Valid Acc: {valid_acc:.6f}")



0:	learn: 1.0943111	total: 222ms	remaining: 7m 24s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.179274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2903
1:	learn: 1.0909750	total: 275ms	remaining: 4m 34s
[LightGBM] [Info] Number of data points in the train set: 90055, number of used features: 20
2:	learn: 1.0881186	total: 392ms	remaining: 4m 20s
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
3:	learn: 1.0851775	total: 459ms	remaining: 3m 49s
[LightGBM] [Info] Start training from score -1.098612
4:	learn: 1.0831593	total: 497ms	remaining: 3m 18s
5:	learn: 1.0810401	total: 562ms	remaining: 3m 6s
6:	learn: 1.0791063	total: 640ms	remaining: 3m 2s
7:	learn: 1.0775336	total: 728ms	remaining: 3m 1s
8:	learn: 1.0761835	total: 766ms	remaining: 2m 49s
9:	learn: 1.0748649	total: 841ms	remaining: 2m 47s
10:	learn: 1.0735839	total: 971ms	remainin

KeyboardInterrupt: 

In [None]:
print(f"Mean F1: {np.mean(f1):.6f}, Mean Acc: {np.mean(acc):.6f}")

Mean F1: 0.433313, Mean Acc: 0.496300
